import os
import logging
import asyncio
from typing import Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

logger = logging.getLogger("nexari.chat")
BASE_DIR = "./models/chat"
model: Optional[Llama] = None  # populated by load_model()

# === OPTIMIZED: Llama 3.2 3B (Q4_K_M) ===
# Using Q4_K_M reduces memory bandwidth pressure on the CPU significantly.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

def load_model(local_dir: Optional[str] = None):
    """Download (if missing) and load the GGUF chat model.

    Returns the Llama instance, or None if loading failed.
    """
    global model
    local_dir = local_dir or BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        
        if not os.path.exists(path):
            logger.info(f"⬇️ Downloading Chat Model (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        
        model = Llama(
            model_path=path,
            n_ctx=4096,    # context window (tokens)
            n_threads=2,   # CPU threads used for inference
            n_batch=512,   # larger batch ingests "Search Results" text blocks faster
            verbose=False,
        )
        logger.info("✅ Chat Model Ready (Turbo Mode)")
        return model
    except Exception as e:
        logger.error(f"Chat Load Error: {e}")
        model = None
        return None

async def load_model_async():
    """Run the blocking load_model() in a worker thread."""
    return await asyncio.to_thread(load_model)
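
# --- Usage sketch (illustrative only) ---
# A minimal sketch of how this loader might be driven, assuming the standard
# llama-cpp-python chat API (`create_chat_completion`). The prompt and the
# `_demo` entrypoint below are hypothetical, not part of the module.
if __name__ == "__main__":
    async def _demo():
        llm = await load_model_async()
        if llm is None:
            raise RuntimeError("Chat model failed to load")
        out = llm.create_chat_completion(
            messages=[{"role": "user", "content": "Say hello in one sentence."}],
            max_tokens=64,
        )
        print(out["choices"][0]["message"]["content"])

    asyncio.run(_demo())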