import os
import logging
import asyncio
from typing import Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

logger = logging.getLogger("nexari.chat")

BASE_DIR = "./models/chat"
model = None

# === OPTIMIZED: Llama 3.2 3B (Q4_K_M) ===
# Using Q4_K_M reduces memory bandwidth pressure on the CPU significantly.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"


def load_model(local_dir: Optional[str] = None):
    """Download the GGUF chat model if needed, load it, and cache it in the module-level `model`."""
    global model
    if not local_dir:
        local_dir = BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        if not os.path.exists(path):
            logger.info("⬇️ Downloading Chat Model (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        model = Llama(
            model_path=path,
            n_ctx=4096,
            n_threads=2,
            n_batch=512,  # Helps process "Search Results" text blocks faster
            verbose=False,
        )
        logger.info("✅ Chat Model Ready (Turbo Mode)")
        return model
    except Exception as e:
        logger.error(f"Chat Load Error: {e}")
        model = None
        return None


async def load_model_async():
    """Run the blocking load_model() in a worker thread so it does not stall the event loop."""
    return await asyncio.to_thread(load_model)
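
# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal example of how the loaded model might be driven from async code.
# It assumes the caller wants a single chat completion via llama-cpp-python's
# `create_chat_completion`; the `_demo` name, the prompt, and the max_tokens
# value are placeholders chosen for illustration only.
async def _demo():
    llm = await load_model_async()
    if llm is None:
        raise RuntimeError("Chat model failed to load")
    # Llama-3.2 instruct GGUF files ship a chat template, so llama-cpp-python
    # can format the messages without an explicit chat_format argument.
    reply = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=128,
    )
    print(reply["choices"][0]["message"]["content"])


if __name__ == "__main__":
    asyncio.run(_demo())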