"""Loader for the Nexari coder model (Qwen 2.5 Coder 3B, GGUF).

Downloads the quantized model file from the Hugging Face Hub on first use
and instantiates a `llama_cpp.Llama` runtime tuned for a 2-vCPU host.
"""

import asyncio
import logging
import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

logger = logging.getLogger("nexari.coder")

# Default directory where the GGUF file is cached between runs.
BASE_DIR = "./models/coder"

# Module-level singleton; populated by load_model().
model = None

# === OPTIMIZED FOR 2 vCPU: Qwen 2.5 Coder 3B ===
# 7B was too heavy for 2 vCPU (9 mins).
# 3B will be much faster (~1-2 mins) while keeping good logic.
REPO_ID = "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-3b-instruct-q6_k.gguf"


def load_model(local_dir: str | None = None):
    """Download (if needed) and load the coder model.

    Args:
        local_dir: Directory to store/read the GGUF file. Defaults to
            ``BASE_DIR`` when not given.

    Returns:
        The loaded ``Llama`` instance, or ``None`` if loading failed
        (the error is logged; callers must handle the ``None`` case).
    """
    global model
    if not local_dir:
        local_dir = BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)

        # Download (~2.8 GB) only when the file is not already cached.
        if not os.path.exists(path):
            logger.info("⬇️ Downloading Qwen 3B Coder... (Balanced for 2 vCPU)")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)

        # STRICTLY 2 THREADS (Matching your 2 vCPU limit)
        # 4 threads would cause lag/freezing on 2 vCPU.
        model = Llama(
            model_path=path,
            n_ctx=8192,
            n_threads=2,  # Optimized for 2 vCPU hardware
            verbose=False,
        )
        logger.info("✅ Coder Model Ready (Qwen 3B - Optimized)")
        return model
    except Exception:
        # Best-effort load: log the full traceback and signal failure
        # with an explicit None instead of raising to the caller.
        logger.exception("Coder Load Error")
        model = None
        return None


async def load_model_async():
    """Run the blocking ``load_model`` in a worker thread.

    Returns:
        Same as ``load_model``: the ``Llama`` instance or ``None``.
    """
    return await asyncio.to_thread(load_model)