import os
import logging
import asyncio
from typing import Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

logger = logging.getLogger("nexari.chat")
BASE_DIR = "./models/chat"
model: Optional[Llama] = None  # populated by load_model()

# === OPTIMIZED: Llama 3.2 3B (Q4_K_M) ===
# Using Q4_K_M reduces memory bandwidth pressure on the CPU significantly.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"

def load_model(local_dir: Optional[str] = None):
    """Download (if missing) and load the GGUF chat model.

    Returns the Llama instance, or None if loading failed.
    """
    global model
    local_dir = local_dir or BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        
        if not os.path.exists(path):
            logger.info(f"⬇️ Downloading Chat Model (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        
        model = Llama(
            model_path=path,
            n_ctx=4096,    # context window (tokens)
            n_threads=2,   # CPU threads used for inference
            n_batch=512,   # larger batch ingests "Search Results" text blocks faster
            verbose=False,
        )
        logger.info("✅ Chat Model Ready (Turbo Mode)")
        return model
    except Exception as e:
        logger.error(f"Chat Load Error: {e}")
        model = None
        return None

async def load_model_async():
    """Run the blocking load_model() in a worker thread."""
    return await asyncio.to_thread(load_model)
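
# --- Usage sketch (illustrative only) ---
# A minimal sketch of how this loader might be driven, assuming the standard
# llama-cpp-python chat API (`create_chat_completion`). The prompt and the
# `_demo` entrypoint below are hypothetical, not part of the module.
if __name__ == "__main__":
    async def _demo():
        llm = await load_model_async()
        if llm is None:
            raise RuntimeError("Chat model failed to load")
        out = llm.create_chat_completion(
            messages=[{"role": "user", "content": "Say hello in one sentence."}],
            max_tokens=64,
        )
        print(out["choices"][0]["message"]["content"])

    asyncio.run(_demo())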