import os
import logging
import asyncio
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
logger = logging.getLogger("nexari.chat")

BASE_DIR = "./models/chat"
model = None

# === OPTIMIZED: Llama 3.2 3B (Q4_K_M) ===
# Using Q4_K_M reduces memory bandwidth pressure on the CPU significantly.
REPO_ID = "bartowski/Llama-3.2-3B-Instruct-GGUF"
FILENAME = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"


def load_model(local_dir: str | None = None):
    """Download (if needed) and load the GGUF chat model; return the Llama instance or None."""
    global model
    if not local_dir:
        local_dir = BASE_DIR
    try:
        os.makedirs(local_dir, exist_ok=True)
        path = os.path.join(local_dir, FILENAME)
        if not os.path.exists(path):
            logger.info("⬇️ Downloading Chat Model (Fast Q4)...")
            hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=local_dir)
        model = Llama(
            model_path=path,
            n_ctx=4096,
            n_threads=2,
            n_batch=512,  # Helps process "Search Results" text blocks faster
            verbose=False,
        )
        logger.info("✅ Chat Model Ready (Turbo Mode)")
        return model
    except Exception as e:
        logger.error(f"Chat Load Error: {e}")
        model = None
        return None


async def load_model_async():
    # Run the blocking download/load in a worker thread so the event loop stays free.
    return await asyncio.to_thread(load_model)
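

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal way to exercise load_model_async from a script entry point. It
# assumes llama-cpp-python's chat-completion API (Llama.create_chat_completion),
# which returns an OpenAI-style response dict; prompt text and max_tokens here
# are arbitrary.
if __name__ == "__main__":
    async def _demo():
        llm = await load_model_async()
        if llm is None:
            raise SystemExit("Chat model failed to load")
        result = llm.create_chat_completion(
            messages=[{"role": "user", "content": "Say hello in one sentence."}],
            max_tokens=64,
        )
        print(result["choices"][0]["message"]["content"])

    asyncio.run(_demo())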