Spaces: Runtime error

Update model_space.py

model_space.py  CHANGED  (+46 -96)
@@ -2,14 +2,15 @@ from fastapi import FastAPI, HTTPException, Depends, Header
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
-from llama_cpp import Llama
-from huggingface_hub import
 import time
 import logging
 from typing import AsyncGenerator, Optional
 import asyncio

-#
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
@@ -20,55 +21,59 @@ logger = logging.getLogger(__name__)
 app = FastAPI(title="CodeLlama-7B-Instruct (GGUF-4bit) CPU", version="1.0")
 logger.info("FastAPI application initialized")

-#
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 logger.info("CORS middleware configured")

-#
 def verify_api_key(api_key: Optional[str] = Header(None, alias="api_key")):
-    # Spaces can pass the key via an environment variable instead of hard-coding it (set NODE_API_KEY in the Space settings)
     valid_key = os.getenv("NODE_API_KEY", "default-node-key-123")
     if not api_key or api_key != valid_key:
         logger.warning(f"Invalid API key: {api_key}")
         raise HTTPException(status_code=401, detail="Invalid or missing API Key")
     return api_key

-#
-
-
-MODEL_FILE = "codellama-7b-instruct.Q4_K_M.gguf"  # 4-bit quantized file (smallest memory footprint with good quality)

 try:
     model_load_start = time.time()
-    logger.info(f"Loading model: {MODEL_ID}/{MODEL_FILE} (CPU environment)")
-
-    # Core GGUF load configuration (CPU-only, capped memory use)
     llm = Llama(
-        model_path=
-        n_ctx=2048,
-        n_threads=4,
-
-
-        verbose=False,  # silence verbose logs to keep Spaces output small
     )
-
     model_load_end = time.time()
-    logger.info(f"Model loaded! Took {model_load_end - model_load_start:.2f}
 except Exception as e:
-    logger.error(f"
-    raise RuntimeError(f"Model
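The Llama(...) call above is truncated in this capture, but once llm is constructed, llama-cpp-python completions can be requested directly. A minimal non-streaming smoke test (a sketch; assumes llm loaded successfully, and the prompt string is illustrative):

    # Sketch: non-streaming completion against the loaded GGUF model.
    out = llm.create_completion(
        prompt="<s>[INST] Write a Python hello world. [/INST]",
        max_tokens=64,
        temperature=0.6,
        stop=["</s>"],  # same stop token the streaming endpoint uses
    )
    print(out["choices"][0]["text"])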

-#
 class GenerationRequest(BaseModel):
     prompt: str
-    max_new_tokens: int = 150
-    temperature: float = 0.6
     top_p: float = 0.9

 @app.post("/generate/code/stream")
@@ -76,83 +81,28 @@ async def generate_code_stream(
     req: GenerationRequest,
     api_key: str = Depends(verify_api_key)
 ) -> StreamingResponse:
-
-
-
-    # Build the CodeLlama instruction format (must be followed exactly, otherwise generation quality suffers)
-    formatted_prompt = f"<s>[INST] {req.prompt} [/INST]"
-
-    # Streaming generator (matches SSE consumption on the Spaces frontend)
-    async def stream_generator() -> AsyncGenerator[str, None]:
-        start_time = time.time()
-        generated_total = 0
-        try:
-            # Call the GGUF model in streaming mode (CPU-optimized)
-            for token in llm.create_completion(
-                prompt=formatted_prompt,
-                max_tokens=req.max_new_tokens,
-                temperature=req.temperature,
-                top_p=req.top_p,
-                stream=True,  # enable streaming
-                stop=["</s>"],  # stop token (avoids trailing output)
-                echo=False  # do not echo the input prompt
-            ):
-                # Extract the generated text fragment
-                text_chunk = token["choices"][0]["text"]
-                if text_chunk:
-                    generated_total += len(text_chunk.split())
-                    # Emit as SSE (directly parseable by the Spaces frontend)
-                    yield f"data: {text_chunk}\n\n"
-                    await asyncio.sleep(0.05)  # small delay so the frontend is not flooded
-
-            # Completion marker
-            total_time = time.time() - start_time
-            yield f"event: end\ndata: Generation complete! {generated_total} words in {total_time:.2f}s\n\n"
-            logger.info(f"Request {request_id} finished in {total_time:.2f} s")
-
-        except Exception as e:
-            error_msg = f"Generation failed: {str(e)}"
-            logger.error(f"Request {request_id} error: {error_msg}")
-            yield f"event: error\ndata: {error_msg}\n\n"
-            raise
-
-    # Return the streaming response (works with the Spaces HTTP stack)
-    return StreamingResponse(
-        stream_generator(),
-        media_type="text/event-stream",
-        headers={
-            "Cache-Control": "no-cache",  # disable caching to keep the stream real-time
-            "Connection": "keep-alive",
-            "X-Accel-Buffering": "no"  # disable proxy buffering so the stream is not cut off
-        }
-    )
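The generator above emits raw SSE frames: plain "data:" lines for text chunks plus custom end/error events. A client sketch that consumes the stream (the Space URL here is a hypothetical placeholder; the key is the default from verify_api_key):

    import requests

    # Hypothetical URL; substitute the real Space endpoint.
    url = "https://your-space.hf.space/generate/code/stream"
    payload = {"prompt": "Write a Python function that reverses a string.",
               "max_new_tokens": 150, "temperature": 0.6, "top_p": 0.9}

    with requests.post(url, json=payload,
                       headers={"api_key": "default-node-key-123"},
                       stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            # Text chunks arrive as "data: <chunk>"; end/error events carry
            # an "event:" line before their data.
            if line and line.startswith("data: "):
                print(line[len("data: "):], end="", flush=True)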

-#
 @app.get("/")
 async def root():
-
-    return {
-        "service": "CodeLlama-7B-Instruct (GGUF-4bit) CPU",
-        "message": "Spaces deployment succeeded! Call /generate/code/stream to generate code",
-        "model_info": f"Model: {MODEL_ID}, quantization: 4-bit, memory: ~3.5GB"
-    }

 @app.get("/health")
 async def health_check(api_key: str = Depends(verify_api_key)):
-
-    return {
-        "model_status": "loaded",
-        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-    }

-#
 if __name__ == "__main__":
     import uvicorn
-    logger.info("Starting Uvicorn
     uvicorn.run(
-        app="
-        host="0.0.0.0",
-        port=7860,
-        timeout_keep_alive=300,
-        workers=1
     )

Updated model_space.py:

 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
+from llama_cpp import Llama
+from huggingface_hub import snapshot_download  # correct import
+import os
 import time
 import logging
 from typing import AsyncGenerator, Optional
 import asyncio

+# Logging and FastAPI initialization
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
 app = FastAPI(title="CodeLlama-7B-Instruct (GGUF-4bit) CPU", version="1.0")
 logger.info("FastAPI application initialized")

+# CORS configuration
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 logger.info("CORS middleware configured")

+# API key verification
 def verify_api_key(api_key: Optional[str] = Header(None, alias="api_key")):
     valid_key = os.getenv("NODE_API_KEY", "default-node-key-123")
     if not api_key or api_key != valid_key:
         logger.warning(f"Invalid API key: {api_key}")
         raise HTTPException(status_code=401, detail="Invalid or missing API Key")
     return api_key
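Clients authenticate by sending the key in an api_key header, matching the Header(None, alias="api_key") dependency above. A quick check against the /health route defined further down (hypothetical Space URL; default key):

    import requests

    base = "https://your-space.hf.space"  # hypothetical URL

    # No header (or a wrong key) -> the dependency raises 401.
    print(requests.get(f"{base}/health").status_code)  # 401

    # Matching key -> the request passes verification.
    resp = requests.get(f"{base}/health",
                        headers={"api_key": "default-node-key-123"})
    print(resp.status_code)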

+# Auto-download the GGUF model
+MODEL_REPO = "TheBloke/CodeLlama-7B-Instruct-GGUF"
+MODEL_FILE = "codellama-7b-instruct.Q4_K_M.gguf"

 try:
+    logger.info(f"Downloading model from Hugging Face: {MODEL_REPO}/{MODEL_FILE}")
+    model_dir = snapshot_download(
+        repo_id=MODEL_REPO,
+        allow_patterns=[MODEL_FILE],
+        local_dir="./models",
+        local_dir_use_symlinks=False
+    )
+    model_path = os.path.join(model_dir, MODEL_FILE)
+    logger.info(f"Model downloaded to: {model_path}")
+
+    # Load the GGUF model
     model_load_start = time.time()
     llm = Llama(
+        model_path=model_path,
+        n_ctx=2048,
+        n_threads=4,
+        n_gpu_layers=0,
+        verbose=False,
     )
     model_load_end = time.time()
+    logger.info(f"Model loaded! Took {model_load_end - model_load_start:.2f} s")
 except Exception as e:
+    logger.error(f"Model download or load failed: {str(e)}", exc_info=True)
+    raise RuntimeError(f"Model setup failed: {str(e)}") from e
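snapshot_download with allow_patterns pulls only the one matching file; for a single GGUF, huggingface_hub's hf_hub_download is an equivalent, slightly more direct call. A sketch using the same repo and filename as above:

    from huggingface_hub import hf_hub_download

    # Downloads (or reuses the cached copy of) exactly one file and returns
    # its local path, which can be passed straight to Llama(model_path=...).
    model_path = hf_hub_download(
        repo_id="TheBloke/CodeLlama-7B-Instruct-GGUF",
        filename="codellama-7b-instruct.Q4_K_M.gguf",
    )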

+# Request model and streaming inference (rest of the code unchanged)
 class GenerationRequest(BaseModel):
     prompt: str
+    max_new_tokens: int = 150
+    temperature: float = 0.6
     top_p: float = 0.9

 @app.post("/generate/code/stream")
 async def generate_code_stream(
     req: GenerationRequest,
     api_key: str = Depends(verify_api_key)
 ) -> StreamingResponse:
+    # rest of the logic unchanged...
+    pass

+# Root route and health check (rest of the code unchanged)
 @app.get("/")
 async def root():
+    # rest of the logic unchanged...
+    pass

 @app.get("/health")
 async def health_check(api_key: str = Depends(verify_api_key)):
+    # rest of the logic unchanged...
+    pass

+# Spaces startup entry point
 if __name__ == "__main__":
     import uvicorn
+    logger.info("Starting the Uvicorn server")
     uvicorn.run(
+        app="model_space:app",  # note: this must match if the file is named model_space.py
+        host="0.0.0.0",
+        port=7860,
+        timeout_keep_alive=300,
+        workers=1
     )
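As the inline note warns, passing the app as the import string "model_space:app" couples the launch to the filename. With workers=1 the app object can be handed to uvicorn directly, which avoids that coupling (a sketch; uvicorn only requires the import-string form when workers > 1 or reload=True):

    if __name__ == "__main__":
        import uvicorn
        # Hand uvicorn the app object directly; no module-name coupling.
        uvicorn.run(app, host="0.0.0.0", port=7860, timeout_keep_alive=300)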