malek-messaoudii committed
Commit: 9aa985d
Parent(s): e411044

Refactor audio models and services for improved error handling and response streaming

Files changed:
- models/audio.py +8 -37
- routes/audio.py +21 -12
- services/stt_service.py +10 -1
- services/tts_service.py +6 -0
models/audio.py
CHANGED
@@ -1,26 +1,10 @@
-"""Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints"""
-
 from pydantic import BaseModel, Field, ConfigDict
 from typing import Optional
 
-
-
-
-    model_name: str
-
-    model_config = {
-        "protected_namespaces": ()
-    }
-
-
-# ================================
-# SPEECH TO TEXT
-# ================================
-
-
-
+# ==============================
+# SPEECH TO TEXT RESPONSE
+# ==============================
 class STTResponse(BaseModel):
-    """Response model for Whisper speech → text"""
     model_config = ConfigDict(
         json_schema_extra={
             "example": {
@@ -31,7 +15,6 @@ class STTResponse(BaseModel):
             }
         }
     )
-
     text: str = Field(..., description="Transcribed text from the input audio")
     model_name: str = Field(..., description="STT model used for inference")
     language: Optional[str] = Field(None, description="Detected language")
@@ -41,28 +24,17 @@
     )
 
 
-#
-# TEXT TO SPEECH
-#
-
+# ==============================
+# TEXT TO SPEECH REQUEST / RESPONSE
+# ==============================
 class TTSRequest(BaseModel):
-    """Text input for TTS conversion"""
     model_config = ConfigDict(
-        json_schema_extra={
-            "example": {
-                "text": "Hello, welcome to our AI system."
-            }
-        }
-    )
-
-    text: str = Field(
-        ..., min_length=1, max_length=500,
-        description="Text that will be converted into speech"
+        json_schema_extra={"example": {"text": "Hello, welcome to our AI system."}}
     )
+    text: str = Field(..., min_length=1, max_length=500, description="Text to convert to speech")
 
 
 class TTSResponse(BaseModel):
-    """Metadata response for TTS generation"""
     model_config = ConfigDict(
         json_schema_extra={
             "example": {
@@ -73,7 +45,6 @@ class TTSResponse(BaseModel):
             }
         }
     )
-
     message: str
     audio_format: str
     length_seconds: Optional[float] = None
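The tightened schemas drop the unused base attributes and docstrings but keep the field constraints. A quick sketch of how they validate under Pydantic v2 (only the constraints visible in this diff are assumed):

from pydantic import ValidationError
from models.audio import TTSRequest, STTResponse

# TTSRequest enforces 1-500 characters on `text`
req = TTSRequest(text="Hello, welcome to our AI system.")

# An empty string violates min_length=1
try:
    TTSRequest(text="")
except ValidationError as e:
    print(e.errors()[0]["type"])  # string_too_short

# STTResponse requires text and model_name; language stays optional
resp = STTResponse(text="hello", model_name="gemini-2.5-flash")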
routes/audio.py
CHANGED
@@ -1,25 +1,34 @@
-from fastapi import APIRouter, UploadFile, File
+from fastapi import APIRouter, UploadFile, File, HTTPException
 from services.tts_service import generate_tts
 from services.stt_service import speech_to_text
-from fastapi.responses import
-import
+from fastapi.responses import StreamingResponse
+import io
 
 router = APIRouter(prefix="/audio", tags=["Audio"])
 
-
+# ======================
+# TEXT TO SPEECH
+# ======================
 @router.post("/tts")
 async def tts(text: str):
-
-
-
-
-    f.write(audio_bytes)
+    try:
+        audio_bytes = await generate_tts(text)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
 
-
+    # Return as streaming response without saving file
+    return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/wav")
 
 
+# ======================
+# SPEECH TO TEXT
+# ======================
 @router.post("/stt")
 async def stt(file: UploadFile = File(...)):
-
-
+    try:
+        audio_bytes = await file.read()
+        text = await speech_to_text(audio_bytes)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
     return {"text": text}
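Since `text: str` is a bare parameter rather than the `TTSRequest` body model from models/audio.py, FastAPI reads it from the query string. A client-side sketch exercising both endpoints; httpx and the localhost address are assumptions layered on this commit, not part of it:

import httpx

BASE = "http://localhost:8000"  # assumed dev server address

# /tts takes `text` as a query parameter and streams back audio bytes
with httpx.stream("POST", f"{BASE}/audio/tts", params={"text": "Hello there"}) as r:
    r.raise_for_status()
    with open("out.wav", "wb") as f:
        for chunk in r.iter_bytes():
            f.write(chunk)

# /stt expects a multipart upload under the field name `file`
with open("out.wav", "rb") as f:
    r = httpx.post(f"{BASE}/audio/stt", files={"file": ("out.wav", f, "audio/wav")})
print(r.json()["text"])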
services/stt_service.py
CHANGED
@@ -1,9 +1,18 @@
 from services.gemini_client import get_gemini_client
+from google.genai import types
 
 async def speech_to_text(audio_bytes: bytes) -> str:
+    """
+    Convert speech audio (bytes) to text using Gemini API
+    """
     client = get_gemini_client()
+
+    # Wrap audio bytes correctly for Gemini
+    contents = [types.File(data=audio_bytes, mime_type="audio/wav")]
+
     response = client.models.generate_content(
         model="gemini-2.5-flash",
-        contents=
+        contents=contents
     )
+
     return response.text
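One caution on the new wrapper: in current google-genai releases, `types.File` describes a file uploaded via the Files API and is not normally constructed from raw bytes; inline audio usually travels as a `types.Part`, and the synchronous `client.models.generate_content` call blocks the event loop inside an `async def`. A hedged sketch of both adjustments, assuming `get_gemini_client` returns a `genai.Client`:

from services.gemini_client import get_gemini_client
from google.genai import types

async def speech_to_text(audio_bytes: bytes) -> str:
    client = get_gemini_client()
    # Inline audio as a Part, with a text part carrying the instruction
    contents = [
        "Transcribe this audio.",
        types.Part.from_bytes(data=audio_bytes, mime_type="audio/wav"),
    ]
    # client.aio exposes the async surface of the SDK
    response = await client.aio.models.generate_content(
        model="gemini-2.5-flash",
        contents=contents,
    )
    return response.text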
services/tts_service.py
CHANGED
@@ -2,7 +2,11 @@ from services.gemini_client import get_gemini_client
 from google.genai import types
 
 async def generate_tts(text: str) -> bytes:
+    """
+    Convert text to speech using Gemini API
+    """
     client = get_gemini_client()
+
     response = client.models.generate_content(
         model="gemini-2.5-flash-preview-tts",
         contents=text,
@@ -15,4 +19,6 @@ async def generate_tts(text: str) -> bytes:
             ),
         ),
     )
+
+    # Return raw audio bytes
    return response.candidates[0].content.parts[0].inline_data.data
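The hunk elides the `config=` argument between `contents=text,` and the closing parentheses; the nesting suggests a `types.GenerateContentConfig` with `response_modalities=["AUDIO"]` and a `speech_config` voice block. Note also that, per the Gemini TTS documentation, `inline_data.data` is raw 16-bit mono PCM at 24 kHz rather than a WAV container, so streaming it as `audio/wav` in routes/audio.py would ship headerless audio. A small helper sketch (an assumption on top of this commit, not part of it) that wraps the PCM before returning it:

import io
import wave

def pcm_to_wav(pcm: bytes, rate: int = 24000) -> bytes:
    # Wrap raw 16-bit mono PCM (the documented Gemini TTS output) in a WAV container
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)    # mono
        wf.setsampwidth(2)    # 16-bit samples
        wf.setframerate(rate)
        wf.writeframes(pcm)
    return buf.getvalue()

# e.g. in the route: StreamingResponse(io.BytesIO(pcm_to_wav(audio_bytes)), media_type="audio/wav")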