Spaces:

NLP-Debater-Project
/

FastAPI-Backend-Models

Running

App Files Files Community

malek-messaoudii committed 24 days ago

Commit

c7fc3b6

1 Parent(s): 28e04dd

Add tts/stt services

Browse files

Files changed (6) hide show

config.py +1 -0
models/audio.py +69 -0
routes/__init__.py +2 -1
routes/audio.py +84 -0
services/stt_service.py +24 -0
services/tts_service.py +28 -0

config.py CHANGED Viewed

@@ -16,6 +16,7 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
 HUGGINGFACE_STANCE_MODEL_ID = os.getenv("HUGGINGFACE_STANCE_MODEL_ID")
 HUGGINGFACE_LABEL_MODEL_ID = os.getenv("HUGGINGFACE_LABEL_MODEL_ID")
 # Use Hugging Face model ID instead of local path
 STANCE_MODEL_ID = HUGGINGFACE_STANCE_MODEL_ID

 HUGGINGFACE_STANCE_MODEL_ID = os.getenv("HUGGINGFACE_STANCE_MODEL_ID")
 HUGGINGFACE_LABEL_MODEL_ID = os.getenv("HUGGINGFACE_LABEL_MODEL_ID")
+# Speech-to-text model ID; falls back to Whisper large-v3 when the
+# HUGGINGFACE_STT_MODEL_ID environment variable is not set.
+HUGGINGFACE_STT_MODEL_ID = os.getenv("HUGGINGFACE_STT_MODEL_ID", "openai/whisper-large-v3")
 # Use Hugging Face model ID instead of local path
 STANCE_MODEL_ID = HUGGINGFACE_STANCE_MODEL_ID

models/audio.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints"""
+from pydantic import BaseModel, Field, ConfigDict
+from typing import Optional
+# ================================
+# SPEECH TO TEXT
+# ================================
+class STTResponse(BaseModel):
+    """Schema returned by the speech-to-text endpoint.
+
+    Holds the transcript, the name of the model that produced it, and
+    optional language / duration metadata.
+    """
+    # Example payload shown in the generated OpenAPI schema
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "text": "hello how are you",
+                "model_name": "openai/whisper-large-v3",
+                "language": "en",
+                "duration_seconds": 3.2,
+            },
+        },
+    )
+    # Required fields
+    text: str = Field(default=..., description="Transcribed text from the input audio")
+    model_name: str = Field(default=..., description="STT model used for inference")
+    # Optional metadata; omitted values default to None
+    language: Optional[str] = Field(default=None, description="Detected language")
+    duration_seconds: Optional[float] = Field(default=None, description="Approximate audio duration in seconds")
+# ================================
+# TEXT TO SPEECH
+# ================================
+class TTSRequest(BaseModel):
+    """Request body for the text-to-speech endpoint.
+
+    Carries a single ``text`` field limited to 1-500 characters.
+    """
+    # Example payload shown in the generated OpenAPI schema
+    model_config = ConfigDict(
+        json_schema_extra={"example": {"text": "Hello, welcome to our AI system."}},
+    )
+    text: str = Field(
+        default=...,
+        description="Text that will be converted into speech",
+        min_length=1,
+        max_length=500,
+    )
+class TTSResponse(BaseModel):
+    """Metadata describing a generated text-to-speech audio clip.
+
+    Field requirements are unchanged: ``message``, ``audio_format`` and
+    ``model_name`` are required, ``length_seconds`` defaults to ``None``.
+    """
+    # Example payload shown in the generated OpenAPI schema
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "message": "Audio generated successfully",
+                "audio_format": "wav",
+                "length_seconds": 2.5,
+                "model_name": "suno/bark"
+            }
+        }
+    )
+    # Descriptions added so this schema is documented like the other
+    # audio models in this module.
+    message: str = Field(..., description="Human-readable status message")
+    audio_format: str = Field(..., description="Format of the generated audio, e.g. wav")
+    length_seconds: Optional[float] = Field(
+        None,
+        description="Length of the generated audio in seconds, when known"
+    )
+    model_name: str = Field(..., description="TTS model used for generation")

routes/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 from fastapi import APIRouter
 from . import root, health, stance, label
 # Create main router
 api_router = APIRouter()
@@ -11,6 +11,7 @@ api_router.include_router(root.router)
 api_router.include_router(health.router)
 api_router.include_router(stance.router, prefix="/stance")
 api_router.include_router(label.router, prefix="/label")
 __all__ = ["api_router"]

 from fastapi import APIRouter
 from . import root, health, stance, label
+# Relative import, matching how the sibling route modules are imported above
+from .audio import router as audio_router
 # Create main router
 api_router = APIRouter()
 api_router.include_router(health.router)
 api_router.include_router(stance.router, prefix="/stance")
 api_router.include_router(label.router, prefix="/label")
+# The audio router declares its own "/audio" prefix, so none is added here
+api_router.include_router(audio_router)
 __all__ = ["api_router"]

routes/audio.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""Speech-to-Text & Text-to-Speech API Endpoints"""
+from fastapi import APIRouter, UploadFile, File, HTTPException
+from fastapi.responses import StreamingResponse
+import logging
+from models.audio import (
+    STTResponse,
+    TTSRequest,
+    TTSResponse
+)
+from services.stt_service import transcribe_audio
+from services.tts_service import text_to_speech
+# Router for both audio endpoints: paths are prefixed with /audio and
+# tagged "Audio".
+router = APIRouter(prefix="/audio", tags=["Audio"])
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+# ============================================================
+# SPEECH TO TEXT (Whisper)
+# ============================================================
+@router.post("/speech-to-text", response_model=STTResponse)
+async def speech_to_text_endpoint(file: UploadFile = File(...)):
+    """
+    Convert an uploaded audio file to text.
+
+    The uploaded bytes are handed to ``transcribe_audio``, which calls the
+    configured Hugging Face STT model (openai/whisper-large-v3 by default).
+
+    - Upload an audio file (wav, mp3, m4a…)
+    - Returns the transcript wrapped in an ``STTResponse``
+
+    Raises:
+        HTTPException: 400 when the upload is empty, 500 when reading or
+            transcribing the audio fails.
+    """
+    # Function-scope imports so this handler does not rely on edits to the
+    # module's import block.
+    from fastapi.concurrency import run_in_threadpool
+    from config import HUGGINGFACE_STT_MODEL_ID
+    try:
+        audio_bytes = await file.read()
+        if not audio_bytes:
+            # Reject early instead of sending an empty payload to the model
+            raise HTTPException(status_code=400, detail="Uploaded audio file is empty")
+        # transcribe_audio performs a blocking HTTP request; run it in a
+        # worker thread so the event loop keeps serving other requests.
+        result = await run_in_threadpool(transcribe_audio, audio_bytes)
+        response_data = STTResponse(
+            text=result,
+            # Report the model the service actually called, not a literal
+            model_name=HUGGINGFACE_STT_MODEL_ID,
+            # NOTE(review): hard-coded; the service returns only the text,
+            # so no language is actually detected here — confirm intent.
+            language="en",
+            duration_seconds=None  # optional filler
+        )
+        logger.info("STT completed: %s...", response_data.text[:40])
+        return response_data
+    except HTTPException:
+        # Preserve deliberate client errors (e.g. the 400 above)
+        raise
+    except Exception as e:
+        # logger.exception records the traceback along with the message
+        logger.exception("STT error: %s", e)
+        raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}")
+# ============================================================
+# TEXT TO SPEECH (Bark)
+# ============================================================
+@router.post(
+    "/text-to-speech",
+    # The handler returns raw audio, not a TTSResponse JSON body, so the
+    # OpenAPI schema documents an audio/wav response instead.
+    response_class=StreamingResponse,
+    responses={
+        200: {
+            "description": "Synthesized speech; generation metadata is sent in the X-Audio-Metadata header",
+            "content": {"audio/wav": {}},
+        }
+    },
+)
+async def text_to_speech_endpoint(request: TTSRequest):
+    """
+    Convert text to synthesized speech using Bark.
+
+    Returns the audio as a streamed ``audio/wav`` response. A JSON-encoded
+    ``TTSResponse`` describing the generation is attached in the
+    ``X-Audio-Metadata`` response header.
+
+    Raises:
+        HTTPException: 500 when speech synthesis fails.
+    """
+    # Function-scope import so this handler does not rely on edits to the
+    # module's import block.
+    from fastapi.concurrency import run_in_threadpool
+    try:
+        # text_to_speech performs a blocking HTTP request; run it in a
+        # worker thread so the event loop keeps serving other requests.
+        audio_bytes = await run_in_threadpool(text_to_speech, request.text)
+        metadata = TTSResponse(
+            message="Audio generated successfully",
+            audio_format="wav",
+            length_seconds=None,
+            model_name="suno/bark"
+        )
+        logger.info("TTS generated for text: %s...", request.text[:40])
+        return StreamingResponse(
+            iter([audio_bytes]),
+            media_type="audio/wav",
+            headers={
+                "X-Audio-Metadata": metadata.model_dump_json()
+            }
+        )
+    except Exception as e:
+        # logger.exception records the traceback along with the message
+        logger.exception("TTS error: %s", e)
+        raise HTTPException(status_code=500, detail=f"Text-to-speech failed: {str(e)}")

services/stt_service.py ADDED Viewed

	@@ -0,0 +1,24 @@

+# services/stt_service.py
+import requests
+from config import HUGGINGFACE_API_KEY, HUGGINGFACE_STT_MODEL_ID
+def transcribe_audio(file_bytes: bytes) -> str:
+    """
+    Convert audio bytes into text using the model named by
+    HUGGINGFACE_STT_MODEL_ID through the Hugging Face Inference API.
+
+    Args:
+        file_bytes: Raw bytes of the audio file to transcribe.
+
+    Returns:
+        The transcript taken from the "text" field of the API response.
+
+    Raises:
+        RuntimeError: If the API answers with a non-200 status, a body that
+            is not JSON, or JSON without a "text" field. Failures are raised
+            rather than returned as strings so callers cannot mistake an
+            error message for a transcript.
+    """
+    headers = {
+        "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
+        "Content-Type": "application/octet-stream",
+    }
+    url = f"https://api-inference.huggingface.co/models/{HUGGINGFACE_STT_MODEL_ID}"
+    # requests waits indefinitely without an explicit timeout
+    response = requests.post(url, headers=headers, data=file_bytes, timeout=120)
+    if response.status_code != 200:
+        raise RuntimeError(f"STT API error ({response.status_code}): {response.text}")
+    try:
+        result = response.json()
+    except ValueError as exc:
+        raise RuntimeError("Invalid response from STT model.") from exc
+    # Guard against non-dict JSON bodies before looking up the transcript
+    text = result.get("text") if isinstance(result, dict) else None
+    if text is None:
+        raise RuntimeError("No transcription returned.")
+    return text

services/tts_service.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import requests
+from config import HUGGINGFACE_API_KEY
+# Bark model
+BARK_MODEL_ID = "suno/bark"
+def text_to_speech(text: str) -> bytes:
+    """
+    Convert text to speech (audio bytes) using Hugging Face Bark model.
+
+    Args:
+        text: The text to synthesize.
+
+    Returns:
+        The raw response body returned by the Inference API.
+
+    Raises:
+        RuntimeError: If the API answers with a non-200 status or an empty
+            body. RuntimeError subclasses Exception, so existing
+            ``except Exception`` callers keep working.
+    """
+    url = f"https://api-inference.huggingface.co/models/{BARK_MODEL_ID}"
+    headers = {
+        "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
+        "Content-Type": "application/json"
+    }
+    payload = {
+        "inputs": text
+    }
+    # requests waits indefinitely without an explicit timeout
+    response = requests.post(url, headers=headers, json=payload, timeout=120)
+    if response.status_code != 200:
+        raise RuntimeError(f"Bark API error: {response.text}")
+    if not response.content:
+        # A 200 with no body would otherwise be served as a zero-byte clip
+        raise RuntimeError("Bark API error: empty audio response")
+    # NOTE(review): callers label these bytes as WAV — confirm the format
+    # the API actually returns.
+    return response.content