malek-messaoudii committed on
Commit
4a13628
·
1 Parent(s): 218c6a3
models/audio.py CHANGED
@@ -1,49 +1,78 @@
1
- from pydantic import BaseModel, Field, ConfigDict
2
  from typing import Optional
3
 
4
- # ==============================
5
- # SPEECH TO TEXT RESPONSE
6
- # ==============================
7
  class STTResponse(BaseModel):
8
- model_config = ConfigDict(
9
- json_schema_extra={
 
 
 
 
 
 
10
  "example": {
11
  "text": "hello how are you",
12
- "model_name": "openai/whisper-large-v3",
13
  "language": "en",
14
  "duration_seconds": 3.2
15
  }
16
  }
17
- )
18
- text: str = Field(..., description="Transcribed text from the input audio")
19
- model_name: str = Field(..., description="STT model used for inference")
20
- language: Optional[str] = Field(None, description="Detected language")
21
- duration_seconds: Optional[float] = Field(
22
- None,
23
- description="Approximate audio duration in seconds"
24
- )
25
-
26
- # ==============================
27
- # TEXT TO SPEECH REQUEST / RESPONSE
28
- # ==============================
29
  class TTSRequest(BaseModel):
30
- model_config = ConfigDict(
31
- json_schema_extra={"example": {"text": "Hello, welcome to our AI system."}}
32
- )
33
  text: str = Field(..., min_length=1, max_length=500, description="Text to convert to speech")
 
 
 
 
 
 
 
 
34
 
35
  class TTSResponse(BaseModel):
36
- model_config = ConfigDict(
37
- json_schema_extra={
 
 
 
 
 
 
38
  "example": {
39
  "message": "Audio generated successfully",
40
  "audio_format": "wav",
41
- "length_seconds": 2.5,
42
- "model_name": "suno/bark"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  }
44
  }
45
- )
46
- message: str
47
- audio_format: str
48
- length_seconds: Optional[float] = None
49
- model_name: str
 
1
+ from pydantic import BaseModel, Field
2
  from typing import Optional
3
 
4
+
 
 
5
class STTResponse(BaseModel):
    """Response model for Speech-to-Text."""

    # `text` is the only required field; the rest default so service code
    # can return a minimal payload.
    text: str = Field(..., description="Transcribed text from audio")
    model_name: str = Field(default="gemini-2.5-flash", description="Model used")
    language: Optional[str] = Field(default="en", description="Detected language")
    duration_seconds: Optional[float] = Field(None, description="Audio duration")

    # Pydantic v2: plain-dict model_config replaces the deprecated
    # v1-style inner `class Config` (avoids PydanticDeprecatedSince20).
    model_config = {
        "json_schema_extra": {
            "example": {
                "text": "hello how are you",
                "model_name": "gemini-2.5-flash",
                "language": "en",
                "duration_seconds": 3.2
            }
        }
    }
21
+
22
+
 
 
 
 
 
 
 
 
 
 
23
class TTSRequest(BaseModel):
    """Request model for Text-to-Speech."""

    text: str = Field(..., min_length=1, max_length=500, description="Text to convert to speech")

    # Pydantic v2: plain-dict model_config replaces the deprecated
    # v1-style inner `class Config`.
    model_config = {
        "json_schema_extra": {
            "example": {
                "text": "Hello, welcome to our AI voice system."
            }
        }
    }
33
+
34
 
35
class TTSResponse(BaseModel):
    """Response model for Text-to-Speech."""

    message: str = Field(..., description="Status message")
    audio_format: str = Field(default="wav", description="Audio format")
    model_name: str = Field(default="gemini-2.5-flash-preview-tts", description="Model used")
    length_seconds: Optional[float] = Field(None, description="Generated audio duration")

    # Pydantic v2: plain-dict model_config replaces the deprecated
    # v1-style inner `class Config`.
    model_config = {
        "json_schema_extra": {
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "model_name": "gemini-2.5-flash-preview-tts",
                "length_seconds": 2.5
            }
        }
    }
51
+
52
+
53
class ChatbotRequest(BaseModel):
    """Request model for Chatbot."""

    text: str = Field(..., min_length=1, max_length=500, description="User query")

    # Pydantic v2: plain-dict model_config replaces the deprecated
    # v1-style inner `class Config`.
    model_config = {
        "json_schema_extra": {
            "example": {
                "text": "What is the weather today?"
            }
        }
    }
63
+
64
+
65
class ChatbotResponse(BaseModel):
    """Response model for Chatbot."""

    user_input: str = Field(..., description="User input text")
    bot_response: str = Field(..., description="Bot response text")
    model_name: str = Field(default="gemini-2.5-flash", description="Model used")

    # Pydantic v2: plain-dict model_config replaces the deprecated
    # v1-style inner `class Config`.
    model_config = {
        "json_schema_extra": {
            "example": {
                "user_input": "Hello",
                "bot_response": "Hi there! How can I help you?",
                "model_name": "gemini-2.5-flash"
            }
        }
    }
 
 
 
 
 
routes/audio.py CHANGED
@@ -1,91 +1,134 @@
1
  from fastapi import APIRouter, UploadFile, File, HTTPException
2
  from fastapi.responses import StreamingResponse
3
  import io
4
- from services.tts_service import generate_tts
 
5
  from services.stt_service import speech_to_text
 
 
 
 
 
6
 
7
  router = APIRouter(prefix="/audio", tags=["Audio"])
8
 
9
- # Allowed MIME types
10
- ALLOWED_AUDIO_TYPES = {
11
- "audio/wav",
12
- "audio/x-wav",
13
- "audio/mpeg", # mp3
14
- "audio/mp3", # mp3
15
- "audio/mp4", # sometimes m4a
16
- "audio/m4a" # m4a
17
- }
18
 
19
- # ------------------------
20
- # Text to Speech
21
- # ------------------------
22
  @router.post("/tts")
23
- async def tts(text: str):
24
- """
25
- Convert text to speech and return audio.
26
  """
27
- if not text or len(text) > 500:
28
- raise HTTPException(
29
- status_code=400,
30
- detail="Text must be between 1 and 500 characters"
31
- )
32
 
 
 
 
 
 
33
  try:
34
- audio_bytes = await generate_tts(text)
 
 
35
  except Exception as e:
 
36
  raise HTTPException(status_code=500, detail=str(e))
37
 
38
- return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/wav")
39
-
40
 
41
- # ------------------------
42
- # Speech to Text
43
- # ------------------------
44
- @router.post("/stt")
45
  async def stt(file: UploadFile = File(...)):
46
  """
47
- Accepts an uploaded audio file (wav, mp3, m4a) and returns the transcribed text.
 
 
 
 
 
48
  """
49
- # Validate MIME type
50
  if file.content_type not in ALLOWED_AUDIO_TYPES:
51
  raise HTTPException(
52
  status_code=400,
53
- detail=f"Unsupported audio format: {file.content_type}. Supported: WAV, MP3, M4A"
54
  )
55
-
56
  try:
 
57
  audio_bytes = await file.read()
58
  text = await speech_to_text(audio_bytes, file.filename)
 
 
 
 
 
 
 
59
  except Exception as e:
 
60
  raise HTTPException(status_code=500, detail=str(e))
61
-
62
- return {"text": text}
63
 
64
 
65
- # ------------------------
66
- # Voice Chatbot: User sends voice → TTS reply
67
- # ------------------------
68
  @router.post("/chatbot")
69
- async def chatbot(file: UploadFile = File(...)):
70
  """
71
- User sends an audio file, the system converts to text, generates response, and returns TTS audio.
 
 
 
 
 
 
 
 
 
 
72
  """
73
- # Validate MIME type
74
  if file.content_type not in ALLOWED_AUDIO_TYPES:
75
  raise HTTPException(
76
  status_code=400,
77
- detail=f"Unsupported audio format: {file.content_type}. Supported: WAV, MP3, M4A"
78
  )
79
-
80
  try:
 
 
 
81
  audio_bytes = await file.read()
82
  user_text = await speech_to_text(audio_bytes, file.filename)
83
-
84
- # Replace this with your NLP or chatbot logic
85
- response_text = f"You said: {user_text}"
86
-
 
 
 
87
  audio_response = await generate_tts(response_text)
 
 
 
 
88
  except Exception as e:
 
89
  raise HTTPException(status_code=500, detail=str(e))
90
 
91
- return StreamingResponse(io.BytesIO(audio_response), media_type="audio/wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import APIRouter, UploadFile, File, HTTPException
2
  from fastapi.responses import StreamingResponse
3
  import io
4
+ import logging
5
+ from config import ALLOWED_AUDIO_TYPES, MAX_TEXT_LENGTH, MIN_TEXT_LENGTH
6
  from services.stt_service import speech_to_text
7
+ from services.tts_service import generate_tts
8
+ from services.chatbot_service import get_chatbot_response
9
+ from models.audio import STTResponse, TTSRequest, TTSResponse, ChatbotRequest, ChatbotResponse
10
+
11
+ logger = logging.getLogger(__name__)
12
 
13
  router = APIRouter(prefix="/audio", tags=["Audio"])
14
 
 
 
 
 
 
 
 
 
 
15
 
 
 
 
16
@router.post("/tts")
async def tts(request: TTSRequest):
    """
    Convert text to speech and return audio file.

    Example:
    - POST /audio/tts
    - Body: {"text": "Hello, welcome to our system"}
    - Returns: WAV audio file
    """
    try:
        # Lazy %-style args: the message is only built if the level is enabled.
        logger.info("TTS request received for text: '%s'", request.text)
        audio_bytes = await generate_tts(request.text)
        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/wav")
    except Exception as e:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("TTS error")
        raise HTTPException(status_code=500, detail=str(e)) from e
33
 
 
 
34
 
35
@router.post("/stt", response_model=STTResponse)
async def stt(file: UploadFile = File(...)):
    """
    Convert audio file to text.

    Example:
    - POST /audio/stt
    - File: audio.mp3 (or .wav, .m4a)
    - Returns: {"text": "transcribed text", "model_name": "gemini-2.5-flash", ...}
    """
    # Validate file type before reading any bytes.
    if file.content_type not in ALLOWED_AUDIO_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {file.content_type}. Supported: WAV, MP3, M4A"
        )

    try:
        # Lazy %-style args: the message is only built if the level is enabled.
        logger.info("STT request received for file: %s", file.filename)
        audio_bytes = await file.read()
        text = await speech_to_text(audio_bytes, file.filename)

        return STTResponse(
            text=text,
            model_name="gemini-2.5-flash",
            language="en",
            duration_seconds=None
        )
    except Exception as e:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("STT error")
        raise HTTPException(status_code=500, detail=str(e)) from e
 
 
66
 
67
 
 
 
 
68
@router.post("/chatbot")
async def chatbot_voice(file: UploadFile = File(...)):
    """
    Full voice chatbot flow (Audio -> Text -> Response -> Audio).

    Example:
    - POST /audio/chatbot
    - File: user_voice.mp3
    - Returns: Response audio file (WAV)

    Process:
    1. Converts user's audio to text (STT)
    2. Generates chatbot response to user's text
    3. Converts response back to audio (TTS)
    """
    # Validate file type before reading any bytes.
    if file.content_type not in ALLOWED_AUDIO_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {file.content_type}. Supported: WAV, MP3, M4A"
        )

    try:
        # Lazy %-style args: the message is only built if the level is enabled.
        logger.info("Voice chatbot request received for file: %s", file.filename)

        # Step 1: Convert audio to text
        audio_bytes = await file.read()
        user_text = await speech_to_text(audio_bytes, file.filename)
        logger.info("Step 1 - STT: %s", user_text)

        # Step 2: Generate chatbot response
        response_text = await get_chatbot_response(user_text)
        logger.info("Step 2 - Response: %s", response_text)

        # Step 3: Convert response to audio
        audio_response = await generate_tts(response_text)
        logger.info("Step 3 - TTS: Complete")

        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/wav")

    except Exception as e:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("Voice chatbot error")
        raise HTTPException(status_code=500, detail=str(e)) from e
111
 
112
+
113
@router.post("/chatbot-text", response_model=ChatbotResponse)
async def chatbot_text(request: ChatbotRequest):
    """
    Chatbot interaction with text input/output (no audio).

    Example:
    - POST /audio/chatbot-text
    - Body: {"text": "What is the capital of France?"}
    - Returns: {"user_input": "What is...", "bot_response": "The capital...", ...}
    """
    try:
        # Lazy %-style args: the message is only built if the level is enabled.
        logger.info("Text chatbot request: %s", request.text)
        response_text = await get_chatbot_response(request.text)

        return ChatbotResponse(
            user_input=request.text,
            bot_response=response_text,
            model_name="gemini-2.5-flash"
        )
    except Exception as e:
        # logger.exception records the full traceback, not just str(e).
        logger.exception("Text chatbot error")
        raise HTTPException(status_code=500, detail=str(e)) from e
services/chatbot_service.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from services.gemini_client import get_gemini_client
2
+ import logging
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+
7
async def get_chatbot_response(user_text: str) -> str:
    """
    Generate chatbot response using Gemini API.

    Args:
        user_text: User input text

    Returns:
        Chatbot response text; on any failure a canned echo fallback
        is returned instead of raising, so the voice flow stays alive.
    """
    try:
        client = get_gemini_client()

        logger.info("Generating chatbot response for: '%s'", user_text)

        # Guidance for the model. NOTE(review): sent as a leading user turn;
        # confirm whether the SDK's system_instruction config would be preferable.
        system_prompt = """You are a helpful, friendly AI assistant.
        Respond concisely and naturally to user queries.
        Keep responses brief (1-2 sentences) for voice interaction."""

        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=[
                {"role": "user", "parts": [{"text": system_prompt}]},
                {"role": "user", "parts": [{"text": user_text}]}
            ]
        )

        response_text = response.text
        # response.text can be None/empty when no text part is returned
        # (e.g. safety block); treat that as a failure so the fallback fires
        # instead of silently handing None to the TTS step.
        if not response_text:
            raise ValueError("Gemini returned an empty response")

        logger.info("✓ Response generated: '%s'", response_text)

        return response_text

    except Exception:
        # Deliberate best-effort fallback; log the full traceback.
        logger.exception("✗ Chatbot response failed")
        return f"I understood you said: '{user_text}'. Could you tell me more?"
services/gemini_client.py CHANGED
@@ -1,8 +1,18 @@
1
  from google.genai import Client
2
- import os
 
 
 
 
3
 
4
  def get_gemini_client():
5
- api_key = os.getenv("GOOGLE_GENAI_API_KEY")
6
- if not api_key:
7
- raise ValueError("Missing GOOGLE_GENAI_API_KEY environment variable")
8
- return Client(api_key=api_key)
 
 
 
 
 
 
 
1
  from google.genai import Client
2
+ from config import GOOGLE_GENAI_API_KEY
3
+ import logging
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
 
8
def get_gemini_client():
    """
    Initialize and return a Gemini client.

    Raises:
        ValueError: If the API key is missing or client construction fails.
    """
    # Fail fast with a clear message when the key is absent; Client() may
    # defer key validation until the first request, which would surface the
    # problem far from its cause. (Restores the pre-refactor explicit check.)
    if not GOOGLE_GENAI_API_KEY:
        raise ValueError("Missing GOOGLE_GENAI_API_KEY configuration")
    try:
        client = Client(api_key=GOOGLE_GENAI_API_KEY)
        logger.info("✓ Gemini client initialized successfully")
        return client
    except Exception as e:
        logger.exception("✗ Failed to initialize Gemini client")
        # Chain the original error so the root cause stays in the traceback.
        raise ValueError(f"Gemini client initialization failed: {str(e)}") from e
services/stt_service.py CHANGED
@@ -1,23 +1,49 @@
1
  from services.gemini_client import get_gemini_client
2
  from google.genai import types
3
- import mimetypes # <- Add this
 
 
 
 
4
 
5
  async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
6
  """
7
- Convert audio bytes to text using Gemini API. Supports WAV, MP3, M4A.
 
 
 
 
 
 
 
 
 
 
8
  """
9
- client = get_gemini_client()
10
-
11
- # Detect MIME type from filename
12
- mime_type, _ = mimetypes.guess_type(filename)
13
- if mime_type is None:
14
- mime_type = "audio/wav" # fallback
15
-
16
- audio_file = types.File(data=audio_bytes, mime_type=mime_type)
17
-
18
- response = client.models.generate_content(
19
- model="gemini-2.5-flash",
20
- contents=[audio_file]
21
- )
22
-
23
- return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from services.gemini_client import get_gemini_client
2
  from google.genai import types
3
+ import mimetypes
4
+ import logging
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
 
9
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using Gemini API.

    Args:
        audio_bytes: Raw audio file bytes
        filename: Name of the audio file (used to detect format)

    Returns:
        Transcribed text

    Raises:
        RuntimeError: If transcription fails (subclass of Exception, so
            existing `except Exception` callers still catch it).
    """
    try:
        client = get_gemini_client()

        # Detect MIME type from filename
        mime_type, _ = mimetypes.guess_type(filename)
        if mime_type is None:
            mime_type = "audio/wav"  # fallback

        logger.info("Converting audio to text (format: %s)", mime_type)

        # Inline audio must be sent as a Part; types.File describes an
        # already-uploaded file and is not valid inline content.
        audio_part = types.Part.from_bytes(data=audio_bytes, mime_type=mime_type)

        # An explicit instruction makes the model transcribe rather than
        # describe the audio.
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=["Transcribe this audio verbatim.", audio_part]
        )

        transcribed_text = response.text
        logger.info("✓ STT successful: '%s'", transcribed_text)

        return transcribed_text

    except Exception as e:
        logger.exception("✗ STT failed")
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Speech-to-text conversion failed: {str(e)}") from e
services/tts_service.py CHANGED
@@ -1,25 +1,53 @@
1
  from services.gemini_client import get_gemini_client
2
  from google.genai import types
3
  import base64
 
4
 
5
- async def generate_tts(text: str) -> bytes:
6
- client = get_gemini_client()
7
-
8
- response = client.models.generate_content(
9
- model="gemini-2.5-flash-preview-tts",
10
- contents=text,
11
- config=types.GenerateContentConfig(
12
- response_modalities=["AUDIO"],
13
- speech_config=types.SpeechConfig(
14
- voice_config=types.VoiceConfig(
15
- prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
16
- )
17
- ),
18
- ),
19
- )
20
 
21
- # Decode base64 audio into bytes
22
- audio_base64 = response.candidates[0].content.parts[0].inline_data.data
23
- audio_bytes = base64.b64decode(audio_base64)
24
 
25
- return audio_bytes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from services.gemini_client import get_gemini_client
2
  from google.genai import types
3
  import base64
4
+ import logging
5
 
6
+ logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
 
8
 
9
async def generate_tts(text: str) -> bytes:
    """
    Convert text to speech using Gemini API.

    Args:
        text: Text to convert to speech

    Returns:
        Audio bytes as a complete WAV file (header + PCM frames)

    Raises:
        RuntimeError: If TTS generation fails (subclass of Exception, so
            existing `except Exception` callers still catch it).
    """
    # Local stdlib imports keep this module's top-level deps unchanged.
    import io
    import wave

    try:
        client = get_gemini_client()

        logger.info("Generating speech for: '%s'", text)

        # Call Gemini TTS API
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=text,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name="Kore"  # Options: Kore, Peri, Charon, Fenrir, Orbit
                        )
                    )
                ),
            ),
        )

        data = response.candidates[0].content.parts[0].inline_data.data
        # The Python SDK returns inline_data.data as raw bytes; only decode
        # when the payload arrives base64-encoded as a string.
        pcm_bytes = base64.b64decode(data) if isinstance(data, str) else data

        # Gemini TTS emits raw 16-bit mono PCM at 24 kHz; wrap it in a WAV
        # container so the audio/wav StreamingResponse is actually playable.
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)      # 16-bit samples
            wav_file.setframerate(24000)  # per Gemini TTS output spec
            wav_file.writeframes(pcm_bytes)
        audio_bytes = buffer.getvalue()

        logger.info("✓ TTS successful: %d bytes generated", len(audio_bytes))

        return audio_bytes

    except Exception as e:
        logger.exception("✗ TTS failed")
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Text-to-speech generation failed: {str(e)}") from e