malek-messaoudii committed
Commit 95cb26e · 1 Parent(s): 918acab

Refactor audio processing to utilize free models and enhance logging; update TTS and STT services for improved functionality

config.py CHANGED
@@ -3,6 +3,10 @@
 import os
 from pathlib import Path
 from dotenv import load_dotenv
+import logging
+
+# Configure logging
+logger = logging.getLogger(__name__)
 
 # Load environment variables from .env file
 load_dotenv()
@@ -38,11 +42,10 @@ CORS_CREDENTIALS = True
 CORS_METHODS = ["*"]
 CORS_HEADERS = ["*"]
 
-GOOGLE_GENAI_API_KEY = os.getenv("GOOGLE_GENAI_API_KEY")
-
-# Validate API key
-if not GOOGLE_GENAI_API_KEY:
-    raise ValueError("Missing GOOGLE_GENAI_API_KEY environment variable. Add it to .env file")
+# Free model configurations
+STT_MODEL_ID = "openai/whisper-small"  # Free Whisper model for STT
+CHATBOT_MODEL_ID = "microsoft/DialoGPT-medium"  # Free chatbot model
+TTS_USE_GTTS = True  # Use gTTS (Google Text-to-Speech) free tier
 
 # Audio settings
 ALLOWED_AUDIO_TYPES = {
@@ -60,9 +63,8 @@ MAX_AUDIO_SIZE = 10 * 1024 * 1024 # 10MB
 
 # Validate configuration
 def validate_config():
-    required_vars = ["GOOGLE_GENAI_API_KEY"]
-    missing = [var for var in required_vars if not os.getenv(var)]
-    if missing:
-        raise ValueError(f"Missing required environment variables: {missing}")
+    """Validate that we can use free models"""
+    logger.info("✓ Using free models for STT, TTS, and Chatbot")
+    return True
 
 validate_config()
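
Note: config.py now defines STT_MODEL_ID and CHATBOT_MODEL_ID, but the services further down hard-code the same model names. A minimal sketch of reading the IDs from config instead (a hypothetical follow-up, not part of this commit):

    # services/stt_service.py (sketch) - build the pipeline from the configured ID
    from transformers import pipeline
    from config import STT_MODEL_ID

    def load_stt_model():
        """Load the configured Whisper model instead of hard-coding the name."""
        return pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device="cpu")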
models/audio.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 class STTResponse(BaseModel):
     """Response model for Speech-to-Text"""
     text: str = Field(..., description="Transcribed text from audio")
-    model_name: str = Field(default="gemini-2.5-flash", description="Model used")
+    model_name: str = Field(default="whisper-small", description="Model used")
     language: Optional[str] = Field(default="en", description="Detected language")
     duration_seconds: Optional[float] = Field(None, description="Audio duration")
 
@@ -13,7 +13,7 @@ class STTResponse(BaseModel):
     json_schema_extra = {
         "example": {
             "text": "hello how are you",
-            "model_name": "gemini-2.5-flash",
+            "model_name": "whisper-small",
             "language": "en",
             "duration_seconds": 3.2
         }
@@ -35,16 +35,16 @@ class TTSRequest(BaseModel):
 class TTSResponse(BaseModel):
     """Response model for Text-to-Speech"""
     message: str = Field(..., description="Status message")
-    audio_format: str = Field(default="wav", description="Audio format")
-    model_name: str = Field(default="gemini-2.5-flash-preview-tts", description="Model used")
+    audio_format: str = Field(default="mp3", description="Audio format")
+    model_name: str = Field(default="gTTS", description="Model used")
     length_seconds: Optional[float] = Field(None, description="Generated audio duration")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "message": "Audio generated successfully",
-                "audio_format": "wav",
-                "model_name": "gemini-2.5-flash-preview-tts",
+                "audio_format": "mp3",
+                "model_name": "gTTS",
                 "length_seconds": 2.5
             }
         }
@@ -66,13 +66,13 @@ class ChatbotResponse(BaseModel):
     """Response model for Chatbot"""
     user_input: str = Field(..., description="User input text")
     bot_response: str = Field(..., description="Bot response text")
-    model_name: str = Field(default="gemini-2.5-flash", description="Model used")
+    model_name: str = Field(default="DialoGPT-medium", description="Model used")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "user_input": "Hello",
                 "bot_response": "Hi there! How can I help you?",
-                "model_name": "gemini-2.5-flash"
+                "model_name": "DialoGPT-medium"
             }
         }
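
For reference, the updated defaults can be checked directly (a sketch assuming Pydantic v2, which the json_schema_extra key suggests):

    from models.audio import STTResponse

    resp = STTResponse(text="hello how are you")
    print(resp.model_name)         # "whisper-small"
    print(resp.model_dump_json())  # payload shape returned by /audio/stt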
requirements.txt CHANGED
@@ -9,3 +9,5 @@ protobuf>=3.20.0
 huggingface_hub>=0.19.0
 python-multipart
 google-genai>=0.4.0
+gtts==2.5.1
+
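
Note that the new services also import torch, torchaudio, transformers, and numpy; those are presumably pinned earlier in requirements.txt. A one-off sanity check for the newly pinned gtts package (requires network access, since gTTS calls Google's endpoint):

    import io
    from gtts import gTTS

    buf = io.BytesIO()
    gTTS(text="hello", lang="en").write_to_fp(buf)
    print(len(buf.getvalue()), "bytes of MP3")  # non-zero on success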
routes/audio.py CHANGED
@@ -3,30 +3,38 @@ from fastapi.responses import StreamingResponse
 import io
 import logging
 from config import ALLOWED_AUDIO_TYPES, MAX_AUDIO_SIZE
-from services.stt_service import speech_to_text
+from services.stt_service import speech_to_text, load_stt_model
 from services.tts_service import generate_tts
-from services.chatbot_service import get_chatbot_response
+from services.chatbot_service import get_chatbot_response, load_chatbot_model
 from models.audio import STTResponse, TTSRequest, TTSResponse, ChatbotRequest, ChatbotResponse
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/audio", tags=["Audio"])
 
+# Pre-load models on router startup
+@router.on_event("startup")
+async def startup_event():
+    """Load models when the router starts"""
+    logger.info("Loading free STT and Chatbot models...")
+    load_stt_model()
+    load_chatbot_model()
+
 
 @router.post("/tts")
 async def tts(request: TTSRequest):
     """
-    Convert text to speech and return audio file.
+    Convert text to speech and return audio file using free gTTS.
 
     Example:
    - POST /audio/tts
    - Body: {"text": "Hello, welcome to our system"}
-    - Returns: WAV audio file
+    - Returns: MP3 audio file
     """
     try:
         logger.info(f"TTS request received for text: '{request.text}'")
         audio_bytes = await generate_tts(request.text)
-        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/wav")
+        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/mp3")
     except Exception as e:
         logger.error(f"TTS error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
@@ -35,12 +43,12 @@ async def tts(request: TTSRequest):
 @router.post("/stt", response_model=STTResponse)
 async def stt(file: UploadFile = File(...)):
     """
-    Convert audio file to text.
+    Convert audio file to text using free Whisper model.
 
     Example:
    - POST /audio/stt
    - File: audio.mp3 (or .wav, .m4a)
-    - Returns: {"text": "transcribed text", "model_name": "gemini-2.5-flash", ...}
+    - Returns: {"text": "transcribed text", "model_name": "whisper-small", ...}
     """
     # Validate file type
     if file.content_type not in ALLOWED_AUDIO_TYPES:
@@ -64,7 +72,7 @@ async def stt(file: UploadFile = File(...)):
 
         return STTResponse(
             text=text,
-            model_name="gemini-2.5-flash",
+            model_name="whisper-small",
             language="en",
             duration_seconds=None
         )
@@ -76,17 +84,12 @@ async def stt(file: UploadFile = File(...)):
 @router.post("/chatbot")
 async def chatbot_voice(file: UploadFile = File(...)):
     """
-    Full voice chatbot flow (Audio → Text → Response → Audio).
+    Full voice chatbot flow using free models (Audio → Text → Response → Audio).
 
     Example:
    - POST /audio/chatbot
    - File: user_voice.mp3
-    - Returns: Response audio file (WAV)
-
-    Process:
-    1. Converts user's audio to text (STT)
-    2. Generates chatbot response to user's text
-    3. Converts response back to audio (TTS)
+    - Returns: Response audio file (MP3)
     """
     # Validate file type
     if file.content_type not in ALLOWED_AUDIO_TYPES:
@@ -119,7 +122,7 @@ async def chatbot_voice(file: UploadFile = File(...)):
         audio_response = await generate_tts(response_text)
         logger.info("Step 3 - TTS: Complete")
 
-        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/wav")
+        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/mp3")
 
     except Exception as e:
         logger.error(f"Voice chatbot error: {str(e)}")
@@ -129,7 +132,7 @@ async def chatbot_voice(file: UploadFile = File(...)):
 @router.post("/chatbot-text", response_model=ChatbotResponse)
 async def chatbot_text(request: ChatbotRequest):
     """
-    Chatbot interaction with text input/output (no audio).
+    Chatbot interaction with text input/output using free DialoGPT model.
 
     Example:
    - POST /audio/chatbot-text
@@ -143,7 +146,7 @@ async def chatbot_text(request: ChatbotRequest):
         return ChatbotResponse(
             user_input=request.text,
             bot_response=response_text,
-            model_name="gemini-2.5-flash"
+            model_name="DialoGPT-medium"
         )
     except Exception as e:
         logger.error(f"Text chatbot error: {str(e)}")
services/chatbot_service.py CHANGED
@@ -1,46 +1,121 @@
-from services.gemini_client import get_gemini_client
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import logging
+import torch
 
 logger = logging.getLogger(__name__)
 
-async def get_chatbot_response(user_text: str) -> str:
+# Global chatbot components
+chatbot_pipeline = None
+chat_history = {}
+
+def load_chatbot_model():
+    """Load the free DialoGPT model for chatbot"""
+    global chatbot_pipeline
+    try:
+        logger.info("Loading DialoGPT chatbot model...")
+
+        # Use DialoGPT medium for better responses
+        chatbot_pipeline = pipeline(
+            "text-generation",
+            model="microsoft/DialoGPT-medium",
+            tokenizer="microsoft/DialoGPT-medium",
+            device="cpu"
+        )
+        logger.info("✓ DialoGPT chatbot model loaded successfully")
+    except Exception as e:
+        logger.error(f"✗ Failed to load DialoGPT model: {str(e)}")
+        chatbot_pipeline = None
+
+async def get_chatbot_response(user_text: str, user_id: str = "default") -> str:
     """
-    Generate chatbot response using Gemini API.
+    Generate chatbot response using free DialoGPT model.
 
     Args:
         user_text: User input text
+        user_id: Unique user ID for maintaining conversation history
 
     Returns:
         Chatbot response text
-
-    Raises:
-        Exception: If response generation fails
     """
+    global chatbot_pipeline
+
     try:
-        client = get_gemini_client()
+        if chatbot_pipeline is None:
+            load_chatbot_model()
+            if chatbot_pipeline is None:
+                return get_fallback_response(user_text)
 
         logger.info(f"Generating chatbot response for: '{user_text}'")
 
-        # Create a system prompt for better responses
-        system_prompt = """You are a helpful, friendly AI assistant.
-        Respond concisely and naturally to user queries.
-        Keep responses brief (1-2 sentences) for voice interaction."""
+        # Get or initialize chat history for this user
+        if user_id not in chat_history:
+            chat_history[user_id] = []
 
-        # Combine system prompt with user input
-        full_prompt = f"{system_prompt}\n\nUser: {user_text}"
+        # Prepare conversation context
+        conversation = chat_history[user_id] + [user_text]
+        context = " ".join(conversation[-3:])  # Use last 3 exchanges as context
 
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",  # Using a model that definitely exists
-            contents=[full_prompt]
+        # Generate response
+        response = chatbot_pipeline(
+            context,
+            max_length=150,
+            num_return_sequences=1,
+            pad_token_id=chatbot_pipeline.tokenizer.eos_token_id,
+            no_repeat_ngram_size=3,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.7
         )
 
-        response_text = response.text
-        logger.info(f"✓ Response generated: '{response_text}'")
+        bot_response = response[0]['generated_text'].strip()
+
+        # Extract only the new response (remove the input context)
+        if context in bot_response:
+            bot_response = bot_response.replace(context, "").strip()
+
+        # Clean up the response
+        bot_response = clean_response(bot_response)
+
+        # Update chat history
+        chat_history[user_id].extend([user_text, bot_response])
 
-        return response_text
+        # Keep only recent history (last 4 exchanges)
+        if len(chat_history[user_id]) > 8:
+            chat_history[user_id] = chat_history[user_id][-8:]
+
+        logger.info(f"✓ Response generated: '{bot_response}'")
+        return bot_response
 
     except Exception as e:
         logger.error(f"✗ Chatbot response failed: {str(e)}")
-        # Fallback response
-        return f"I understood you said: '{user_text}'. Could you tell me more?"
+        return get_fallback_response(user_text)
+
+
+def clean_response(response: str) -> str:
+    """Clean and format the chatbot response"""
+    # Remove extra spaces
+    response = ' '.join(response.split())
+
+    # Ensure proper sentence ending
+    if response and not response.endswith(('.', '!', '?')):
+        response += '.'
+
+    # Limit response length
+    if len(response) > 200:
+        response = response[:197] + '...'
+
+    return response
+
+
+def get_fallback_response(user_text: str) -> str:
+    """Provide fallback responses when model fails"""
+    fallback_responses = [
+        f"I understand you said: '{user_text}'. Could you tell me more about that?",
+        f"That's interesting! You mentioned: '{user_text}'. What would you like to know?",
+        f"Thanks for sharing! Regarding '{user_text}', how can I help you?",
+        f"I heard you say: '{user_text}'. Could you elaborate on that?"
+    ]
+
+    import random
+    return random.choice(fallback_responses)
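
One caveat: the pipeline call above joins turns with spaces, while DialoGPT was trained with turns separated by the tokenizer's EOS token. A sketch of that convention (an alternative to the pipeline call, not part of this commit) tends to produce cleaner replies:

    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    def reply(history: list[str], user_text: str) -> str:
        """Encode each turn followed by EOS, then decode only the new tokens."""
        prompt = tokenizer.eos_token.join(history + [user_text]) + tokenizer.eos_token
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output_ids = model.generate(
            input_ids,
            max_new_tokens=60,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
        return tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)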
services/gemini_client.py CHANGED
@@ -1,18 +1,9 @@
-from google.genai import Client
-from config import GOOGLE_GENAI_API_KEY
+# This file is no longer needed since we're using free models
 import logging
 
 logger = logging.getLogger(__name__)
 
-
 def get_gemini_client():
-    """
-    Initialize and return Gemini client.
-    """
-    try:
-        client = Client(api_key=GOOGLE_GENAI_API_KEY)
-        logger.info("✓ Gemini client initialized successfully")
-        return client
-    except Exception as e:
-        logger.error(f"✗ Failed to initialize Gemini client: {str(e)}")
-        raise ValueError(f"Gemini client initialization failed: {str(e)}")
+    """Gemini client is no longer used"""
+    logger.warning("Gemini client is deprecated - using free models instead")
+    raise Exception("Gemini API is no longer used. Free models are being used instead.")
services/stt_service.py CHANGED
@@ -1,66 +1,72 @@
-from services.gemini_client import get_gemini_client
-import base64
-import mimetypes
+import torch
+import torchaudio
+from transformers import pipeline
 import logging
+import tempfile
+import os
 
 logger = logging.getLogger(__name__)
 
+# Global STT pipeline
+stt_pipeline = None
+
+def load_stt_model():
+    """Load the free Whisper model for speech-to-text"""
+    global stt_pipeline
+    try:
+        logger.info("Loading Whisper STT model...")
+        stt_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-small",  # Free model
+            device="cpu"  # Use CPU to avoid GPU requirements
+        )
+        logger.info("✓ Whisper STT model loaded successfully")
+    except Exception as e:
+        logger.error(f"✗ Failed to load Whisper model: {str(e)}")
+        stt_pipeline = None
 
 async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
     """
-    Convert audio bytes to text using Gemini API.
+    Convert audio bytes to text using free Whisper model.
 
     Args:
         audio_bytes: Raw audio file bytes
-        filename: Name of the audio file (used to detect format)
+        filename: Name of the audio file
 
     Returns:
         Transcribed text
-
-    Raises:
-        Exception: If transcription fails
     """
+    global stt_pipeline
+
     try:
-        client = get_gemini_client()
-
-        # Detect MIME type from filename
-        mime_type, _ = mimetypes.guess_type(filename)
-        if mime_type is None:
-            mime_type = "audio/wav"  # fallback
-
-        logger.info(f"Converting audio to text (format: {mime_type})")
-
-        # Convert audio to base64
-        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
-
-        # Create proper content structure for Gemini
-        contents = [
-            {
-                "parts": [
-                    {
-                        "inline_data": {
-                            "mime_type": mime_type,
-                            "data": audio_b64
-                        }
-                    },
-                    {
-                        "text": "Transcribe this audio to text."
-                    }
-                ]
-            }
-        ]
-
-        # Call Gemini API
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",  # Using a model that supports multimodal
-            contents=contents
-        )
+        if stt_pipeline is None:
+            load_stt_model()
+            if stt_pipeline is None:
+                raise Exception("STT model failed to load")
 
-        transcribed_text = response.text.strip()
-        logger.info(f"✓ STT successful: '{transcribed_text}'")
+        logger.info(f"Converting audio to text using Whisper")
 
-        return transcribed_text
+        # Save audio bytes to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
+            temp_audio.write(audio_bytes)
+            temp_audio_path = temp_audio.name
 
+        try:
+            # Transcribe using Whisper
+            result = stt_pipeline(temp_audio_path)
+            transcribed_text = result.get("text", "").strip()
+
+            if not transcribed_text:
+                transcribed_text = "Sorry, I couldn't understand the audio."
+
+            logger.info(f"✓ STT successful: '{transcribed_text}'")
+            return transcribed_text
+
+        finally:
+            # Clean up temporary file
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
+
     except Exception as e:
         logger.error(f"✗ STT failed: {str(e)}")
         raise Exception(f"Speech-to-text conversion failed: {str(e)}")
services/tts_service.py CHANGED
@@ -1,63 +1,111 @@
-from services.gemini_client import get_gemini_client
-from google.genai import types
-import base64
 import logging
+import io
+import wave
+import numpy as np
 
 logger = logging.getLogger(__name__)
 
+# Try to import gTTS, but provide fallback if not available
+try:
+    from gtts import gTTS
+    GTTS_AVAILABLE = True
+except ImportError:
+    GTTS_AVAILABLE = False
+    logger.warning("gTTS not available. Using fallback audio generation.")
+
 
 async def generate_tts(text: str) -> bytes:
     """
-    Convert text to speech using Gemini API.
+    Convert text to speech using free gTTS (Google Text-to-Speech).
 
     Args:
         text: Text to convert to speech
 
     Returns:
-        Audio bytes in WAV format
+        Audio bytes in MP3 format
 
     Raises:
         Exception: If TTS generation fails
     """
     try:
-        client = get_gemini_client()
-
         logger.info(f"Generating speech for: '{text}'")
 
-        # For TTS, we need to use the specific TTS endpoint
-        # Note: This might require different API calls based on Gemini's actual TTS API
-
-        # Temporary fallback: Use regular model with text-to-speech request
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",
-            contents=[f"Convert this to speech: {text}"],
-            config=types.GenerateContentConfig(
-                response_mime_type="audio/wav",
-            ),
-        )
-
-        # Extract audio data from response
-        # This part depends on the actual Gemini TTS API response structure
-        if (response.candidates and
-            len(response.candidates) > 0 and
-            response.candidates[0].content and
-            response.candidates[0].content.parts and
-            len(response.candidates[0].content.parts) > 0):
-
-            part = response.candidates[0].content.parts[0]
-            if hasattr(part, 'inline_data') and part.inline_data:
-                audio_bytes = base64.b64decode(part.inline_data.data)
-            else:
-                # If no audio data, create a fallback audio or raise error
-                raise Exception("No audio data in response")
+        # Use gTTS if available
+        if GTTS_AVAILABLE:
+            tts = gTTS(text=text, lang='en', slow=False)
+            audio_buffer = io.BytesIO()
+            tts.write_to_fp(audio_buffer)
+            audio_bytes = audio_buffer.getvalue()
         else:
-            raise Exception("Invalid response format from TTS service")
+            # Fallback to simple tone generation
+            audio_bytes = generate_fallback_audio(text)
 
         logger.info(f"✓ TTS successful: {len(audio_bytes)} bytes generated")
-
         return audio_bytes
 
     except Exception as e:
         logger.error(f"✗ TTS failed: {str(e)}")
-        # Fallback: Return a simple error message as text
-        raise Exception(f"Text-to-speech generation failed: {str(e)}")
+        # Ultimate fallback
+        return generate_silent_audio()
+
+
+def generate_fallback_audio(text: str) -> bytes:
+    """
+    Generate a simple tone-based audio file as fallback.
+    """
+    try:
+        # Create a simple sine wave
+        sample_rate = 22050
+        duration = max(1.0, min(3.0, len(text) * 0.1))
+
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
+
+        # Generate tones that vary with the text length
+        base_freq = 440  # A4 note
+        # Add some variation based on text
+        freq_variation = min(200, len(text) * 5)
+        tone = 0.3 * np.sin(2 * np.pi * (base_freq + freq_variation) * t)
+
+        # Convert to 16-bit PCM
+        audio_data = (tone * 32767).astype(np.int16)
+
+        # Create WAV file in memory
+        buffer = io.BytesIO()
+        with wave.open(buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes = 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(audio_data.tobytes())
+
+        return buffer.getvalue()
+
+    except Exception as e:
+        logger.error(f"Fallback audio generation failed: {str(e)}")
+        return generate_silent_audio()
+
+
+def generate_silent_audio() -> bytes:
+    """
+    Generate a short silent audio file as ultimate fallback.
+    """
+    try:
+        sample_rate = 22050
+        duration = 1.0
+
+        # Generate silence
+        silent_data = np.zeros(int(sample_rate * duration), dtype=np.int16)
+
+        # Create WAV file in memory
+        buffer = io.BytesIO()
+        with wave.open(buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes = 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(silent_data.tobytes())
+
+        return buffer.getvalue()
+
+    except Exception as e:
+        logger.error(f"Silent audio generation failed: {str(e)}")
+        # Return empty bytes as last resort
+        return b""
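
A quick local check of the rewritten service. One caveat worth noting: the fallback paths emit WAV bytes while the gTTS path emits MP3, so the audio/mp3 media type set in routes/audio.py only matches the happy path:

    import asyncio
    from services.tts_service import generate_tts

    audio = asyncio.run(generate_tts("Hello, welcome to our system"))
    with open("out.mp3", "wb") as f:
        f.write(audio)
    print(f"wrote {len(audio)} bytes")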