malek-messaoudii committed
Commit 95cb26e · 1 Parent(s): 918acab

Refactor audio processing to utilize free models and enhance logging; update TTS and STT services for improved functionality

config.py CHANGED
@@ -3,6 +3,10 @@
 import os
 from pathlib import Path
 from dotenv import load_dotenv
+import logging
+
+# Configure logging
+logger = logging.getLogger(__name__)
 
 # Load environment variables from .env file
 load_dotenv()
@@ -38,11 +42,10 @@ CORS_CREDENTIALS = True
 CORS_METHODS = ["*"]
 CORS_HEADERS = ["*"]
 
-GOOGLE_GENAI_API_KEY = os.getenv("GOOGLE_GENAI_API_KEY")
-
-# Validate API key
-if not GOOGLE_GENAI_API_KEY:
-    raise ValueError("Missing GOOGLE_GENAI_API_KEY environment variable. Add it to .env file")
+# Free model configurations
+STT_MODEL_ID = "openai/whisper-small"  # Free Whisper model for STT
+CHATBOT_MODEL_ID = "microsoft/DialoGPT-medium"  # Free chatbot model
+TTS_USE_GTTS = True  # Use gTTS (Google Text-to-Speech) free tier
 
 # Audio settings
 ALLOWED_AUDIO_TYPES = {
@@ -60,9 +63,8 @@ MAX_AUDIO_SIZE = 10 * 1024 * 1024 # 10MB
 
 # Validate configuration
 def validate_config():
-    required_vars = ["GOOGLE_GENAI_API_KEY"]
-    missing = [var for var in required_vars if not os.getenv(var)]
-    if missing:
-        raise ValueError(f"Missing required environment variables: {missing}")
+    """Validate that we can use free models"""
+    logger.info("✓ Using free models for STT, TTS, and Chatbot")
+    return True
 
 validate_config()
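
Note: config.py now defines STT_MODEL_ID and CHATBOT_MODEL_ID, but the services further down hard-code the same model names. A minimal sketch of reading the IDs from config instead (a hypothetical follow-up, not part of this commit):

    # services/stt_service.py (sketch) - build the pipeline from the configured ID
    from transformers import pipeline
    from config import STT_MODEL_ID

    def load_stt_model():
        """Load the configured Whisper model instead of hard-coding the name."""
        return pipeline("automatic-speech-recognition", model=STT_MODEL_ID, device="cpu")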
models/audio.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 class STTResponse(BaseModel):
     """Response model for Speech-to-Text"""
     text: str = Field(..., description="Transcribed text from audio")
-    model_name: str = Field(default="gemini-2.5-flash", description="Model used")
+    model_name: str = Field(default="whisper-small", description="Model used")
     language: Optional[str] = Field(default="en", description="Detected language")
     duration_seconds: Optional[float] = Field(None, description="Audio duration")
 
@@ -13,7 +13,7 @@ class STTResponse(BaseModel):
     json_schema_extra = {
         "example": {
             "text": "hello how are you",
-            "model_name": "gemini-2.5-flash",
+            "model_name": "whisper-small",
             "language": "en",
             "duration_seconds": 3.2
         }
@@ -35,16 +35,16 @@ class TTSRequest(BaseModel):
 class TTSResponse(BaseModel):
     """Response model for Text-to-Speech"""
     message: str = Field(..., description="Status message")
-    audio_format: str = Field(default="wav", description="Audio format")
-    model_name: str = Field(default="gemini-2.5-flash-preview-tts", description="Model used")
+    audio_format: str = Field(default="mp3", description="Audio format")
+    model_name: str = Field(default="gTTS", description="Model used")
     length_seconds: Optional[float] = Field(None, description="Generated audio duration")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "message": "Audio generated successfully",
-                "audio_format": "wav",
-                "model_name": "gemini-2.5-flash-preview-tts",
+                "audio_format": "mp3",
+                "model_name": "gTTS",
                 "length_seconds": 2.5
             }
         }
@@ -66,13 +66,13 @@ class ChatbotResponse(BaseModel):
     """Response model for Chatbot"""
     user_input: str = Field(..., description="User input text")
     bot_response: str = Field(..., description="Bot response text")
-    model_name: str = Field(default="gemini-2.5-flash", description="Model used")
+    model_name: str = Field(default="DialoGPT-medium", description="Model used")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "user_input": "Hello",
                 "bot_response": "Hi there! How can I help you?",
-                "model_name": "gemini-2.5-flash"
+                "model_name": "DialoGPT-medium"
             }
         }
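
For reference, the updated defaults can be checked directly (a sketch assuming Pydantic v2, which the json_schema_extra key suggests):

    from models.audio import STTResponse

    resp = STTResponse(text="hello how are you")
    print(resp.model_name)         # "whisper-small"
    print(resp.model_dump_json())  # payload shape returned by /audio/stt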
requirements.txt CHANGED
@@ -9,3 +9,5 @@ protobuf>=3.20.0
 huggingface_hub>=0.19.0
 python-multipart
 google-genai>=0.4.0
+gtts==2.5.1
+
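
Note that the new services also import torch, torchaudio, transformers, and numpy; those are presumably pinned earlier in requirements.txt. A one-off sanity check for the newly pinned gtts package (requires network access, since gTTS calls Google's endpoint):

    import io
    from gtts import gTTS

    buf = io.BytesIO()
    gTTS(text="hello", lang="en").write_to_fp(buf)
    print(len(buf.getvalue()), "bytes of MP3")  # non-zero on success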
routes/audio.py CHANGED
@@ -3,30 +3,38 @@ from fastapi.responses import StreamingResponse
 import io
 import logging
 from config import ALLOWED_AUDIO_TYPES, MAX_AUDIO_SIZE
-from services.stt_service import speech_to_text
+from services.stt_service import speech_to_text, load_stt_model
 from services.tts_service import generate_tts
-from services.chatbot_service import get_chatbot_response
+from services.chatbot_service import get_chatbot_response, load_chatbot_model
 from models.audio import STTResponse, TTSRequest, TTSResponse, ChatbotRequest, ChatbotResponse
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/audio", tags=["Audio"])
 
+# Pre-load models on router startup
+@router.on_event("startup")
+async def startup_event():
+    """Load models when the router starts"""
+    logger.info("Loading free STT and Chatbot models...")
+    load_stt_model()
+    load_chatbot_model()
+
 
 @router.post("/tts")
 async def tts(request: TTSRequest):
     """
-    Convert text to speech and return audio file.
+    Convert text to speech and return audio file using free gTTS.
 
     Example:
    - POST /audio/tts
    - Body: {"text": "Hello, welcome to our system"}
-    - Returns: WAV audio file
+    - Returns: MP3 audio file
     """
     try:
         logger.info(f"TTS request received for text: '{request.text}'")
         audio_bytes = await generate_tts(request.text)
-        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/wav")
+        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/mp3")
     except Exception as e:
         logger.error(f"TTS error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
@@ -35,12 +43,12 @@ async def tts(request: TTSRequest):
 @router.post("/stt", response_model=STTResponse)
 async def stt(file: UploadFile = File(...)):
     """
-    Convert audio file to text.
+    Convert audio file to text using free Whisper model.
 
     Example:
    - POST /audio/stt
    - File: audio.mp3 (or .wav, .m4a)
-    - Returns: {"text": "transcribed text", "model_name": "gemini-2.5-flash", ...}
+    - Returns: {"text": "transcribed text", "model_name": "whisper-small", ...}
     """
     # Validate file type
     if file.content_type not in ALLOWED_AUDIO_TYPES:
@@ -64,7 +72,7 @@ async def stt(file: UploadFile = File(...)):
 
         return STTResponse(
             text=text,
-            model_name="gemini-2.5-flash",
+            model_name="whisper-small",
             language="en",
             duration_seconds=None
         )
@@ -76,17 +84,12 @@ async def stt(file: UploadFile = File(...)):
 @router.post("/chatbot")
 async def chatbot_voice(file: UploadFile = File(...)):
     """
-    Full voice chatbot flow (Audio → Text → Response → Audio).
+    Full voice chatbot flow using free models (Audio → Text → Response → Audio).
 
     Example:
    - POST /audio/chatbot
    - File: user_voice.mp3
-    - Returns: Response audio file (WAV)
-
-    Process:
-    1. Converts user's audio to text (STT)
-    2. Generates chatbot response to user's text
-    3. Converts response back to audio (TTS)
+    - Returns: Response audio file (MP3)
     """
     # Validate file type
     if file.content_type not in ALLOWED_AUDIO_TYPES:
@@ -119,7 +122,7 @@ async def chatbot_voice(file: UploadFile = File(...)):
         audio_response = await generate_tts(response_text)
         logger.info("Step 3 - TTS: Complete")
 
-        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/wav")
+        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/mp3")
 
     except Exception as e:
         logger.error(f"Voice chatbot error: {str(e)}")
@@ -129,7 +132,7 @@ async def chatbot_voice(file: UploadFile = File(...)):
 @router.post("/chatbot-text", response_model=ChatbotResponse)
 async def chatbot_text(request: ChatbotRequest):
     """
-    Chatbot interaction with text input/output (no audio).
+    Chatbot interaction with text input/output using free DialoGPT model.
 
     Example:
    - POST /audio/chatbot-text
@@ -143,7 +146,7 @@ async def chatbot_text(request: ChatbotRequest):
         return ChatbotResponse(
             user_input=request.text,
             bot_response=response_text,
-            model_name="gemini-2.5-flash"
+            model_name="DialoGPT-medium"
         )
     except Exception as e:
         logger.error(f"Text chatbot error: {str(e)}")
services/chatbot_service.py CHANGED
@@ -1,46 +1,121 @@
-from services.gemini_client import get_gemini_client
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import logging
+import torch
 
 logger = logging.getLogger(__name__)
 
-async def get_chatbot_response(user_text: str) -> str:
+# Global chatbot components
+chatbot_pipeline = None
+chat_history = {}
+
+def load_chatbot_model():
+    """Load the free DialoGPT model for chatbot"""
+    global chatbot_pipeline
+    try:
+        logger.info("Loading DialoGPT chatbot model...")
+
+        # Use DialoGPT medium for better responses
+        chatbot_pipeline = pipeline(
+            "text-generation",
+            model="microsoft/DialoGPT-medium",
+            tokenizer="microsoft/DialoGPT-medium",
+            device="cpu"
+        )
+        logger.info("✓ DialoGPT chatbot model loaded successfully")
+    except Exception as e:
+        logger.error(f"✗ Failed to load DialoGPT model: {str(e)}")
+        chatbot_pipeline = None
+
+async def get_chatbot_response(user_text: str, user_id: str = "default") -> str:
     """
-    Generate chatbot response using Gemini API.
+    Generate chatbot response using free DialoGPT model.
 
     Args:
         user_text: User input text
+        user_id: Unique user ID for maintaining conversation history
 
     Returns:
         Chatbot response text
-
-    Raises:
-        Exception: If response generation fails
     """
+    global chatbot_pipeline
+
     try:
-        client = get_gemini_client()
+        if chatbot_pipeline is None:
+            load_chatbot_model()
+            if chatbot_pipeline is None:
+                return get_fallback_response(user_text)
 
         logger.info(f"Generating chatbot response for: '{user_text}'")
 
-        # Create a system prompt for better responses
-        system_prompt = """You are a helpful, friendly AI assistant.
-        Respond concisely and naturally to user queries.
-        Keep responses brief (1-2 sentences) for voice interaction."""
+        # Get or initialize chat history for this user
+        if user_id not in chat_history:
+            chat_history[user_id] = []
 
-        # Combine system prompt with user input
-        full_prompt = f"{system_prompt}\n\nUser: {user_text}"
+        # Prepare conversation context
+        conversation = chat_history[user_id] + [user_text]
+        context = " ".join(conversation[-3:])  # Use last 3 exchanges as context
 
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",  # Using a model that definitely exists
-            contents=[full_prompt]
+        # Generate response
+        response = chatbot_pipeline(
+            context,
+            max_length=150,
+            num_return_sequences=1,
+            pad_token_id=chatbot_pipeline.tokenizer.eos_token_id,
+            no_repeat_ngram_size=3,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.7
         )
 
-        response_text = response.text
-        logger.info(f"✓ Response generated: '{response_text}'")
+        bot_response = response[0]['generated_text'].strip()
+
+        # Extract only the new response (remove the input context)
+        if context in bot_response:
+            bot_response = bot_response.replace(context, "").strip()
+
+        # Clean up the response
+        bot_response = clean_response(bot_response)
+
+        # Update chat history
+        chat_history[user_id].extend([user_text, bot_response])
 
-        return response_text
+        # Keep only recent history (last 4 exchanges)
+        if len(chat_history[user_id]) > 8:
+            chat_history[user_id] = chat_history[user_id][-8:]
+
+        logger.info(f"✓ Response generated: '{bot_response}'")
+        return bot_response
 
     except Exception as e:
         logger.error(f"✗ Chatbot response failed: {str(e)}")
-        # Fallback response
-        return f"I understood you said: '{user_text}'. Could you tell me more?"
+        return get_fallback_response(user_text)
+
+
+def clean_response(response: str) -> str:
+    """Clean and format the chatbot response"""
+    # Remove extra spaces
+    response = ' '.join(response.split())
+
+    # Ensure proper sentence ending
+    if response and not response.endswith(('.', '!', '?')):
+        response += '.'
+
+    # Limit response length
+    if len(response) > 200:
+        response = response[:197] + '...'
+
+    return response
+
+
+def get_fallback_response(user_text: str) -> str:
+    """Provide fallback responses when model fails"""
+    fallback_responses = [
+        f"I understand you said: '{user_text}'. Could you tell me more about that?",
+        f"That's interesting! You mentioned: '{user_text}'. What would you like to know?",
+        f"Thanks for sharing! Regarding '{user_text}', how can I help you?",
+        f"I heard you say: '{user_text}'. Could you elaborate on that?"
+    ]
+
+    import random
+    return random.choice(fallback_responses)
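
One caveat: the pipeline call above joins turns with spaces, while DialoGPT was trained with turns separated by the tokenizer's EOS token. A sketch of that convention (an alternative to the pipeline call, not part of this commit) tends to produce cleaner replies:

    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

    def reply(history: list[str], user_text: str) -> str:
        """Encode each turn followed by EOS, then decode only the new tokens."""
        prompt = tokenizer.eos_token.join(history + [user_text]) + tokenizer.eos_token
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output_ids = model.generate(
            input_ids,
            max_new_tokens=60,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
        return tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)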
services/gemini_client.py CHANGED
@@ -1,18 +1,9 @@
-from google.genai import Client
-from config import GOOGLE_GENAI_API_KEY
+# This file is no longer needed since we're using free models
 import logging
 
 logger = logging.getLogger(__name__)
 
-
 def get_gemini_client():
-    """
-    Initialize and return Gemini client.
-    """
-    try:
-        client = Client(api_key=GOOGLE_GENAI_API_KEY)
-        logger.info("✓ Gemini client initialized successfully")
-        return client
-    except Exception as e:
-        logger.error(f"✗ Failed to initialize Gemini client: {str(e)}")
-        raise ValueError(f"Gemini client initialization failed: {str(e)}")
+    """Gemini client is no longer used"""
+    logger.warning("Gemini client is deprecated - using free models instead")
+    raise Exception("Gemini API is no longer used. Free models are being used instead.")
services/stt_service.py CHANGED
@@ -1,66 +1,72 @@
-from services.gemini_client import get_gemini_client
-import base64
-import mimetypes
+import torch
+import torchaudio
+from transformers import pipeline
 import logging
+import tempfile
+import os
 
 logger = logging.getLogger(__name__)
 
+# Global STT pipeline
+stt_pipeline = None
+
+def load_stt_model():
+    """Load the free Whisper model for speech-to-text"""
+    global stt_pipeline
+    try:
+        logger.info("Loading Whisper STT model...")
+        stt_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-small",  # Free model
+            device="cpu"  # Use CPU to avoid GPU requirements
+        )
+        logger.info("✓ Whisper STT model loaded successfully")
+    except Exception as e:
+        logger.error(f"✗ Failed to load Whisper model: {str(e)}")
+        stt_pipeline = None
 
 async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
     """
-    Convert audio bytes to text using Gemini API.
+    Convert audio bytes to text using free Whisper model.
 
     Args:
         audio_bytes: Raw audio file bytes
-        filename: Name of the audio file (used to detect format)
+        filename: Name of the audio file
 
     Returns:
         Transcribed text
-
-    Raises:
-        Exception: If transcription fails
     """
+    global stt_pipeline
+
     try:
-        client = get_gemini_client()
-
-        # Detect MIME type from filename
-        mime_type, _ = mimetypes.guess_type(filename)
-        if mime_type is None:
-            mime_type = "audio/wav"  # fallback
-
-        logger.info(f"Converting audio to text (format: {mime_type})")
-
-        # Convert audio to base64
-        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
-
-        # Create proper content structure for Gemini
-        contents = [
-            {
-                "parts": [
-                    {
-                        "inline_data": {
-                            "mime_type": mime_type,
-                            "data": audio_b64
-                        }
-                    },
-                    {
-                        "text": "Transcribe this audio to text."
-                    }
-                ]
-            }
-        ]
-
-        # Call Gemini API
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",  # Using a model that supports multimodal
-            contents=contents
-        )
+        if stt_pipeline is None:
+            load_stt_model()
+            if stt_pipeline is None:
+                raise Exception("STT model failed to load")
 
-        transcribed_text = response.text.strip()
-        logger.info(f"✓ STT successful: '{transcribed_text}'")
+        logger.info(f"Converting audio to text using Whisper")
 
-        return transcribed_text
+        # Save audio bytes to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
+            temp_audio.write(audio_bytes)
+            temp_audio_path = temp_audio.name
 
+        try:
+            # Transcribe using Whisper
+            result = stt_pipeline(temp_audio_path)
+            transcribed_text = result.get("text", "").strip()
+
+            if not transcribed_text:
+                transcribed_text = "Sorry, I couldn't understand the audio."
+
+            logger.info(f"✓ STT successful: '{transcribed_text}'")
+            return transcribed_text
+
+        finally:
+            # Clean up temporary file
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
+
     except Exception as e:
         logger.error(f"✗ STT failed: {str(e)}")
         raise Exception(f"Speech-to-text conversion failed: {str(e)}")
services/tts_service.py CHANGED
@@ -1,63 +1,111 @@
-from services.gemini_client import get_gemini_client
-from google.genai import types
-import base64
 import logging
+import io
+import wave
+import numpy as np
 
 logger = logging.getLogger(__name__)
 
+# Try to import gTTS, but provide fallback if not available
+try:
+    from gtts import gTTS
+    GTTS_AVAILABLE = True
+except ImportError:
+    GTTS_AVAILABLE = False
+    logger.warning("gTTS not available. Using fallback audio generation.")
+
 
 async def generate_tts(text: str) -> bytes:
     """
-    Convert text to speech using Gemini API.
+    Convert text to speech using free gTTS (Google Text-to-Speech).
 
     Args:
         text: Text to convert to speech
 
     Returns:
-        Audio bytes in WAV format
+        Audio bytes in MP3 format
 
     Raises:
         Exception: If TTS generation fails
     """
     try:
-        client = get_gemini_client()
-
         logger.info(f"Generating speech for: '{text}'")
 
-        # For TTS, we need to use the specific TTS endpoint
-        # Note: This might require different API calls based on Gemini's actual TTS API
-
-        # Temporary fallback: Use regular model with text-to-speech request
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",
-            contents=[f"Convert this to speech: {text}"],
-            config=types.GenerateContentConfig(
-                response_mime_type="audio/wav",
-            ),
-        )
-
-        # Extract audio data from response
-        # This part depends on the actual Gemini TTS API response structure
-        if (response.candidates and
-            len(response.candidates) > 0 and
-            response.candidates[0].content and
-            response.candidates[0].content.parts and
-            len(response.candidates[0].content.parts) > 0):
-
-            part = response.candidates[0].content.parts[0]
-            if hasattr(part, 'inline_data') and part.inline_data:
-                audio_bytes = base64.b64decode(part.inline_data.data)
-            else:
-                # If no audio data, create a fallback audio or raise error
-                raise Exception("No audio data in response")
+        # Use gTTS if available
+        if GTTS_AVAILABLE:
+            tts = gTTS(text=text, lang='en', slow=False)
+            audio_buffer = io.BytesIO()
+            tts.write_to_fp(audio_buffer)
+            audio_bytes = audio_buffer.getvalue()
         else:
-            raise Exception("Invalid response format from TTS service")
+            # Fallback to simple tone generation
+            audio_bytes = generate_fallback_audio(text)
 
         logger.info(f"✓ TTS successful: {len(audio_bytes)} bytes generated")
-
         return audio_bytes
 
     except Exception as e:
         logger.error(f"✗ TTS failed: {str(e)}")
-        # Fallback: Return a simple error message as text
-        raise Exception(f"Text-to-speech generation failed: {str(e)}")
+        # Ultimate fallback
+        return generate_silent_audio()
+
+
+def generate_fallback_audio(text: str) -> bytes:
+    """
+    Generate a simple tone-based audio file as fallback.
+    """
+    try:
+        # Create a simple sine wave
+        sample_rate = 22050
+        duration = max(1.0, min(3.0, len(text) * 0.1))
+
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
+
+        # Generate tones that vary with the text length
+        base_freq = 440  # A4 note
+        # Add some variation based on text
+        freq_variation = min(200, len(text) * 5)
+        tone = 0.3 * np.sin(2 * np.pi * (base_freq + freq_variation) * t)
+
+        # Convert to 16-bit PCM
+        audio_data = (tone * 32767).astype(np.int16)
+
+        # Create WAV file in memory
+        buffer = io.BytesIO()
+        with wave.open(buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes = 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(audio_data.tobytes())
+
+        return buffer.getvalue()
+
+    except Exception as e:
+        logger.error(f"Fallback audio generation failed: {str(e)}")
+        return generate_silent_audio()
+
+
+def generate_silent_audio() -> bytes:
+    """
+    Generate a short silent audio file as ultimate fallback.
+    """
+    try:
+        sample_rate = 22050
+        duration = 1.0
+
+        # Generate silence
+        silent_data = np.zeros(int(sample_rate * duration), dtype=np.int16)
+
+        # Create WAV file in memory
+        buffer = io.BytesIO()
+        with wave.open(buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes = 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(silent_data.tobytes())
+
+        return buffer.getvalue()
+
+    except Exception as e:
+        logger.error(f"Silent audio generation failed: {str(e)}")
+        # Return empty bytes as last resort
+        return b""
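
A quick local check of the rewritten service. One caveat worth noting: the fallback paths emit WAV bytes while the gTTS path emits MP3, so the audio/mp3 media type set in routes/audio.py only matches the happy path:

    import asyncio
    from services.tts_service import generate_tts

    audio = asyncio.run(generate_tts("Hello, welcome to our system"))
    with open("out.mp3", "wb") as f:
        f.write(audio)
    print(f"wrote {len(audio)} bytes")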