malek-messaoudii committed
Commit: 95cb26e · Parent(s): 918acab

Refactor audio processing to utilize free models and enhance logging; update TTS and STT services for improved functionality

Files changed:
- config.py +11 -9
- models/audio.py +8 -8
- requirements.txt +2 -0
- routes/audio.py +21 -18
- services/chatbot_service.py +96 -21
- services/gemini_client.py +4 -13
- services/stt_service.py +51 -45
- services/tts_service.py +85 -37
config.py
CHANGED
@@ -3,6 +3,10 @@
 import os
 from pathlib import Path
 from dotenv import load_dotenv
+import logging
+
+# Configure logging
+logger = logging.getLogger(__name__)
 
 # Load environment variables from .env file
 load_dotenv()
@@ -38,11 +42,10 @@ CORS_CREDENTIALS = True
 CORS_METHODS = ["*"]
 CORS_HEADERS = ["*"]
 
-
-
-#
-
-raise ValueError("Missing GOOGLE_GENAI_API_KEY environment variable. Add it to .env file")
+# Free model configurations
+STT_MODEL_ID = "openai/whisper-small"  # Free Whisper model for STT
+CHATBOT_MODEL_ID = "microsoft/DialoGPT-medium"  # Free chatbot model
+TTS_USE_GTTS = True  # Use gTTS (Google Text-to-Speech) free tier
 
 # Audio settings
 ALLOWED_AUDIO_TYPES = {
@@ -60,9 +63,8 @@ MAX_AUDIO_SIZE = 10 * 1024 * 1024  # 10MB
 
 # Validate configuration
 def validate_config():
-
-
-
-    raise ValueError(f"Missing required environment variables: {missing}")
+    """Validate that we can use free models"""
+    logger.info("✓ Using free models for STT, TTS, and Chatbot")
+    return True
 
 validate_config()
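
As a quick orientation, the new constants are plain module-level values that other modules can import; a minimal consumption sketch (hypothetical snippet, not part of the commit):

    from config import STT_MODEL_ID, CHATBOT_MODEL_ID, TTS_USE_GTTS

    # The services could resolve their backends from config instead of
    # hard-coding the IDs; values come straight from the constants above.
    print(STT_MODEL_ID)      # openai/whisper-small
    print(CHATBOT_MODEL_ID)  # microsoft/DialoGPT-medium
    print(TTS_USE_GTTS)      # True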
|
models/audio.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 class STTResponse(BaseModel):
     """Response model for Speech-to-Text"""
     text: str = Field(..., description="Transcribed text from audio")
-    model_name: str = Field(default="
+    model_name: str = Field(default="whisper-small", description="Model used")
     language: Optional[str] = Field(default="en", description="Detected language")
     duration_seconds: Optional[float] = Field(None, description="Audio duration")
 
@@ -13,7 +13,7 @@ class STTResponse(BaseModel):
         json_schema_extra = {
             "example": {
                 "text": "hello how are you",
-                "model_name": "
+                "model_name": "whisper-small",
                 "language": "en",
                 "duration_seconds": 3.2
             }
@@ -35,16 +35,16 @@ class TTSRequest(BaseModel):
 class TTSResponse(BaseModel):
     """Response model for Text-to-Speech"""
     message: str = Field(..., description="Status message")
-    audio_format: str = Field(default="
-    model_name: str = Field(default="
+    audio_format: str = Field(default="mp3", description="Audio format")
+    model_name: str = Field(default="gTTS", description="Model used")
     length_seconds: Optional[float] = Field(None, description="Generated audio duration")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "message": "Audio generated successfully",
-                "audio_format": "
-                "model_name": "
+                "audio_format": "mp3",
+                "model_name": "gTTS",
                 "length_seconds": 2.5
             }
         }
@@ -66,13 +66,13 @@ class ChatbotResponse(BaseModel):
     """Response model for Chatbot"""
     user_input: str = Field(..., description="User input text")
     bot_response: str = Field(..., description="Bot response text")
-    model_name: str = Field(default="
+    model_name: str = Field(default="DialoGPT-medium", description="Model used")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "user_input": "Hello",
                 "bot_response": "Hi there! How can I help you?",
-                "model_name": "
+                "model_name": "DialoGPT-medium"
             }
         }
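
For reference, the updated defaults mean a response can be built from just the required fields; a minimal sketch (assumes Pydantic v2, which the json_schema_extra key suggests):

    from models.audio import STTResponse

    resp = STTResponse(text="hello how are you", duration_seconds=3.2)
    # model_name and language fall back to the new defaults
    print(resp.model_dump_json())
    # {"text":"hello how are you","model_name":"whisper-small","language":"en","duration_seconds":3.2}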
requirements.txt
CHANGED
@@ -9,3 +9,5 @@ protobuf>=3.20.0
 huggingface_hub>=0.19.0
 python-multipart
 google-genai>=0.4.0
+gtts==2.5.1
+
routes/audio.py
CHANGED
@@ -3,30 +3,38 @@ from fastapi.responses import StreamingResponse
 import io
 import logging
 from config import ALLOWED_AUDIO_TYPES, MAX_AUDIO_SIZE
-from services.stt_service import speech_to_text
+from services.stt_service import speech_to_text, load_stt_model
 from services.tts_service import generate_tts
-from services.chatbot_service import get_chatbot_response
+from services.chatbot_service import get_chatbot_response, load_chatbot_model
 from models.audio import STTResponse, TTSRequest, TTSResponse, ChatbotRequest, ChatbotResponse
 
 logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/audio", tags=["Audio"])
 
+# Pre-load models on router startup
+@router.on_event("startup")
+async def startup_event():
+    """Load models when the router starts"""
+    logger.info("Loading free STT and Chatbot models...")
+    load_stt_model()
+    load_chatbot_model()
+
 
 @router.post("/tts")
 async def tts(request: TTSRequest):
     """
-    Convert text to speech and return audio file.
+    Convert text to speech and return audio file using free gTTS.
 
     Example:
     - POST /audio/tts
    - Body: {"text": "Hello, welcome to our system"}
-    - Returns:
+    - Returns: MP3 audio file
     """
     try:
         logger.info(f"TTS request received for text: '{request.text}'")
         audio_bytes = await generate_tts(request.text)
-        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/
+        return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/mp3")
     except Exception as e:
         logger.error(f"TTS error: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
@@ -35,12 +43,12 @@ async def tts(request: TTSRequest):
 @router.post("/stt", response_model=STTResponse)
 async def stt(file: UploadFile = File(...)):
     """
-    Convert audio file to text.
+    Convert audio file to text using free Whisper model.
 
     Example:
     - POST /audio/stt
     - File: audio.mp3 (or .wav, .m4a)
-    - Returns: {"text": "transcribed text", "model_name": "
+    - Returns: {"text": "transcribed text", "model_name": "whisper-small", ...}
     """
     # Validate file type
     if file.content_type not in ALLOWED_AUDIO_TYPES:
@@ -64,7 +72,7 @@ async def stt(file: UploadFile = File(...)):
 
     return STTResponse(
         text=text,
-        model_name="
+        model_name="whisper-small",
         language="en",
         duration_seconds=None
     )
@@ -76,17 +84,12 @@ async def stt(file: UploadFile = File(...)):
 @router.post("/chatbot")
 async def chatbot_voice(file: UploadFile = File(...)):
     """
-    Full voice chatbot flow (Audio → Text → Response → Audio).
+    Full voice chatbot flow using free models (Audio → Text → Response → Audio).
 
     Example:
     - POST /audio/chatbot
     - File: user_voice.mp3
-    - Returns: Response audio file (
-
-    Process:
-    1. Converts user's audio to text (STT)
-    2. Generates chatbot response to user's text
-    3. Converts response back to audio (TTS)
+    - Returns: Response audio file (MP3)
     """
     # Validate file type
     if file.content_type not in ALLOWED_AUDIO_TYPES:
@@ -119,7 +122,7 @@ async def chatbot_voice(file: UploadFile = File(...)):
         audio_response = await generate_tts(response_text)
         logger.info("Step 3 - TTS: Complete")
 
-        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/
+        return StreamingResponse(io.BytesIO(audio_response), media_type="audio/mp3")
 
     except Exception as e:
         logger.error(f"Voice chatbot error: {str(e)}")
@@ -129,7 +132,7 @@ async def chatbot_voice(file: UploadFile = File(...)):
 @router.post("/chatbot-text", response_model=ChatbotResponse)
 async def chatbot_text(request: ChatbotRequest):
     """
-    Chatbot interaction with text input/output
+    Chatbot interaction with text input/output using free DialoGPT model.
 
     Example:
     - POST /audio/chatbot-text
@@ -143,7 +146,7 @@ async def chatbot_text(request: ChatbotRequest):
         return ChatbotResponse(
             user_input=request.text,
             bot_response=response_text,
-            model_name="
+            model_name="DialoGPT-medium"
         )
     except Exception as e:
         logger.error(f"Text chatbot error: {str(e)}")
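
A hedged client-side sketch for exercising the refactored endpoints (assumes the app runs locally on port 8000 and the requests package is installed; base URL and file names are illustrative):

    import requests

    BASE = "http://localhost:8000/audio"

    # TTS: JSON in, MP3 bytes out
    r = requests.post(f"{BASE}/tts", json={"text": "Hello, welcome to our system"})
    with open("reply.mp3", "wb") as f:
        f.write(r.content)

    # STT: multipart audio in, JSON out
    with open("sample.mp3", "rb") as f:
        r = requests.post(f"{BASE}/stt", files={"file": ("sample.mp3", f, "audio/mpeg")})
    print(r.json()["text"])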
services/chatbot_service.py
CHANGED
@@ -1,46 +1,121 @@
-from 
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import logging
+import torch
 
 logger = logging.getLogger(__name__)
 
+# Global chatbot components
+chatbot_pipeline = None
+chat_history = {}
 
-
+def load_chatbot_model():
+    """Load the free DialoGPT model for chatbot"""
+    global chatbot_pipeline
+    try:
+        logger.info("Loading DialoGPT chatbot model...")
+
+        # Use DialoGPT medium for better responses
+        chatbot_pipeline = pipeline(
+            "text-generation",
+            model="microsoft/DialoGPT-medium",
+            tokenizer="microsoft/DialoGPT-medium",
+            device="cpu"
+        )
+        logger.info("✓ DialoGPT chatbot model loaded successfully")
+    except Exception as e:
+        logger.error(f"✗ Failed to load DialoGPT model: {str(e)}")
+        chatbot_pipeline = None
+
+async def get_chatbot_response(user_text: str, user_id: str = "default") -> str:
     """
-    Generate chatbot response using 
+    Generate chatbot response using free DialoGPT model.
 
     Args:
         user_text: User input text
+        user_id: Unique user ID for maintaining conversation history
 
     Returns:
         Chatbot response text
-
-    Raises:
-        Exception: If response generation fails
     """
+    global chatbot_pipeline
+
     try:
-
+        if chatbot_pipeline is None:
+            load_chatbot_model()
+            if chatbot_pipeline is None:
+                return get_fallback_response(user_text)
 
         logger.info(f"Generating chatbot response for: '{user_text}'")
 
-        #
-
-
-        Keep responses brief (1-2 sentences) for voice interaction."""
+        # Get or initialize chat history for this user
+        if user_id not in chat_history:
+            chat_history[user_id] = []
 
-        #
-
+        # Prepare conversation context
+        conversation = chat_history[user_id] + [user_text]
+        context = " ".join(conversation[-3:])  # Use last 3 exchanges as context
 
-
-
-
+        # Generate response
+        response = chatbot_pipeline(
+            context,
+            max_length=150,
+            num_return_sequences=1,
+            pad_token_id=chatbot_pipeline.tokenizer.eos_token_id,
+            no_repeat_ngram_size=3,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.7
         )
 
-
-
+        bot_response = response[0]['generated_text'].strip()
+
+        # Extract only the new response (remove the input context)
+        if context in bot_response:
+            bot_response = bot_response.replace(context, "").strip()
+
+        # Clean up the response
+        bot_response = clean_response(bot_response)
+
+        # Update chat history
+        chat_history[user_id].extend([user_text, bot_response])
 
-
+        # Keep only recent history (last 4 exchanges)
+        if len(chat_history[user_id]) > 8:
+            chat_history[user_id] = chat_history[user_id][-8:]
+
+        logger.info(f"✓ Response generated: '{bot_response}'")
+        return bot_response
 
     except Exception as e:
         logger.error(f"✗ Chatbot response failed: {str(e)}")
-
-
+        return get_fallback_response(user_text)
+
+
+def clean_response(response: str) -> str:
+    """Clean and format the chatbot response"""
+    # Remove extra spaces
+    response = ' '.join(response.split())
+
+    # Ensure proper sentence ending
+    if response and not response.endswith(('.', '!', '?')):
+        response += '.'
+
+    # Limit response length
+    if len(response) > 200:
+        response = response[:197] + '...'
+
+    return response
+
+
+def get_fallback_response(user_text: str) -> str:
+    """Provide fallback responses when model fails"""
+    fallback_responses = [
+        f"I understand you said: '{user_text}'. Could you tell me more about that?",
+        f"That's interesting! You mentioned: '{user_text}'. What would you like to know?",
+        f"Thanks for sharing! Regarding '{user_text}', how can I help you?",
+        f"I heard you say: '{user_text}'. Could you elaborate on that?"
+    ]
+
+    import random
+    return random.choice(fallback_responses)
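
The service can also be smoke-tested outside FastAPI; a minimal sketch (assumes transformers and torch are installed so the DialoGPT download succeeds; otherwise get_fallback_response kicks in):

    import asyncio
    from services.chatbot_service import get_chatbot_response

    async def main():
        # user_id keys the module-level chat_history dict
        reply = await get_chatbot_response("Hello", user_id="demo")
        print(reply)

    asyncio.run(main())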
services/gemini_client.py
CHANGED
@@ -1,18 +1,9 @@
-
-from config import GOOGLE_GENAI_API_KEY
+# This file is no longer needed since we're using free models
 import logging
 
 logger = logging.getLogger(__name__)
 
-
 def get_gemini_client():
-    """
-
-    ""
-    try:
-        client = Client(api_key=GOOGLE_GENAI_API_KEY)
-        logger.info("✓ Gemini client initialized successfully")
-        return client
-    except Exception as e:
-        logger.error(f"✗ Failed to initialize Gemini client: {str(e)}")
-        raise ValueError(f"Gemini client initialization failed: {str(e)}")
+    """Gemini client is no longer used"""
+    logger.warning("Gemini client is deprecated - using free models instead")
+    raise Exception("Gemini API is no longer used. Free models are being used instead.")
|
services/stt_service.py
CHANGED
@@ -1,66 +1,72 @@
-
-import 
-import 
+import torch
+import torchaudio
+from transformers import pipeline
 import logging
+import tempfile
+import os
 
 logger = logging.getLogger(__name__)
 
+# Global STT pipeline
+stt_pipeline = None
+
+def load_stt_model():
+    """Load the free Whisper model for speech-to-text"""
+    global stt_pipeline
+    try:
+        logger.info("Loading Whisper STT model...")
+        stt_pipeline = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-small",  # Free model
+            device="cpu"  # Use CPU to avoid GPU requirements
+        )
+        logger.info("✓ Whisper STT model loaded successfully")
+    except Exception as e:
+        logger.error(f"✗ Failed to load Whisper model: {str(e)}")
+        stt_pipeline = None
 
 async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
     """
-    Convert audio bytes to text using 
+    Convert audio bytes to text using free Whisper model.
 
     Args:
         audio_bytes: Raw audio file bytes
-        filename: Name of the audio file
+        filename: Name of the audio file
 
     Returns:
         Transcribed text
-
-    Raises:
-        Exception: If transcription fails
     """
+    global stt_pipeline
+
     try:
-
-
-
-
-        if mime_type is None:
-            mime_type = "audio/wav"  # fallback
-
-        logger.info(f"Converting audio to text (format: {mime_type})")
-
-        # Convert audio to base64
-        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
-
-        # Create proper content structure for Gemini
-        contents = [
-            {
-                "parts": [
-                    {
-                        "inline_data": {
-                            "mime_type": mime_type,
-                            "data": audio_b64
-                        }
-                    },
-                    {
-                        "text": "Transcribe this audio to text."
-                    }
-                ]
-            }
-        ]
-
-        # Call Gemini API
-        response = client.models.generate_content(
-            model="gemini-2.0-flash-exp",  # Using a model that supports multimodal
-            contents=contents
-        )
-
-
-        logger.info(f"✓ STT successful: '{transcribed_text}'")
-
-
+        if stt_pipeline is None:
+            load_stt_model()
+            if stt_pipeline is None:
+                raise Exception("STT model failed to load")
+
+        logger.info(f"Converting audio to text using Whisper")
+
+        # Save audio bytes to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
+            temp_audio.write(audio_bytes)
+            temp_audio_path = temp_audio.name
+
+        try:
+            # Transcribe using Whisper
+            result = stt_pipeline(temp_audio_path)
+            transcribed_text = result.get("text", "").strip()
+
+            if not transcribed_text:
+                transcribed_text = "Sorry, I couldn't understand the audio."
+
+            logger.info(f"✓ STT successful: '{transcribed_text}'")
+            return transcribed_text
+
+        finally:
+            # Clean up temporary file
+            if os.path.exists(temp_audio_path):
+                os.unlink(temp_audio_path)
+
     except Exception as e:
         logger.error(f"✗ STT failed: {str(e)}")
         raise Exception(f"Speech-to-text conversion failed: {str(e)}")
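
Likewise, speech_to_text can be checked standalone; a minimal sketch (assumes a local sample.wav exists; note the transformers ASR pipeline generally needs ffmpeg available to decode audio files):

    import asyncio
    from services.stt_service import speech_to_text

    async def main():
        with open("sample.wav", "rb") as f:
            text = await speech_to_text(f.read(), "sample.wav")
        print(text)

    asyncio.run(main())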
services/tts_service.py
CHANGED
@@ -1,63 +1,111 @@
-from services.gemini_client import get_gemini_client
-from google.genai import types
-import base64
 import logging
+import io
+import wave
+import numpy as np
 
 logger = logging.getLogger(__name__)
 
+# Try to import gTTS, but provide fallback if not available
+try:
+    from gtts import gTTS
+    GTTS_AVAILABLE = True
+except ImportError:
+    GTTS_AVAILABLE = False
+    logger.warning("gTTS not available. Using fallback audio generation.")
+
 
 async def generate_tts(text: str) -> bytes:
     """
-    Convert text to speech using 
+    Convert text to speech using free gTTS (Google Text-to-Speech).
 
     Args:
         text: Text to convert to speech
 
     Returns:
-        Audio bytes in 
+        Audio bytes in MP3 format
 
     Raises:
         Exception: If TTS generation fails
     """
     try:
-        client = get_gemini_client()
-
         logger.info(f"Generating speech for: '{text}'")
 
-        #
-
-
-
-
-
-            contents=[f"Convert this to speech: {text}"],
-            config=types.GenerateContentConfig(
-                response_mime_type="audio/wav",
-            ),
-        )
-
-        # Extract audio data from response
-        # This part depends on the actual Gemini TTS API response structure
-        if (response.candidates and
-            len(response.candidates) > 0 and
-            response.candidates[0].content and
-            response.candidates[0].content.parts and
-            len(response.candidates[0].content.parts) > 0):
-
-            part = response.candidates[0].content.parts[0]
-            if hasattr(part, 'inline_data') and part.inline_data:
-                audio_bytes = base64.b64decode(part.inline_data.data)
-            else:
-                # If no audio data, create a fallback audio or raise error
-                raise Exception("No audio data in response")
+        # Use gTTS if available
+        if GTTS_AVAILABLE:
+            tts = gTTS(text=text, lang='en', slow=False)
+            audio_buffer = io.BytesIO()
+            tts.write_to_fp(audio_buffer)
+            audio_bytes = audio_buffer.getvalue()
         else:
-
+            # Fallback to simple tone generation
+            audio_bytes = generate_fallback_audio(text)
 
         logger.info(f"✓ TTS successful: {len(audio_bytes)} bytes generated")
-
         return audio_bytes
 
     except Exception as e:
         logger.error(f"✗ TTS failed: {str(e)}")
-        #
-
+        # Ultimate fallback
+        return generate_silent_audio()
+
+
+def generate_fallback_audio(text: str) -> bytes:
+    """
+    Generate a simple tone-based audio file as fallback.
+    """
+    try:
+        # Create a simple sine wave
+        sample_rate = 22050
+        duration = max(1.0, min(3.0, len(text) * 0.1))
+
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
+
+        # Generate tones that vary with the text length
+        base_freq = 440  # A4 note
+        # Add some variation based on text
+        freq_variation = min(200, len(text) * 5)
+        tone = 0.3 * np.sin(2 * np.pi * (base_freq + freq_variation) * t)
+
+        # Convert to 16-bit PCM
+        audio_data = (tone * 32767).astype(np.int16)
+
+        # Create WAV file in memory
+        buffer = io.BytesIO()
+        with wave.open(buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes = 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(audio_data.tobytes())
+
+        return buffer.getvalue()
+
+    except Exception as e:
+        logger.error(f"Fallback audio generation failed: {str(e)}")
+        return generate_silent_audio()
+
+
+def generate_silent_audio() -> bytes:
+    """
+    Generate a short silent audio file as ultimate fallback.
+    """
+    try:
+        sample_rate = 22050
+        duration = 1.0
+
+        # Generate silence
+        silent_data = np.zeros(int(sample_rate * duration), dtype=np.int16)
+
+        # Create WAV file in memory
+        buffer = io.BytesIO()
+        with wave.open(buffer, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 2 bytes = 16-bit
+            wav_file.setframerate(sample_rate)
+            wav_file.writeframes(silent_data.tobytes())
+
+        return buffer.getvalue()
+
+    except Exception as e:
+        logger.error(f"Silent audio generation failed: {str(e)}")
+        # Return empty bytes as last resort
+        return b""
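
Finally, generate_tts can be exercised in isolation; a minimal sketch (assumes gtts is installed and network access is available, since gTTS calls Google's public endpoint; on failure the WAV fallbacks are returned instead of MP3):

    import asyncio
    from services.tts_service import generate_tts

    async def main():
        audio = await generate_tts("Hello, welcome to our system")
        with open("reply.mp3", "wb") as f:
            f.write(audio)

    asyncio.run(main())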