malek-messaoudii
committed on
Commit
·
47a3efb
1
Parent(s):
95cb26e
update stt
Browse files
- requirements.txt +1 -1
- services/stt_service.py +28 -34
requirements.txt
CHANGED
|
@@ -10,4 +10,4 @@ huggingface_hub>=0.19.0
|
|
| 10 |
python-multipart
|
| 11 |
google-genai>=0.4.0
|
| 12 |
gtts==2.5.1
|
| 13 |
-
|
|
|
|
| 10 |
python-multipart
|
| 11 |
google-genai>=0.4.0
|
| 12 |
gtts==2.5.1
|
| 13 |
+
requests==2.31.0
|
services/stt_service.py
CHANGED
|
@@ -1,33 +1,13 @@
|
|
| 1 |
-
import
|
| 2 |
-
import torchaudio
|
| 3 |
-
from transformers import pipeline
|
| 4 |
import logging
|
| 5 |
import tempfile
|
| 6 |
import os
|
| 7 |
|
| 8 |
logger = logging.getLogger(__name__)
|
| 9 |
|
| 10 |
-
# Global STT pipeline
|
| 11 |
-
stt_pipeline = None
|
| 12 |
-
|
| 13 |
-
def load_stt_model():
|
| 14 |
-
"""Load the free Whisper model for speech-to-text"""
|
| 15 |
-
global stt_pipeline
|
| 16 |
-
try:
|
| 17 |
-
logger.info("Loading Whisper STT model...")
|
| 18 |
-
stt_pipeline = pipeline(
|
| 19 |
-
"automatic-speech-recognition",
|
| 20 |
-
model="openai/whisper-small", # Free model
|
| 21 |
-
device="cpu" # Use CPU to avoid GPU requirements
|
| 22 |
-
)
|
| 23 |
-
logger.info("✓ Whisper STT model loaded successfully")
|
| 24 |
-
except Exception as e:
|
| 25 |
-
logger.error(f"✗ Failed to load Whisper model: {str(e)}")
|
| 26 |
-
stt_pipeline = None
|
| 27 |
-
|
| 28 |
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
|
| 29 |
"""
|
| 30 |
-
Convert audio bytes to text using
|
| 31 |
|
| 32 |
Args:
|
| 33 |
audio_bytes: Raw audio file bytes
|
|
@@ -36,15 +16,8 @@ async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
|
|
| 36 |
Returns:
|
| 37 |
Transcribed text
|
| 38 |
"""
|
| 39 |
-
global stt_pipeline
|
| 40 |
-
|
| 41 |
try:
|
| 42 |
-
|
| 43 |
-
load_stt_model()
|
| 44 |
-
if stt_pipeline is None:
|
| 45 |
-
raise Exception("STT model failed to load")
|
| 46 |
-
|
| 47 |
-
logger.info(f"Converting audio to text using Whisper")
|
| 48 |
|
| 49 |
# Save audio bytes to temporary file
|
| 50 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
|
|
@@ -52,10 +25,20 @@ async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
|
|
| 52 |
temp_audio_path = temp_audio.name
|
| 53 |
|
| 54 |
try:
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
if not transcribed_text:
|
| 60 |
transcribed_text = "Sorry, I couldn't understand the audio."
|
| 61 |
|
|
@@ -69,4 +52,15 @@ async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
|
|
| 69 |
|
| 70 |
except Exception as e:
|
| 71 |
logger.error(f"✗ STT failed: {str(e)}")
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
|
|
|
|
|
|
| 2 |
import logging
|
| 3 |
import tempfile
|
| 4 |
import os
|
| 5 |
|
| 6 |
logger = logging.getLogger(__name__)
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
|
| 9 |
"""
|
| 10 |
+
Convert audio bytes to text using Hugging Face Inference API (free).
|
| 11 |
|
| 12 |
Args:
|
| 13 |
audio_bytes: Raw audio file bytes
|
|
|
|
| 16 |
Returns:
|
| 17 |
Transcribed text
|
| 18 |
"""
|
|
|
|
|
|
|
| 19 |
try:
|
| 20 |
+
logger.info(f"Converting audio to text using Hugging Face API")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# Save audio bytes to temporary file
|
| 23 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
|
|
|
|
| 25 |
temp_audio_path = temp_audio.name
|
| 26 |
|
| 27 |
try:
|
| 28 |
+
# Use Hugging Face Inference API (free)
|
| 29 |
+
API_URL = "https://api-inference.huggingface.co/models/openai/whisper-medium"
|
| 30 |
+
headers = {"Authorization": "Bearer YOUR_HF_TOKEN"} # Optional for free tier
|
| 31 |
|
| 32 |
+
with open(temp_audio_path, "rb") as f:
|
| 33 |
+
response = requests.post(API_URL, headers=headers, data=f)
|
| 34 |
+
|
| 35 |
+
if response.status_code == 200:
|
| 36 |
+
result = response.json()
|
| 37 |
+
transcribed_text = result.get("text", "").strip()
|
| 38 |
+
else:
|
| 39 |
+
# Fallback to local model if API fails
|
| 40 |
+
transcribed_text = await fallback_stt(audio_bytes, filename)
|
| 41 |
+
|
| 42 |
if not transcribed_text:
|
| 43 |
transcribed_text = "Sorry, I couldn't understand the audio."
|
| 44 |
|
|
|
|
| 52 |
|
| 53 |
except Exception as e:
|
| 54 |
logger.error(f"✗ STT failed: {str(e)}")
|
| 55 |
+
return "Sorry, there was an error processing your audio."
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
async def fallback_stt(audio_bytes: bytes, filename: str) -> str:
    """Return a placeholder message when the primary transcription path fails.

    No speech recognition is performed here yet. Both arguments are accepted
    so the signature matches ``speech_to_text``, but they are currently
    ignored.

    Args:
        audio_bytes: Raw audio file bytes (unused).
        filename: Name associated with the audio (unused).

    Returns:
        A fixed, user-facing notice that transcription is unavailable.
    """
    # Returning a constant cannot raise, so no exception handling is needed
    # until a real fallback recognizer is implemented here.
    return "Audio received but transcription service is temporarily unavailable."
|