malek-messaoudii commited on
Commit
47a3efb
·
1 Parent(s): 95cb26e

update stt

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -1
  2. services/stt_service.py +28 -34
requirements.txt CHANGED
@@ -10,4 +10,4 @@ huggingface_hub>=0.19.0
10
  python-multipart
11
  google-genai>=0.4.0
12
  gtts==2.5.1
13
-
 
10
  python-multipart
11
  google-genai>=0.4.0
12
  gtts==2.5.1
13
+ requests==2.31.0
services/stt_service.py CHANGED
@@ -1,33 +1,13 @@
1
- import torch
2
- import torchaudio
3
- from transformers import pipeline
4
  import logging
5
  import tempfile
6
  import os
7
 
8
  logger = logging.getLogger(__name__)
9
 
10
- # Global STT pipeline
11
- stt_pipeline = None
12
-
13
- def load_stt_model():
14
- """Load the free Whisper model for speech-to-text"""
15
- global stt_pipeline
16
- try:
17
- logger.info("Loading Whisper STT model...")
18
- stt_pipeline = pipeline(
19
- "automatic-speech-recognition",
20
- model="openai/whisper-small", # Free model
21
- device="cpu" # Use CPU to avoid GPU requirements
22
- )
23
- logger.info("✓ Whisper STT model loaded successfully")
24
- except Exception as e:
25
- logger.error(f"✗ Failed to load Whisper model: {str(e)}")
26
- stt_pipeline = None
27
-
28
  async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
29
  """
30
- Convert audio bytes to text using free Whisper model.
31
 
32
  Args:
33
  audio_bytes: Raw audio file bytes
@@ -36,15 +16,8 @@ async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
36
  Returns:
37
  Transcribed text
38
  """
39
- global stt_pipeline
40
-
41
  try:
42
- if stt_pipeline is None:
43
- load_stt_model()
44
- if stt_pipeline is None:
45
- raise Exception("STT model failed to load")
46
-
47
- logger.info(f"Converting audio to text using Whisper")
48
 
49
  # Save audio bytes to temporary file
50
  with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
@@ -52,10 +25,20 @@ async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
52
  temp_audio_path = temp_audio.name
53
 
54
  try:
55
- # Transcribe using Whisper
56
- result = stt_pipeline(temp_audio_path)
57
- transcribed_text = result.get("text", "").strip()
58
 
 
 
 
 
 
 
 
 
 
 
59
  if not transcribed_text:
60
  transcribed_text = "Sorry, I couldn't understand the audio."
61
 
@@ -69,4 +52,15 @@ async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
69
 
70
  except Exception as e:
71
  logger.error(f"✗ STT failed: {str(e)}")
72
- raise Exception(f"Speech-to-text conversion failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
 
 
2
  import logging
3
  import tempfile
4
  import os
5
 
6
  logger = logging.getLogger(__name__)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
9
  """
10
+ Convert audio bytes to text using Hugging Face Inference API (free).
11
 
12
  Args:
13
  audio_bytes: Raw audio file bytes
 
16
  Returns:
17
  Transcribed text
18
  """
 
 
19
  try:
20
+ logger.info(f"Converting audio to text using Hugging Face API")
 
 
 
 
 
21
 
22
  # Save audio bytes to temporary file
23
  with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
 
25
  temp_audio_path = temp_audio.name
26
 
27
  try:
28
+ # Use Hugging Face Inference API (free)
29
+ API_URL = "https://api-inference.huggingface.co/models/openai/whisper-medium"
30
+ headers = {"Authorization": "Bearer YOUR_HF_TOKEN"} # Optional for free tier
31
 
32
+ with open(temp_audio_path, "rb") as f:
33
+ response = requests.post(API_URL, headers=headers, data=f)
34
+
35
+ if response.status_code == 200:
36
+ result = response.json()
37
+ transcribed_text = result.get("text", "").strip()
38
+ else:
39
+ # Fallback to local model if API fails
40
+ transcribed_text = await fallback_stt(audio_bytes, filename)
41
+
42
  if not transcribed_text:
43
  transcribed_text = "Sorry, I couldn't understand the audio."
44
 
 
52
 
53
  except Exception as e:
54
  logger.error(f"✗ STT failed: {str(e)}")
55
+ return "Sorry, there was an error processing your audio."
56
+
57
+
58
+ async def fallback_stt(audio_bytes: bytes, filename: str) -> str:
59
+ """Fallback STT using a simpler approach"""
60
+ try:
61
+ # Simple fallback - you could implement a basic speech recognition here
62
+ # For now, return a placeholder
63
+ return "Audio received but transcription service is temporarily unavailable."
64
+ except Exception as e:
65
+ logger.error(f"Fallback STT also failed: {str(e)}")
66
+ return "Audio processing failed."