File size: 2,886 Bytes
3b2b211 4a13628 95cb26e 3b2b211 4a13628 3b2b211 d4b6133 3b2b211 d4b6133 3b2b211 4a13628 3b2b211 e8aa76b 3b2b211 a8c8142 3b2b211 4a13628 3b2b211 4a13628 3b2b211 a8c8142 95cb26e e8aa76b 95cb26e 3b2b211 95cb26e 4a13628 3b2b211 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import torch
from transformers import pipeline
import logging
import tempfile
import os
import subprocess
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# Global STT pipeline. Starts as None; load_stt_model() assigns the loaded
# pipeline to it, or resets it to None when loading fails.
stt_pipeline = None
def load_stt_model():
    """Populate the global ``stt_pipeline`` with a Whisper-medium ASR pipeline.

    A missing ffmpeg only produces a warning; loading is still attempted.
    Any exception raised along the way is logged and leaves ``stt_pipeline``
    set to ``None`` rather than propagating to the caller.
    """
    global stt_pipeline

    try:
        # ffmpeg is only probed here; its absence does not abort the load.
        ffmpeg_available = check_ffmpeg()
        if not ffmpeg_available:
            logger.warning("ffmpeg not found. STT may not work properly.")

        logger.info("Loading Whisper-medium STT model...")
        pipeline_options = {
            "model": "openai/whisper-medium",
            "device": "cpu",
        }
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            **pipeline_options,
        )
        logger.info("β Whisper-medium STT model loaded successfully")
    except Exception as load_error:
        failure_reason = str(load_error)
        logger.error(f"β Failed to load Whisper-medium model: {failure_reason}")
        # Callers detect a failed load by checking for None.
        stt_pipeline = None
def check_ffmpeg():
    """Check if ffmpeg is available.

    Runs ``ffmpeg -version`` with its output captured and reports whether
    the command could be started and exited successfully.

    Returns:
        bool: ``True`` when the command ran and exited with status 0,
        ``False`` when it exited with a non-zero status or could not be
        launched at all.
    """
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (no ffmpeg on PATH) as well as
        # launch failures such as a file that exists but cannot be executed;
        # either way ffmpeg is unusable, so report False instead of raising.
        return False
    return True
async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """
    Convert audio bytes to text using free Whisper model.

    The audio is written to a temporary ``.wav``-suffixed file, passed to the
    module-level ``stt_pipeline`` (loaded on first use) and the temporary
    file is removed afterwards, whether or not transcription succeeded.

    Args:
        audio_bytes: Raw contents of the audio to transcribe.
        filename: Name associated with the audio. Currently unused by this
            function; the temporary file always gets a ``.wav`` suffix.

    Returns:
        The stripped transcription; ``"No speech detected in the audio."``
        when the model returns empty text; or a human-readable error message
        when ffmpeg is missing or an ffmpeg-related error occurs.

    Raises:
        Exception: If the audio is empty, the model cannot be loaded, or
            transcription fails for a reason whose message does not mention
            ffmpeg. The underlying error is attached as ``__cause__``.

    Note:
        The pipeline call is synchronous (it is not awaited), so this
        coroutine does not yield control while transcription is running.
    """
    global stt_pipeline
    try:
        # Reject empty payloads before loading the model or touching disk.
        if not audio_bytes:
            raise ValueError("No audio data provided")

        if stt_pipeline is None:
            load_stt_model()
        if stt_pipeline is None:
            raise Exception("STT model failed to load")

        # Check ffmpeg again before processing
        if not check_ffmpeg():
            return "Error: ffmpeg is required for audio processing but is not installed. Please install ffmpeg on the server."

        logger.info("Converting audio to text using Whisper-medium")

        temp_audio_path = None
        try:
            # Save audio bytes to temporary file. The path is recorded before
            # writing so the file is still cleaned up if the write fails.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
                temp_audio_path = temp_audio.name
                temp_audio.write(audio_bytes)

            # Transcribe using Whisper
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()
            if not transcribed_text:
                transcribed_text = "No speech detected in the audio."
            logger.info(f"β STT successful: '{transcribed_text}'")
            return transcribed_text
        finally:
            # Clean up temporary file
            if temp_audio_path is not None and os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)
    except Exception as e:
        logger.error(f"β STT failed: {str(e)}")
        # ffmpeg-related failures are reported as a message, not an exception.
        if "ffmpeg" in str(e).lower():
            return "Audio processing failed: ffmpeg is required but not installed. Please install ffmpeg on the server."
        raise Exception(f"Speech-to-text conversion failed: {str(e)}") from e