File size: 2,886 Bytes
3b2b211
 
4a13628
95cb26e
 
3b2b211
4a13628
 
 
3b2b211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4b6133
 
3b2b211
d4b6133
3b2b211
 
4a13628
3b2b211
 
 
 
e8aa76b
3b2b211
 
 
a8c8142
3b2b211
4a13628
3b2b211
 
 
 
4a13628
3b2b211
 
 
a8c8142
95cb26e
 
e8aa76b
95cb26e
 
 
 
3b2b211
 
 
 
95cb26e
4a13628
 
3b2b211
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import torch
from transformers import pipeline
import logging
import tempfile
import os
import subprocess

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared speech-to-text pipeline object. Starts as None; load_stt_model()
# assigns the loaded pipeline here, or resets it to None if loading fails.
stt_pipeline = None

def load_stt_model():
    """Load the Whisper-medium speech-to-text pipeline into ``stt_pipeline``.

    A missing ffmpeg binary only produces a warning here; loading is still
    attempted. Any ordinary exception raised while building the pipeline is
    caught and logged with its traceback, and ``stt_pipeline`` is left as
    ``None``. Callers must therefore check the global after calling this
    function instead of relying on an exception.
    """
    global stt_pipeline
    try:
        # Warn early: the model can load without ffmpeg, but the caller is
        # told that transcription may not work.
        if not check_ffmpeg():
            logger.warning("ffmpeg not found. STT may not work properly.")

        logger.info("Loading Whisper-medium STT model...")
        stt_pipeline = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-medium",
            device="cpu",
        )
        logger.info("✓ Whisper-medium STT model loaded successfully")
    except Exception as e:
        # logger.exception keeps the traceback, which the plain error log lost.
        logger.exception("✗ Failed to load Whisper-medium model: %s", e)
        stt_pipeline = None

def check_ffmpeg():
    """Return True if ``ffmpeg -version`` runs successfully, otherwise False.

    The probe never raises for launch problems: a missing binary, a
    non-executable file named ``ffmpeg`` on PATH, any other OS-level launch
    error, a non-zero exit status, or a probe that hangs past the timeout
    all yield False.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            capture_output=True,
            check=True,
            timeout=10,  # a version probe should return almost immediately
        )
    except (subprocess.SubprocessError, OSError):
        # SubprocessError covers CalledProcessError and TimeoutExpired;
        # OSError covers FileNotFoundError, PermissionError and similar.
        return False
    return True

async def speech_to_text(audio_bytes: bytes, filename: str) -> str:
    """Transcribe raw audio bytes to text with the shared Whisper pipeline.

    The pipeline is loaded on first use via ``load_stt_model()``. The audio
    is written to a temporary ``.wav``-suffixed file, passed to the pipeline
    by path, and the file is removed afterwards.

    NOTE: although declared ``async``, this function awaits nothing; model
    loading, the ffmpeg probe and the transcription all run synchronously.

    Args:
        audio_bytes: Encoded audio content to transcribe.
        filename: Name of the uploaded file. Not referenced by this function.

    Returns:
        The stripped transcription; ``"No speech detected in the audio."``
        when the transcription is empty; or a human-readable error string
        when ffmpeg is missing or the failure message mentions ffmpeg.

    Raises:
        Exception: For any other failure (including the model failing to
            load), with the original error chained as ``__cause__``.
    """
    global stt_pipeline

    try:
        if stt_pipeline is None:
            load_stt_model()
            if stt_pipeline is None:
                raise Exception("STT model failed to load")

        # Check ffmpeg again before processing
        if not check_ffmpeg():
            return "Error: ffmpeg is required for audio processing but is not installed. Please install ffmpeg on the server."

        logger.info("Converting audio to text using Whisper-medium")

        temp_audio_path = None
        try:
            # The file is created inside the cleanup scope so that a failed
            # write cannot leave a stray delete=False temp file behind.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio:
                temp_audio_path = temp_audio.name
                temp_audio.write(audio_bytes)

            # Transcribe using Whisper
            result = stt_pipeline(temp_audio_path)
            transcribed_text = result.get("text", "").strip()

            if not transcribed_text:
                transcribed_text = "No speech detected in the audio."

            logger.info("✓ STT successful: '%s'", transcribed_text)
            return transcribed_text

        finally:
            # Clean up temporary file
            if temp_audio_path is not None and os.path.exists(temp_audio_path):
                os.unlink(temp_audio_path)

    except Exception as e:
        logger.exception("✗ STT failed: %s", e)
        if "ffmpeg" in str(e).lower():
            return "Audio processing failed: ffmpeg is required but not installed. Please install ffmpeg on the server."
        # Chain the cause so the original traceback is preserved for callers.
        raise Exception(f"Speech-to-text conversion failed: {e}") from e