Spaces:

datbkpro
/

voicebot

Running

App Files Files Community

datbkpro committed on Nov 13, 2025

Commit

d102321

verified ·

1 Parent(s): 0304fae

Update services/streaming_voice_service.py

Browse files

Files changed (1) hide show

services/streaming_voice_service.py +118 -33

services/streaming_voice_service.py CHANGED Viewed

@@ -13,17 +13,21 @@ import zipfile
 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
         self.model = None
         self.recognizer = None
         self.sample_rate = 16000
         self.is_streaming = False
-        # Buffer để tích luỹ audio
         self.audio_buffer = []
         if model_path is None:
             model_path = self._download_vosk_model()
@@ -31,8 +35,6 @@ class VoskStreamingASR:
             print(f"🔄 Đang tải VOSK model từ: {model_path}")
             try:
                 self.model = Model(model_path)
-                self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
-                self.recognizer.SetWords(True)
                 print("✅ Đã tải VOSK model thành công")
             except Exception as e:
                 print(f"❌ Lỗi khởi tạo VOSK model: {e}")
@@ -56,7 +58,6 @@ class VoskStreamingASR:
                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                     zip_ref.extractall("models/")
-                # Đảm bảo thư mục tồn tại
                 if os.path.exists("models/vosk-model-small-vn-0.4"):
                     os.rename("models/vosk-model-small-vn-0.4", model_dir)
@@ -73,6 +74,7 @@ class VoskStreamingASR:
     def start_stream(self):
         """Bắt đầu stream mới"""
         if self.model is None:
             return False
         try:
@@ -87,10 +89,12 @@ class VoskStreamingASR:
             return False
     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
-        """Xử lý audio chunk - SIMPLE & EFFECTIVE"""
         if self.recognizer is None or not self.is_streaming:
             return {"text": "", "partial": "", "is_final": False}
         try:
             # Resample nếu cần
             if sample_rate and sample_rate != self.sample_rate:
@@ -106,17 +110,21 @@ class VoskStreamingASR:
             # THÊM VÀO BUFFER - QUAN TRỌNG
             self.audio_buffer.extend(audio_chunk)
             # Chỉ xử lý khi có đủ audio (ít nhất 1 giây)
             if len(self.audio_buffer) < 16000:
                 return {"text": "", "partial": "Đang nghe...", "is_final": False}
-            # Lấy audio từ buffer để xử lý (2 giây gần nhất)
-            process_audio = np.array(self.audio_buffer[-32000:], dtype=np.int16)
             # Chuyển sang bytes
             audio_bytes = process_audio.tobytes()
-            # Xử lý với VOSK
             if self.recognizer.AcceptWaveform(audio_bytes):
                 result_json = self.recognizer.Result()
                 result = json.loads(result_json)
@@ -125,21 +133,26 @@ class VoskStreamingASR:
                     print(f"✅ VOSK Final: '{text}'")
                     # Reset buffer sau khi có kết quả
                     self.audio_buffer = []
-                    return {"text": text, "partial": "", "is_final": True}
-            # Kiểm tra partial result
             partial_json = self.recognizer.PartialResult()
             partial_result = json.loads(partial_json)
             partial_text = partial_result.get('partial', '').strip()
             if partial_text:
                 print(f"🎯 VOSK Partial: '{partial_text}'")
-                return {"text": "", "partial": partial_text, "is_final": False}
         except Exception as e:
             print(f"❌ Lỗi VOSK processing: {e}")
-        return {"text": "", "partial": "Nói tiếp đi...", "is_final": False}
     def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
         """Resample audio"""
@@ -174,11 +187,17 @@ class StreamingVoiceService:
         self.rag_system = rag_system
         self.tts_service = tts_service
-        # Khởi tạo VOSK ASR - ĐƠN GIẢN
         print("🔄 Đang khởi tạo VOSK ASR...")
         self.vosk_asr = VoskStreamingASR()
         self.is_listening = False
         self.current_callback = None
     def start_listening(self, speech_callback: Callable) -> bool:
         """Bắt đầu lắng nghe"""
@@ -200,26 +219,36 @@ class StreamingVoiceService:
         return True
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
-        """Xử lý audio streaming - ĐƠN GIẢN & HIỆU QUẢ"""
         if not audio_data:
-            return {
-                'transcription': "Không có âm thanh",
-                'response': "",
-                'tts_audio': None,
-                'status': 'error'
-            }
         try:
             sample_rate, audio_array = audio_data
-            print(f"🎤 Nhận audio: {len(audio_array)} samples")
             # Đảm bảo VOSK stream đang chạy
             if not self.vosk_asr.is_streaming:
                 self.vosk_asr.start_stream()
-            # Xử lý với VOSK
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
             # LUÔN trả về text để hiển thị real-time
             if result['partial']:
@@ -230,13 +259,24 @@ class StreamingVoiceService:
                     'status': 'listening'
                 }
             elif result['is_final'] and result['text']:
-                # Có kết quả cuối - tạo phản hồi AI
                 print(f"📝 Final transcription: '{result['text']}'")
                 response = self._generate_ai_response(result['text'])
                 return {
                     'transcription': result['text'],
                     'response': response,
-                    'tts_audio': None,
                     'status': 'completed'
                 }
             else:
@@ -249,18 +289,13 @@ class StreamingVoiceService:
         except Exception as e:
             print(f"❌ Lỗi xử lý audio: {e}")
-            return {
-                'transcription': f"Lỗi: {e}",
-                'response': "",
-                'tts_audio': None,
-                'status': 'error'
-            }
     def _generate_ai_response(self, transcription: str) -> str:
-        """Tạo phản hồi AI đơn giản"""
         try:
             messages = [
-                {"role": "system", "content": "Bạn là trợ lý AI thân thiện. Trả lời ngắn gọn bằng tiếng Việt."},
                 {"role": "user", "content": transcription}
             ]
@@ -277,6 +312,34 @@ class StreamingVoiceService:
             print(f"❌ Lỗi AI: {e}")
             return "Xin lỗi, tôi không thể trả lời ngay lúc này."
     def stop_listening(self):
         """Dừng lắng nghe"""
         self.is_listening = False
@@ -294,6 +357,28 @@ class StreamingVoiceService:
             'is_listening': self.is_listening,
             'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False
         }
 # import io
 # import numpy as np
 # import soundfile as sf

 from vosk import Model, KaldiRecognizer
 from groq import Groq
 from typing import Optional, Dict, Any, Callable
+from config.settings import settings
 class VoskStreamingASR:
     def __init__(self, model_path: str = None):
+        """Khởi tạo VOSK ASR streaming với buffer"""
         self.model = None
         self.recognizer = None
         self.sample_rate = 16000
         self.is_streaming = False
+        # Buffer để tích luỹ audio - QUAN TRỌNG
         self.audio_buffer = []
+        self.buffer_size = 32000  # 2 giây audio
+        # Tự động tải model nếu không có đường dẫn
         if model_path is None:
             model_path = self._download_vosk_model()
             print(f"🔄 Đang tải VOSK model từ: {model_path}")
             try:
                 self.model = Model(model_path)
                 print("✅ Đã tải VOSK model thành công")
             except Exception as e:
                 print(f"❌ Lỗi khởi tạo VOSK model: {e}")
                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                     zip_ref.extractall("models/")
                 if os.path.exists("models/vosk-model-small-vn-0.4"):
                     os.rename("models/vosk-model-small-vn-0.4", model_dir)
     def start_stream(self):
         """Bắt đầu stream mới"""
         if self.model is None:
+            print("❌ VOSK model chưa được khởi tạo")
             return False
         try:
             return False
     def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = None) -> Dict[str, Any]:
+        """Xử lý audio chunk với buffer - FIXED VERSION"""
         if self.recognizer is None or not self.is_streaming:
             return {"text": "", "partial": "", "is_final": False}
+        start_time = time.time()
         try:
             # Resample nếu cần
             if sample_rate and sample_rate != self.sample_rate:
             # THÊM VÀO BUFFER - QUAN TRỌNG
             self.audio_buffer.extend(audio_chunk)
+            # Giữ buffer trong giới hạn
+            if len(self.audio_buffer) > self.buffer_size:
+                self.audio_buffer = self.audio_buffer[-self.buffer_size:]
             # Chỉ xử lý khi có đủ audio (ít nhất 1 giây)
             if len(self.audio_buffer) < 16000:
                 return {"text": "", "partial": "Đang nghe...", "is_final": False}
+            # Lấy audio từ buffer để xử lý
+            process_audio = np.array(self.audio_buffer, dtype=np.int16)
             # Chuyển sang bytes
             audio_bytes = process_audio.tobytes()
+            # Xử lý với VOSK - GỬI TOÀN BỘ BUFFER
             if self.recognizer.AcceptWaveform(audio_bytes):
                 result_json = self.recognizer.Result()
                 result = json.loads(result_json)
                     print(f"✅ VOSK Final: '{text}'")
                     # Reset buffer sau khi có kết quả
                     self.audio_buffer = []
+                    processing_time = time.time() - start_time
+                    return {"text": text, "partial": "", "is_final": True, "processing_time": processing_time}
+            # Kiểm tra partial result - LUÔN CÓ KẾT QUẢ
             partial_json = self.recognizer.PartialResult()
             partial_result = json.loads(partial_json)
             partial_text = partial_result.get('partial', '').strip()
+            processing_time = time.time() - start_time
             if partial_text:
                 print(f"🎯 VOSK Partial: '{partial_text}'")
+                return {"text": "", "partial": partial_text, "is_final": False, "processing_time": processing_time}
+            else:
+                # LUÔN trả về partial text để hiển thị
+                return {"text": "", "partial": "🎤 Đang nghe... nói tiếp đi", "is_final": False, "processing_time": processing_time}
         except Exception as e:
             print(f"❌ Lỗi VOSK processing: {e}")
+            return {"text": "", "partial": f"Lỗi: {e}", "is_final": False, "processing_time": 0}
     def _resample_audio(self, audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
         """Resample audio"""
         self.rag_system = rag_system
         self.tts_service = tts_service
+        # Khởi tạo VOSK ASR
         print("🔄 Đang khởi tạo VOSK ASR...")
         self.vosk_asr = VoskStreamingASR()
         self.is_listening = False
         self.current_callback = None
+        # Latency tracking - FIXED
+        self.latency_metrics = {
+            'asr': [], 'llm': [], 'tts': [], 'total': []
+        }
+        self.last_processing_time = 0
     def start_listening(self, speech_callback: Callable) -> bool:
         """Bắt đầu lắng nghe"""
         return True
     def process_streaming_audio(self, audio_data: tuple) -> Dict[str, Any]:
+        """Xử lý audio streaming - FIXED LATENCY TRACKING"""
         if not audio_data:
+            return self._create_error_response("❌ Không có dữ liệu âm thanh")
+        total_start_time = time.time()
         try:
             sample_rate, audio_array = audio_data
+            print(f"🎤 Nhận audio: {len(audio_array)} samples, {sample_rate}Hz")
             # Đảm bảo VOSK stream đang chạy
             if not self.vosk_asr.is_streaming:
                 self.vosk_asr.start_stream()
+            # Xử lý với VOSK - với latency tracking
+            asr_start_time = time.time()
             result = self.vosk_asr.process_audio_chunk(audio_array, sample_rate)
+            asr_time = time.time() - asr_start_time
+            # Cập nhật latency metrics
+            if 'processing_time' in result and result['processing_time'] > 0:
+                self.latency_metrics['asr'].append(result['processing_time'])
+            else:
+                self.latency_metrics['asr'].append(asr_time)
+            total_time = time.time() - total_start_time
+            self.latency_metrics['total'].append(total_time)
+            print(f"⏱️ ASR time: {asr_time:.3f}s, Total: {total_time:.3f}s")
             # LUÔN trả về text để hiển thị real-time
             if result['partial']:
                     'status': 'listening'
                 }
             elif result['is_final'] and result['text']:
+                # Có kết quả cuối - tạo phản hồi AI với latency tracking
                 print(f"📝 Final transcription: '{result['text']}'")
+                llm_start_time = time.time()
                 response = self._generate_ai_response(result['text'])
+                llm_time = time.time() - llm_start_time
+                self.latency_metrics['llm'].append(llm_time)
+                tts_start_time = time.time()
+                tts_audio_path = self._text_to_speech(response)
+                tts_time = time.time() - tts_start_time
+                if tts_time > 0:
+                    self.latency_metrics['tts'].append(tts_time)
                 return {
                     'transcription': result['text'],
                     'response': response,
+                    'tts_audio': tts_audio_path,
                     'status': 'completed'
                 }
             else:
         except Exception as e:
             print(f"❌ Lỗi xử lý audio: {e}")
+            return self._create_error_response(f"Lỗi: {e}")
     def _generate_ai_response(self, transcription: str) -> str:
+        """Tạo phản hồi AI"""
         try:
             messages = [
+                {"role": "system", "content": "Bạn là trợ lý AI. Trả lời ngắn gọn bằng tiếng Việt."},
                 {"role": "user", "content": transcription}
             ]
             print(f"❌ Lỗi AI: {e}")
             return "Xin lỗi, tôi không thể trả lời ngay lúc này."
+    def _text_to_speech(self, text: str) -> Optional[str]:
+        """Convert text to speech through ``self.tts_service``.
+
+        Args:
+            text: The text to synthesize. An empty/falsy value short-circuits
+                and no TTS call is made.
+
+        Returns:
+            Whatever ``self.tts_service.text_to_speech`` returns (presumably a
+            path to the generated audio file -- confirm against the TTS
+            service), or ``None`` when ``text`` is empty or the call raises.
+        """
+        try:
+            if not text:
+                return None
+            # Delegate to the TTS service; language and speed are fixed here
+            # ('vi', 1.0) rather than taken from the caller.
+            audio_path = self.tts_service.text_to_speech(
+                text=text,
+                language='vi',
+                speed=1.0
+            )
+            return audio_path
+        except Exception as e:
+            # Best-effort: any TTS failure is printed and reported as "no audio".
+            print(f"❌ Lỗi TTS: {e}")
+            return None
+    def _create_error_response(self, message: str) -> Dict[str, Any]:
+        """Build the error-shaped result dictionary.
+
+        Args:
+            message: Text placed in the ``'transcription'`` field.
+
+        Returns:
+            A dict with ``'transcription'`` set to ``message``, a fixed retry
+            prompt in ``'response'``, ``'tts_audio'`` set to ``None`` and
+            ``'status'`` set to ``'error'``.
+        """
+        return {
+            'transcription': message,
+            'response': "Vui lòng thử lại",
+            'tts_audio': None,
+            'status': 'error'
+        }
     def stop_listening(self):
         """Dừng lắng nghe"""
         self.is_listening = False
             'is_listening': self.is_listening,
             'vosk_active': self.vosk_asr.is_streaming if self.vosk_asr else False
         }
+    def get_latency_stats(self) -> dict:
+        """Summarize recent latency samples for each tracked component.
+
+        Iterates over ``self.latency_metrics`` and, for every component,
+        computes statistics over at most its 10 most recent samples.
+
+        Returns:
+            A dict keyed by component name. Each value holds ``avg``, ``min``,
+            ``max``, ``count`` and ``recent_values`` (the samples formatted
+            with three decimals and an ``s`` suffix). A component with no
+            samples gets zeros and an empty ``recent_values`` list.
+
+        Side effects:
+            Prints the computed stats before returning them.
+        """
+        stats = {}
+        for component, latencies in self.latency_metrics.items():
+            # NOTE(review): for a list, the len() check repeats what the
+            # truthiness test already establishes.
+            if latencies and len(latencies) > 0:
+                # Keep only the 10 most recent values
+                recent_latencies = latencies[-10:] if len(latencies) > 10 else latencies
+                stats[component] = {
+                    'avg': sum(recent_latencies) / len(recent_latencies),
+                    'min': min(recent_latencies),
+                    'max': max(recent_latencies),
+                    'count': len(recent_latencies),
+                    'recent_values': [f"{x:.3f}s" for x in recent_latencies]
+                }
+            else:
+                # No samples recorded yet for this component: report zeros.
+                stats[component] = {
+                    'avg': 0, 'min': 0, 'max': 0, 'count': 0, 'recent_values': []
+                }
+        print(f"📊 Latency stats: {stats}")
+        return stats
 # import io
 # import numpy as np
 # import soundfile as sf