import asyncio
import base64
import json
from pathlib import Path
from typing import Callable, Optional

import gradio as gr
import numpy as np
import openai
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from fastrtc import (
    AdditionalOutputs,
    AsyncStreamHandler,
    Stream,
    get_twilio_turn_credentials,
    wait_for_item,
)
from gradio.utils import get_space

from core.silero_vad import SileroVAD
from services.streaming_voice_service import VoskStreamingASR


class OpenAIRealtimeService:
    """OpenAI Realtime API service for high-quality streaming."""

    def __init__(self, api_key: str):
        self.client = openai.AsyncOpenAI(api_key=api_key)
        self.connection = None
        self.is_active = False

    async def start_session(self):
        """Start an OpenAI Realtime session."""
        try:
            self.connection = await self.client.beta.realtime.connect(
                model="gpt-4o-mini-realtime-preview-2024-12-17"
            )

            # Use server-side VAD for turn detection and Whisper for
            # Vietnamese input transcription.
            await self.connection.session.update(
                session={
                    "turn_detection": {"type": "server_vad"},
                    "input_audio_transcription": {
                        "model": "whisper-1",
                        "language": "vi",
                    },
                }
            )

            self.is_active = True
            print("✅ OpenAI Realtime session started")
            return True

        except Exception as e:
            print(f"❌ Error starting OpenAI Realtime: {e}")
            return False

    async def process_audio_chunk(self, audio_chunk: np.ndarray, sample_rate: int = 24000):
        """Send an audio chunk to the OpenAI Realtime API."""
        if not self.connection or not self.is_active:
            return None

        try:
            # The Realtime API expects 24 kHz PCM; resample if needed.
            if sample_rate != 24000:
                audio_chunk = self._resample_audio(audio_chunk, sample_rate, 24000)

            # Encode the int16 PCM as base64 and append it to the input buffer.
            audio_message = base64.b64encode(audio_chunk.tobytes()).decode("utf-8")
            await self.connection.input_audio_buffer.append(audio=audio_message)

        except Exception as e:
            print(f"❌ Error processing audio with OpenAI: {e}")

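    def _resample_audio(self, audio: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
        # Referenced above but not defined in this section. A minimal sketch,
        # assuming int16 mono PCM and that linear interpolation is good enough;
        # the project's actual implementation may differ (e.g. scipy or soxr).
        src_len = len(audio)
        dst_len = int(src_len * dst_rate / src_rate)
        positions = np.linspace(0, src_len - 1, dst_len)
        resampled = np.interp(positions, np.arange(src_len), audio.astype(np.float32))
        return resampled.astype(np.int16)
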
    async def get_responses(self):
        """Yield events from the OpenAI Realtime API."""
        if not self.connection:
            return

        async for event in self.connection:
            if event.type == "input_audio_buffer.speech_started":
                yield {"type": "speech_started"}

            elif event.type == "conversation.item.input_audio_transcription.completed":
                yield {
                    "type": "user_transcription",
                    "content": event.transcript,
                    "role": "user",
                }

            elif event.type == "response.audio_transcript.done":
                yield {
                    "type": "assistant_transcription",
                    "content": event.transcript,
                    "role": "assistant",
                }

            elif event.type == "response.audio.delta":
                # Audio deltas arrive as base64-encoded 24 kHz int16 PCM.
                audio_data = np.frombuffer(
                    base64.b64decode(event.delta), dtype=np.int16
                )
                yield {
                    "type": "audio_delta",
                    "audio": audio_data,
                    "sample_rate": 24000,
                }

    async def close(self):
        """Close the connection."""
        if self.connection:
            await self.connection.close()
            self.is_active = False
            print("🛑 OpenAI Realtime session closed")


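# Hypothetical usage sketch, not part of the original module: drive the
# service by pumping microphone chunks in while printing events as they
# arrive. `mic_chunks` is an assumed async iterator yielding int16 numpy
# arrays at 24 kHz; swap in whatever audio source the app actually uses.
async def _demo_openai_realtime(api_key: str, mic_chunks) -> None:
    service = OpenAIRealtimeService(api_key)
    if not await service.start_session():
        return

    async def pump() -> None:
        async for chunk in mic_chunks:
            await service.process_audio_chunk(chunk, sample_rate=24000)

    pump_task = asyncio.create_task(pump())
    try:
        async for event in service.get_responses():
            print(event["type"], event.get("content", ""))
    finally:
        pump_task.cancel()
        await service.close()

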
class HybridStreamingService:
    """Service combining local VOSK and OpenAI Realtime."""

    def __init__(self, groq_client, rag_system, tts_service, openai_key: Optional[str] = None):
        self.groq_client = groq_client
        self.rag_system = rag_system
        self.tts_service = tts_service

        # Local pipeline: VOSK streaming ASR plus Silero VAD.
        self.vosk_asr = VoskStreamingASR()
        self.vad_processor = SileroVAD()

        # OpenAI Realtime is only available when an API key is provided.
        self.openai_service = None
        if openai_key:
            self.openai_service = OpenAIRealtimeService(openai_key)

        self.current_mode = "local"
        self.is_listening = False
        self.current_callback = None

    async def start_listening(self, speech_callback: Callable, mode: str = "auto"):
        """Start listening with the selected mode."""
        self.current_callback = speech_callback

        # "openai" requires a configured key; "auto" and anything else
        # fall back to the local VOSK pipeline.
        if mode == "openai" and self.openai_service:
            return await self._start_openai_mode()
        else:
            return self._start_local_mode()

    async def _start_openai_mode(self):
        """Start OpenAI Realtime mode."""
        try:
            success = await self.openai_service.start_session()
            if success:
                self.is_listening = True
                self.current_mode = "openai"

                # Consume Realtime events in the background.
                asyncio.create_task(self._openai_response_handler())

                if self.current_callback:
                    self.current_callback({
                        "transcription": "Started with OpenAI Realtime...",
                        "response": "",
                        "tts_audio": None,
                        "status": "openai_listening",
                    })

                return True
            return False

        except Exception as e:
            print(f"❌ Error starting OpenAI mode: {e}")
            return False

    def _start_local_mode(self):
        """Start local VOSK mode."""
        try:
            if self.vosk_asr.start_stream() and self.vad_processor.start_stream(self._on_speech_detected):
                self.is_listening = True
                self.current_mode = "local"

                # The speech-detection callback and worker threads are assumed
                # to be defined elsewhere in this class.
                self._start_worker_threads()

                if self.current_callback:
                    self.current_callback({
                        "transcription": "Started with local VOSK...",
                        "response": "",
                        "tts_audio": None,
                        "status": "local_listening",
                    })

                return True
            return False

        except Exception as e:
            print(f"❌ Error starting local mode: {e}")
            return False

    async def _openai_response_handler(self):
        """Handle responses from OpenAI Realtime."""
        try:
            async for response in self.openai_service.get_responses():
                if response["type"] == "user_transcription" and self.current_callback:
                    self.current_callback({
                        "transcription": response["content"],
                        "response": "Processing...",
                        "tts_audio": None,
                        "status": "processing",
                    })

                elif response["type"] == "assistant_transcription" and self.current_callback:
                    self.current_callback({
                        "transcription": "",
                        "response": response["content"],
                        "tts_audio": None,
                        "status": "completed",
                    })

                elif response["type"] == "audio_delta" and self.current_callback:
                    # Persist the PCM delta so the UI can play it back as a file.
                    audio_path = self._save_temp_audio(response["audio"], response["sample_rate"])
                    self.current_callback({
                        "transcription": "",
                        "response": "",
                        "tts_audio": audio_path,
                        "status": "audio_streaming",
                    })

        except Exception as e:
            print(f"❌ OpenAI response handler error: {e}")
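
    def _save_temp_audio(self, audio: np.ndarray, sample_rate: int) -> str:
        # Referenced above but not defined in this section. A minimal sketch,
        # assuming the real implementation writes a temporary mono WAV file
        # and returns its path for the UI to stream.
        import os
        import tempfile
        import wave

        fd, path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # int16 PCM
            wf.setframerate(sample_rate)
            wf.writeframes(audio.astype(np.int16).tobytes())
        return path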