malek-messaoudii committed on
Commit
c7fc3b6
·
1 Parent(s): 28e04dd

Add tts/stt services

Browse files
config.py CHANGED
@@ -16,6 +16,7 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
16
 
17
  HUGGINGFACE_STANCE_MODEL_ID = os.getenv("HUGGINGFACE_STANCE_MODEL_ID")
18
  HUGGINGFACE_LABEL_MODEL_ID = os.getenv("HUGGINGFACE_LABEL_MODEL_ID")
 
19
 
20
  # Use Hugging Face model ID instead of local path
21
  STANCE_MODEL_ID = HUGGINGFACE_STANCE_MODEL_ID
 
HUGGINGFACE_STANCE_MODEL_ID = os.getenv("HUGGINGFACE_STANCE_MODEL_ID")
HUGGINGFACE_LABEL_MODEL_ID = os.getenv("HUGGINGFACE_LABEL_MODEL_ID")
# Whisper checkpoint used by services/stt_service.py; override via env var,
# defaults to openai/whisper-large-v3.
HUGGINGFACE_STT_MODEL_ID = os.getenv("HUGGINGFACE_STT_MODEL_ID", "openai/whisper-large-v3")

# Use Hugging Face model ID instead of local path
STANCE_MODEL_ID = HUGGINGFACE_STANCE_MODEL_ID
models/audio.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints"""
2
+
3
+ from pydantic import BaseModel, Field, ConfigDict
4
+ from typing import Optional
5
+
6
+
7
+ # ================================
8
+ # SPEECH TO TEXT
9
+ # ================================
10
+
11
class STTResponse(BaseModel):
    """Response model for Whisper speech → text"""
    # NOTE: json_schema_extra and Field descriptions surface in the generated
    # OpenAPI docs at runtime — keep them in sync with actual endpoint output.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2
            }
        }
    )

    # Transcription returned by the STT service (required).
    text: str = Field(..., description="Transcribed text from the input audio")
    # Model identifier reported back to the client (required).
    model_name: str = Field(..., description="STT model used for inference")
    # Optional: the endpoint currently hard-codes "en" rather than detecting it.
    language: Optional[str] = Field(None, description="Detected language")
    # Optional: the service does not compute this; the endpoint passes None.
    duration_seconds: Optional[float] = Field(
        None,
        description="Approximate audio duration in seconds"
    )
31
+
32
+
33
+ # ================================
34
+ # TEXT TO SPEECH
35
+ # ================================
36
+
37
class TTSRequest(BaseModel):
    """Text input for TTS conversion"""
    # Example shown in the interactive OpenAPI docs.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "Hello, welcome to our AI system."
            }
        }
    )

    # Pydantic rejects empty strings and anything over 500 characters with a
    # 422 before the endpoint body runs — the TTS service never sees them.
    text: str = Field(
        ..., min_length=1, max_length=500,
        description="Text that will be converted into speech"
    )
51
+
52
+
53
class TTSResponse(BaseModel):
    """Metadata response for TTS generation"""
    # NOTE(review): the TTS endpoint streams raw audio and sends this model
    # serialized into the X-Audio-Metadata header, not as the response body.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark"
            }
        }
    )

    # Human-readable status message.
    message: str
    # Container format of the generated audio (the endpoint always uses "wav").
    audio_format: str
    # Not computed by the current service; the endpoint passes None.
    length_seconds: Optional[float] = None
    # Model identifier, e.g. "suno/bark".
    model_name: str
routes/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
 
3
  from fastapi import APIRouter
4
  from . import root, health, stance, label
5
-
6
  # Create main router
7
  api_router = APIRouter()
8
 
@@ -11,6 +11,7 @@ api_router.include_router(root.router)
11
  api_router.include_router(health.router)
12
  api_router.include_router(stance.router, prefix="/stance")
13
  api_router.include_router(label.router, prefix="/label")
 
14
 
15
  __all__ = ["api_router"]
16
 
 
from fastapi import APIRouter
from . import root, health, stance, label
from routes.audio import router as audio_router
# Create main router
api_router = APIRouter()
 
 
api_router.include_router(health.router)
api_router.include_router(stance.router, prefix="/stance")
api_router.include_router(label.router, prefix="/label")
# The audio router declares its own "/audio" prefix in routes/audio.py,
# so no prefix is added here.
api_router.include_router(audio_router)

__all__ = ["api_router"]
17
 
routes/audio.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Speech-to-Text & Text-to-Speech API Endpoints"""
2
+
3
+ from fastapi import APIRouter, UploadFile, File, HTTPException
4
+ from fastapi.responses import StreamingResponse
5
+ import logging
6
+
7
+ from models.audio import (
8
+ STTResponse,
9
+ TTSRequest,
10
+ TTSResponse
11
+ )
12
+
13
+ from services.stt_service import transcribe_audio
14
+ from services.tts_service import text_to_speech
15
+
16
+ router = APIRouter(prefix="/audio", tags=["Audio"])
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # ============================================================
21
+ # SPEECH TO TEXT (Whisper)
22
+ # ============================================================
23
+
24
@router.post("/speech-to-text", response_model=STTResponse)
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    """
    Convert speech to text using openai/whisper-large-v3.

    - Upload an audio file (wav, mp3, m4a…)
    - Returns the transcribed text

    Raises:
        HTTPException: 400 for an empty upload, 500 if transcription fails.
    """
    try:
        audio_bytes = await file.read()
        if not audio_bytes:
            # Reject empty uploads explicitly instead of forwarding them to
            # the model and surfacing an opaque 500 later.
            raise HTTPException(status_code=400, detail="Uploaded audio file is empty")

        result = transcribe_audio(audio_bytes)

        response_data = STTResponse(
            text=result,
            model_name="openai/whisper-large-v3",
            # NOTE(review): Whisper auto-detects language; "en" is an assumption
            # here, not a detection result — confirm against the service output.
            language="en",
            duration_seconds=None  # duration is not reported by the STT service
        )

        logger.info("STT completed: %s...", response_data.text[:40])
        return response_data

    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 400 above) propagate unwrapped
        # instead of being re-wrapped as a generic 500.
        raise
    except Exception as e:
        logger.error("STT error: %s", e)
        raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}")
50
+
51
+
52
+ # ============================================================
53
+ # TEXT TO SPEECH (Bark)
54
+ # ============================================================
55
+
56
@router.post("/text-to-speech")
async def text_to_speech_endpoint(request: TTSRequest):
    """
    Convert text to synthesized speech using Bark.

    Returns the generated audio as a streamed ``audio/wav`` body; generation
    metadata (a serialized TTSResponse) travels in the ``X-Audio-Metadata``
    response header.

    Raises:
        HTTPException: 500 if speech synthesis fails.
    """
    # response_model=TTSResponse was removed from the decorator: the endpoint
    # returns a StreamingResponse with raw audio, so advertising a JSON body
    # in the OpenAPI schema was misleading.
    try:
        audio_bytes = text_to_speech(request.text)

        # Metadata goes in a header because the body is the audio stream itself.
        metadata = TTSResponse(
            message="Audio generated successfully",
            audio_format="wav",
            length_seconds=None,  # length is not reported by the TTS service
            model_name="suno/bark"
        )

        logger.info("TTS generated for text: %s...", request.text[:40])

        return StreamingResponse(
            iter([audio_bytes]),
            media_type="audio/wav",
            headers={
                "X-Audio-Metadata": metadata.model_dump_json()
            }
        )

    except HTTPException:
        # Preserve deliberate HTTP errors rather than re-wrapping them as 500s.
        raise
    except Exception as e:
        logger.error("TTS error: %s", e)
        raise HTTPException(status_code=500, detail=f"Text-to-speech failed: {str(e)}")
services/stt_service.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # services/stt_service.py
2
+
3
+ import requests
4
+ from config import HUGGINGFACE_API_KEY, HUGGINGFACE_STT_MODEL_ID
5
+
6
def transcribe_audio(file_bytes: bytes) -> str:
    """
    Convert audio bytes into English text using Whisper large-v3
    through the Hugging Face Inference API.

    Args:
        file_bytes: Raw audio file content (wav/mp3/…), sent as-is.

    Returns:
        The transcribed text.

    Raises:
        RuntimeError: If the API returns a non-200 status, a non-JSON body,
            or a JSON body with no ``text`` field.
        requests.RequestException: On network failure or timeout.
    """
    headers = {
        "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
        "Content-Type": "application/octet-stream",
    }

    url = f"https://api-inference.huggingface.co/models/{HUGGINGFACE_STT_MODEL_ID}"

    # Timeout keeps a stuck inference call from hanging the endpoint forever.
    response = requests.post(url, headers=headers, data=file_bytes, timeout=120)

    if response.status_code != 200:
        # Fail loudly: previously an HTTP error produced the literal string
        # "Error: ..." which callers would treat as a real transcription.
        raise RuntimeError(f"STT API error {response.status_code}: {response.text}")

    try:
        result = response.json()
    except ValueError as e:
        raise RuntimeError("Invalid (non-JSON) response from STT model.") from e

    text = result.get("text") if isinstance(result, dict) else None
    if text is None:
        raise RuntimeError("STT response contained no transcription.")
    return text
services/tts_service.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from config import HUGGINGFACE_API_KEY
3
+
4
+ # Bark model
5
+ BARK_MODEL_ID = "suno/bark"
6
+
7
def text_to_speech(text: str) -> bytes:
    """
    Convert text to speech (audio bytes) using the Hugging Face Bark model.

    Args:
        text: The text to synthesize.

    Returns:
        Raw WAV bytes returned by the Bark inference endpoint.

    Raises:
        RuntimeError: If the API returns a non-200 status.
        requests.RequestException: On network failure or timeout.
    """
    url = f"https://api-inference.huggingface.co/models/{BARK_MODEL_ID}"

    headers = {
        "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "inputs": text
    }

    # Timeout keeps a stuck inference call from blocking the caller forever.
    response = requests.post(url, headers=headers, json=payload, timeout=120)

    # Bark returns raw WAV bytes on success.
    if response.status_code != 200:
        # RuntimeError instead of the bare Exception base class, so callers
        # can catch something more specific than Exception.
        raise RuntimeError(f"Bark API error: {response.text}")

    return response.content