malek-messaoudii committed on
Commit
9aa985d
·
1 Parent(s): e411044

Refactor audio models and services for improved error handling and response streaming

Browse files
models/audio.py CHANGED
@@ -1,26 +1,10 @@
1
- """Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints"""
2
-
3
  from pydantic import BaseModel, Field, ConfigDict
4
  from typing import Optional
5
 
6
-
7
- class MyModel(BaseModel):
8
- model_loaded: bool
9
- model_name: str
10
-
11
- model_config = {
12
- "protected_namespaces": ()
13
- }
14
-
15
-
16
- # ================================
17
- # SPEECH TO TEXT
18
- # ================================
19
-
20
-
21
-
22
  class STTResponse(BaseModel):
23
- """Response model for Whisper speech → text"""
24
  model_config = ConfigDict(
25
  json_schema_extra={
26
  "example": {
@@ -31,7 +15,6 @@ class STTResponse(BaseModel):
31
  }
32
  }
33
  )
34
-
35
  text: str = Field(..., description="Transcribed text from the input audio")
36
  model_name: str = Field(..., description="STT model used for inference")
37
  language: Optional[str] = Field(None, description="Detected language")
@@ -41,28 +24,17 @@ class STTResponse(BaseModel):
41
  )
42
 
43
 
44
- # ================================
45
- # TEXT TO SPEECH
46
- # ================================
47
-
48
  class TTSRequest(BaseModel):
49
- """Text input for TTS conversion"""
50
  model_config = ConfigDict(
51
- json_schema_extra={
52
- "example": {
53
- "text": "Hello, welcome to our AI system."
54
- }
55
- }
56
- )
57
-
58
- text: str = Field(
59
- ..., min_length=1, max_length=500,
60
- description="Text that will be converted into speech"
61
  )
 
62
 
63
 
64
  class TTSResponse(BaseModel):
65
- """Metadata response for TTS generation"""
66
  model_config = ConfigDict(
67
  json_schema_extra={
68
  "example": {
@@ -73,7 +45,6 @@ class TTSResponse(BaseModel):
73
  }
74
  }
75
  )
76
-
77
  message: str
78
  audio_format: str
79
  length_seconds: Optional[float] = None
 
 
 
1
  from pydantic import BaseModel, Field, ConfigDict
2
  from typing import Optional
3
 
4
+ # ==============================
5
+ # SPEECH TO TEXT RESPONSE
6
+ # ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  class STTResponse(BaseModel):
 
8
  model_config = ConfigDict(
9
  json_schema_extra={
10
  "example": {
 
15
  }
16
  }
17
  )
 
18
  text: str = Field(..., description="Transcribed text from the input audio")
19
  model_name: str = Field(..., description="STT model used for inference")
20
  language: Optional[str] = Field(None, description="Detected language")
 
24
  )
25
 
26
 
27
+ # ==============================
28
+ # TEXT TO SPEECH REQUEST / RESPONSE
29
+ # ==============================
 
30
# Request body for the TTS endpoint: a single, bounded text field.
# (Intentionally no class docstring — pydantic v2 would fold it into the
# generated JSON schema as the model description.)
class TTSRequest(BaseModel):
    model_config = ConfigDict(
        json_schema_extra={
            "example": {"text": "Hello, welcome to our AI system."},
        },
    )

    # Bounded to 1..500 characters so a request can be neither empty nor huge.
    text: str = Field(
        ...,
        min_length=1,
        max_length=500,
        description="Text to convert to speech",
    )
35
 
36
 
37
  class TTSResponse(BaseModel):
 
38
  model_config = ConfigDict(
39
  json_schema_extra={
40
  "example": {
 
45
  }
46
  }
47
  )
 
48
  message: str
49
  audio_format: str
50
  length_seconds: Optional[float] = None
routes/audio.py CHANGED
@@ -1,25 +1,34 @@
1
- from fastapi import APIRouter, UploadFile, File
2
  from services.tts_service import generate_tts
3
  from services.stt_service import speech_to_text
4
- from fastapi.responses import FileResponse
5
- import uuid
6
 
7
  router = APIRouter(prefix="/audio", tags=["Audio"])
8
 
9
-
 
 
10
  @router.post("/tts")
11
  async def tts(text: str):
12
- audio_bytes = await generate_tts(text)
13
-
14
- filename = f"tts_{uuid.uuid4()}.wav"
15
- with open(filename, "wb") as f:
16
- f.write(audio_bytes)
17
 
18
- return FileResponse(filename, media_type="audio/wav", filename=filename)
 
19
 
20
 
 
 
 
21
  @router.post("/stt")
22
  async def stt(file: UploadFile = File(...)):
23
- audio_bytes = await file.read()
24
- text = await speech_to_text(audio_bytes)
 
 
 
 
25
  return {"text": text}
 
1
+ from fastapi import APIRouter, UploadFile, File, HTTPException
2
  from services.tts_service import generate_tts
3
  from services.stt_service import speech_to_text
4
+ from fastapi.responses import StreamingResponse
5
+ import io
6
 
7
  router = APIRouter(prefix="/audio", tags=["Audio"])
8
 
9
# ======================
# TEXT TO SPEECH
# ======================
@router.post("/tts")
async def tts(text: str):
    """Convert *text* to speech and stream the result as WAV audio.

    The audio is returned straight from memory as a ``StreamingResponse``
    (``audio/wav``); nothing is written to disk.

    Raises:
        HTTPException: 400 if *text* is empty or whitespace-only,
            500 if the TTS backend fails.
    """
    # Fail fast on empty input instead of spending a backend call on it.
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="Text must not be empty")

    try:
        audio_bytes = await generate_tts(text)
    except HTTPException:
        # Don't re-wrap errors that already carry an HTTP status.
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Return as streaming response without saving a file.
    return StreamingResponse(io.BytesIO(audio_bytes), media_type="audio/wav")
21
 
22
 
23
# ======================
# SPEECH TO TEXT
# ======================
@router.post("/stt")
async def stt(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file; responds with ``{"text": ...}``."""
    try:
        # Read the upload fully, then hand the raw bytes to the STT service.
        transcript = await speech_to_text(await file.read())
    except Exception as exc:
        # Surface any read/backend failure as a 500 with its message.
        raise HTTPException(status_code=500, detail=str(exc))

    return {"text": transcript}
services/stt_service.py CHANGED
@@ -1,9 +1,18 @@
1
  from services.gemini_client import get_gemini_client
 
2
 
3
  async def speech_to_text(audio_bytes: bytes) -> str:
 
 
 
4
  client = get_gemini_client()
 
 
 
 
5
  response = client.models.generate_content(
6
  model="gemini-2.5-flash",
7
- contents=[{"mime_type": "audio/wav", "data": audio_bytes}],
8
  )
 
9
  return response.text
 
1
  from services.gemini_client import get_gemini_client
2
+ from google.genai import types
3
 
4
async def speech_to_text(audio_bytes: bytes) -> str:
    """Transcribe speech audio to text using the Gemini API.

    Args:
        audio_bytes: Raw audio payload. Assumed to be WAV-encoded
            (mime type is hard-coded to ``audio/wav``) — TODO confirm
            callers only upload WAV.

    Returns:
        The transcribed text, or "" if the model returns no text.
    """
    client = get_gemini_client()

    # Inline audio must be wrapped in a Part, not a File: types.File
    # describes an already-uploaded file (name/uri) and cannot be built
    # from raw bytes — constructing it with `data=` fails validation.
    audio_part = types.Part.from_bytes(data=audio_bytes, mime_type="audio/wav")

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        # A short instruction alongside the audio; without one the model
        # may describe the clip instead of transcribing it.
        contents=["Transcribe this audio verbatim.", audio_part],
    )

    # response.text can be None when no candidate carries text; keep the
    # declared `-> str` contract.
    return response.text or ""
services/tts_service.py CHANGED
@@ -2,7 +2,11 @@ from services.gemini_client import get_gemini_client
2
  from google.genai import types
3
 
4
  async def generate_tts(text: str) -> bytes:
 
 
 
5
  client = get_gemini_client()
 
6
  response = client.models.generate_content(
7
  model="gemini-2.5-flash-preview-tts",
8
  contents=text,
@@ -15,4 +19,6 @@ async def generate_tts(text: str) -> bytes:
15
  ),
16
  ),
17
  )
 
 
18
  return response.candidates[0].content.parts[0].inline_data.data
 
2
  from google.genai import types
3
 
4
  async def generate_tts(text: str) -> bytes:
5
+ """
6
+ Convert text to speech using Gemini API
7
+ """
8
  client = get_gemini_client()
9
+
10
  response = client.models.generate_content(
11
  model="gemini-2.5-flash-preview-tts",
12
  contents=text,
 
19
  ),
20
  ),
21
  )
22
+
23
+ # Return raw audio bytes
24
  return response.candidates[0].content.parts[0].inline_data.data