malek-messaoudii committed on
Commit
73d4f3c
·
1 Parent(s): 8d87b19

add gemini api

Browse files
config.py CHANGED
@@ -16,7 +16,7 @@ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "")
16
 
17
  HUGGINGFACE_STANCE_MODEL_ID = os.getenv("HUGGINGFACE_STANCE_MODEL_ID")
18
  HUGGINGFACE_LABEL_MODEL_ID = os.getenv("HUGGINGFACE_LABEL_MODEL_ID")
19
- HUGGINGFACE_STT_MODEL_ID = os.getenv("HUGGINGFACE_STT_MODEL_ID", "openai/whisper-large-v3")
20
 
21
  # Use Hugging Face model ID instead of local path
22
  STANCE_MODEL_ID = HUGGINGFACE_STANCE_MODEL_ID
 
16
 
17
  HUGGINGFACE_STANCE_MODEL_ID = os.getenv("HUGGINGFACE_STANCE_MODEL_ID")
18
  HUGGINGFACE_LABEL_MODEL_ID = os.getenv("HUGGINGFACE_LABEL_MODEL_ID")
19
# Optional HF STT model id. The whisper-large-v3 default was dropped in this
# commit because speech-to-text now goes through Gemini (services/stt_service.py);
# this value may be None when the env var is unset.
HUGGINGFACE_STT_MODEL_ID = os.getenv("HUGGINGFACE_STT_MODEL_ID")
20
 
21
  # Use Hugging Face model ID instead of local path
22
  STANCE_MODEL_ID = HUGGINGFACE_STANCE_MODEL_ID
routes/audio.py CHANGED
@@ -1,84 +1,25 @@
1
- """Speech-to-Text & Text-to-Speech API Endpoints"""
2
-
3
- from fastapi import APIRouter, UploadFile, File, HTTPException
4
- from fastapi.responses import StreamingResponse
5
- import logging
6
-
7
- from models.audio import (
8
- STTResponse,
9
- TTSRequest,
10
- TTSResponse
11
- )
12
-
13
- from services.stt_service import transcribe_audio
14
- from services.tts_service import text_to_speech
15
 
16
  router = APIRouter(prefix="/audio", tags=["Audio"])
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- # ============================================================
21
- # SPEECH TO TEXT (Whisper)
22
- # ============================================================
23
-
24
- @router.post("/speech-to-text", response_model=STTResponse)
25
- async def speech_to_text_endpoint(file: UploadFile = File(...)):
26
- """
27
- Convert speech to text using openai/whisper-large-v3.
28
-
29
- - Upload an audio file (wav, mp3, m4a…)
30
- - Returns transcribed English text
31
- """
32
- try:
33
- audio_bytes = await file.read()
34
-
35
- result = transcribe_audio(audio_bytes)
36
-
37
- response_data = STTResponse(
38
- text=result,
39
- model_name="openai/whisper-large-v3",
40
- language="en",
41
- duration_seconds=None # optional filler
42
- )
43
-
44
- logger.info(f"STT completed: {response_data.text[:40]}...")
45
- return response_data
46
-
47
- except Exception as e:
48
- logger.error(f"STT error: {str(e)}")
49
- raise HTTPException(status_code=500, detail=f"Speech-to-text failed: {str(e)}")
50
-
51
 
52
- # ============================================================
53
- # TEXT TO SPEECH (Bark)
54
- # ============================================================
55
 
56
- @router.post("/text-to-speech", response_model=TTSResponse)
57
- async def text_to_speech_endpoint(request: TTSRequest):
58
- """
59
- Convert text to synthesized speech using Bark.
60
- Returns streamed audio.
61
- """
62
- try:
63
- audio_bytes = text_to_speech(request.text)
64
 
65
- metadata = TTSResponse(
66
- message="Audio generated successfully",
67
- audio_format="wav",
68
- length_seconds=None,
69
- model_name="suno/bark"
70
- )
71
 
72
- logger.info(f"TTS generated for text: {request.text[:40]}...")
73
 
74
- return StreamingResponse(
75
- iter([audio_bytes]),
76
- media_type="audio/wav",
77
- headers={
78
- "X-Audio-Metadata": metadata.model_dump_json()
79
- }
80
- )
81
 
82
- except Exception as e:
83
- logger.error(f"TTS error: {str(e)}")
84
- raise HTTPException(status_code=500, detail=f"Text-to-speech failed: {str(e)}")
 
 
 
1
import uuid

from fastapi import APIRouter, UploadFile, File
from fastapi.responses import FileResponse, Response

from services.stt_service import speech_to_text
from services.tts_service import generate_tts
 
 
 
 
 
 
 
 
 
6
 
7
# Shared router for the audio endpoints; mounted under the /audio prefix.
router = APIRouter(prefix="/audio", tags=["Audio"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
 
9
 
10
@router.post("/tts")
async def tts(text: str):
    """Synthesize speech for ``text`` and return it as a WAV download.

    Args:
        text: The text to speak (query/body parameter).

    Returns:
        A ``Response`` carrying the WAV bytes with ``audio/wav`` media type.
    """
    audio_bytes = await generate_tts(text)

    # The original wrote tts_<uuid>.wav into the working directory and never
    # deleted it, leaking one file per request. Stream the bytes straight
    # back instead; the uuid name survives only as the suggested filename.
    filename = f"tts_{uuid.uuid4()}.wav"
    return Response(
        content=audio_bytes,
        media_type="audio/wav",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
19
 
 
 
 
 
 
 
 
20
 
21
@router.post("/stt")
async def stt(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return ``{"text": ...}``."""
    payload = await file.read()
    transcript = await speech_to_text(payload)
    return {"text": transcript}
services/__init__.py CHANGED
@@ -3,9 +3,19 @@
3
  from .stance_model_manager import StanceModelManager, stance_model_manager
4
  from .label_model_manager import KpaModelManager, kpa_model_manager
5
 
 
 
 
 
 
6
  __all__ = [
7
  "StanceModelManager",
8
  "stance_model_manager",
9
  "KpaModelManager",
10
  "kpa_model_manager",
 
 
 
 
 
11
  ]
 
3
from .stance_model_manager import StanceModelManager, stance_model_manager
from .label_model_manager import KpaModelManager, kpa_model_manager

# Audio and Gemini helpers.
from .gemini_client import get_gemini_client
from .stt_service import speech_to_text
from .tts_service import generate_tts

__all__ = [
    "StanceModelManager",
    "stance_model_manager",
    "KpaModelManager",
    "kpa_model_manager",
    "speech_to_text",
    "generate_tts",
    "get_gemini_client",
]
services/gemini_client.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
from google import genai

# One shared client for the whole process. genai.Client() reads the API key
# from the environment (GEMINI_API_KEY / GOOGLE_API_KEY).
client = genai.Client()


def get_gemini_client():
    """Return the shared google-genai client.

    services/__init__.py, services/stt_service.py and services/tts_service.py
    all import this accessor, but the module as committed only defined the
    bare ``client`` attribute — every one of those imports raised
    ImportError at startup. This function is the missing definition.
    """
    return client
services/stt_service.py CHANGED
@@ -1,24 +1,18 @@
1
- # services/stt_service.py
2
 
3
- import requests
4
- from config import HUGGINGFACE_API_KEY, HUGGINGFACE_STT_MODEL_ID
5
 
6
- def transcribe_audio(file_bytes: bytes) -> str:
7
- """
8
- Convert audio bytes into English text using Whisper large-v3
9
- through Hugging Face Inference API.
10
- """
11
- headers = {
12
- "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
13
- "Content-Type": "application/octet-stream",
14
- }
15
 
16
- url = f"https://api-inference.huggingface.co/models/{HUGGINGFACE_STT_MODEL_ID}"
 
 
 
 
 
 
 
 
17
 
18
- response = requests.post(url, headers=headers, data=file_bytes)
19
-
20
- try:
21
- result = response.json()
22
- return result.get("text", "Error: No transcription returned.")
23
- except Exception:
24
- return "Error: Invalid response from STT model."
 
1
from google.genai import types

from services.gemini_client import get_gemini_client


async def speech_to_text(audio_bytes: bytes) -> str:
    """Transcribe spoken audio to text with Gemini.

    Args:
        audio_bytes: Raw audio data. NOTE(review): sent as ``audio/wav`` —
            callers upload arbitrary formats (mp3, m4a, ...), so the mime
            type should be detected or passed in; confirm with the caller.

    Returns:
        The transcript text, or "" when the model returns no text parts.
    """
    client = get_gemini_client()

    # The original passed a bare {"mime_type", "data"} dict, which is not a
    # valid Part for the google-genai SDK, and gave the model no instruction
    # to transcribe. Use an explicit prompt plus Part.from_bytes, and the
    # SDK's async surface so the event loop is not blocked during the call.
    response = await client.aio.models.generate_content(
        model="gemini-2.5-flash",
        contents=[
            "Transcribe this audio verbatim. Return only the transcript text.",
            types.Part.from_bytes(data=audio_bytes, mime_type="audio/wav"),
        ],
    )

    # response.text is None when the candidate has no text parts.
    return response.text or ""
 
 
 
 
 
services/tts_service.py CHANGED
@@ -1,28 +1,33 @@
1
- import requests
2
- from config import HUGGINGFACE_API_KEY
 
3
 
4
- # Bark model
5
- BARK_MODEL_ID = "suno/bark"
6
 
7
- def text_to_speech(text: str) -> bytes:
8
- """
9
- Convert text to speech (audio bytes) using Hugging Face Bark model.
10
- """
11
- url = f"https://api-inference.huggingface.co/models/{BARK_MODEL_ID}"
 
12
 
13
- headers = {
14
- "Authorization": f"Bearer {HUGGINGFACE_API_KEY}",
15
- "Content-Type": "application/json"
16
- }
17
 
18
- payload = {
19
- "inputs": text
20
- }
21
 
22
- response = requests.post(url, headers=headers, json=payload)
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Bark returns raw WAV bytes
25
- if response.status_code != 200:
26
- raise Exception(f"Bark API error: {response.text}")
27
-
28
- return response.content
 
1
import io
import wave

from google.genai import types

from services.gemini_client import get_gemini_client
4
 
 
 
5
 
6
def save_wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Persist raw PCM samples to *filename* as a WAV file.

    The defaults (mono, 24 kHz, 16-bit) match the PCM stream the Gemini TTS
    model emits.
    """
    with wave.open(filename, "wb") as wav_out:
        # nframes=0 is fine here: writeframes() patches the header on close.
        wav_out.setparams((channels, sample_width, rate, 0, "NONE", "not compressed"))
        wav_out.writeframes(pcm)
12
 
 
 
 
 
13
 
14
async def generate_tts(text: str) -> bytes:
    """Synthesize ``text`` with Gemini TTS and return WAV-encoded audio bytes.

    Raises whatever the google-genai SDK raises on API failure; callers are
    expected to surface that as an HTTP error.
    """
    client = get_gemini_client()

    # Use the SDK's async surface: the original issued a blocking call from
    # inside an async def, stalling the event loop for the whole request.
    response = await client.aio.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name="Kore"
                    )
                )
            ),
        ),
    )

    pcm = response.candidates[0].content.parts[0].inline_data.data

    # The model returns raw 16-bit mono PCM at 24 kHz (the parameters
    # save_wave_file defaults to), not a WAV container. The original handed
    # the bare PCM back, so callers served headerless ".wav" files that most
    # players reject. Wrap it in a WAV header before returning.
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)
        wf.writeframes(pcm)
    return buf.getvalue()