malek-messaoudii
committed on
Commit
·
91b1985
1
Parent(s):
73d4f3c
update env
Browse files
- requirements.txt +1 -0
- services/gemini_client.py +8 -2
- services/stt_service.py +2 -11
- services/tts_service.py +3 -18
requirements.txt
CHANGED
|
@@ -8,3 +8,4 @@ accelerate>=0.24.0
|
|
| 8 |
protobuf>=3.20.0
|
| 9 |
huggingface_hub>=0.19.0
|
| 10 |
python-multipart
|
|
|
|
|
|
| 8 |
protobuf>=3.20.0
|
| 9 |
huggingface_hub>=0.19.0
|
| 10 |
python-multipart
|
| 11 |
+
google-genai>=0.4.0
|
services/gemini_client.py
CHANGED
|
@@ -1,3 +1,9 @@
|
|
| 1 |
-
from google import
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from google.genai import Client, types
|
| 2 |
+
import os
|
| 3 |
|
| 4 |
+
def get_gemini_client() -> Client:
|
| 5 |
+
"""
|
| 6 |
+
Returns a singleton Gemini AI client.
|
| 7 |
+
Can be extended to read API key from environment variables.
|
| 8 |
+
"""
|
| 9 |
+
return Client()
|
services/stt_service.py
CHANGED
|
@@ -1,18 +1,9 @@
|
|
| 1 |
from services.gemini_client import get_gemini_client
|
| 2 |
|
| 3 |
-
|
| 4 |
async def speech_to_text(audio_bytes: bytes) -> str:
|
| 5 |
client = get_gemini_client()
|
| 6 |
-
|
| 7 |
response = client.models.generate_content(
|
| 8 |
model="gemini-2.5-flash",
|
| 9 |
-
contents=[
|
| 10 |
-
{
|
| 11 |
-
"mime_type": "audio/wav",
|
| 12 |
-
"data": audio_bytes
|
| 13 |
-
}
|
| 14 |
-
]
|
| 15 |
)
|
| 16 |
-
|
| 17 |
-
text = response.text
|
| 18 |
-
return text
|
|
|
|
| 1 |
from services.gemini_client import get_gemini_client
|
| 2 |
|
|
|
|
| 3 |
async def speech_to_text(audio_bytes: bytes) -> str:
|
| 4 |
client = get_gemini_client()
|
|
|
|
| 5 |
response = client.models.generate_content(
|
| 6 |
model="gemini-2.5-flash",
|
| 7 |
+
contents=[{"mime_type": "audio/wav", "data": audio_bytes}],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
)
|
| 9 |
+
return response.text
|
|
|
|
|
|
services/tts_service.py
CHANGED
|
@@ -1,19 +1,8 @@
|
|
| 1 |
-
from google.genai import types
|
| 2 |
from services.gemini_client import get_gemini_client
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
def save_wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
|
| 7 |
-
with wave.open(filename, "wb") as wf:
|
| 8 |
-
wf.setnchannels(channels)
|
| 9 |
-
wf.setsampwidth(sample_width)
|
| 10 |
-
wf.setframerate(rate)
|
| 11 |
-
wf.writeframes(pcm)
|
| 12 |
-
|
| 13 |
|
| 14 |
async def generate_tts(text: str) -> bytes:
|
| 15 |
client = get_gemini_client()
|
| 16 |
-
|
| 17 |
response = client.models.generate_content(
|
| 18 |
model="gemini-2.5-flash-preview-tts",
|
| 19 |
contents=text,
|
|
@@ -21,13 +10,9 @@ async def generate_tts(text: str) -> bytes:
|
|
| 21 |
response_modalities=["AUDIO"],
|
| 22 |
speech_config=types.SpeechConfig(
|
| 23 |
voice_config=types.VoiceConfig(
|
| 24 |
-
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
| 25 |
-
voice_name="Kore"
|
| 26 |
-
)
|
| 27 |
)
|
| 28 |
),
|
| 29 |
),
|
| 30 |
)
|
| 31 |
-
|
| 32 |
-
audio_bytes = response.candidates[0].content.parts[0].inline_data.data
|
| 33 |
-
return audio_bytes
|
|
|
|
|
|
|
| 1 |
from services.gemini_client import get_gemini_client
|
| 2 |
+
from google.genai import types
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
async def generate_tts(text: str) -> bytes:
|
| 5 |
client = get_gemini_client()
|
|
|
|
| 6 |
response = client.models.generate_content(
|
| 7 |
model="gemini-2.5-flash-preview-tts",
|
| 8 |
contents=text,
|
|
|
|
| 10 |
response_modalities=["AUDIO"],
|
| 11 |
speech_config=types.SpeechConfig(
|
| 12 |
voice_config=types.VoiceConfig(
|
| 13 |
+
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore")
|
|
|
|
|
|
|
| 14 |
)
|
| 15 |
),
|
| 16 |
),
|
| 17 |
)
|
| 18 |
+
return response.candidates[0].content.parts[0].inline_data.data
|
|
|
|
|
|