"""Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints""" |
|
|
|
|
|
from pydantic import BaseModel, Field, ConfigDict |
|
|
from typing import Optional |
|
|
|
|
|
|
|
|
class MyModel(BaseModel): |
|
|
model_loaded: bool |
|
|
model_name: str |
|
|
|
|
|
model_config = { |
|
|
"protected_namespaces": () |
|
|
} |
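
# A brief illustration with assumed values: without protected_namespaces=()
# above, Pydantic v2 emits a UserWarning that "model_loaded" and "model_name"
# shadow its protected "model_" namespace.
#
#     status = MyModel(model_loaded=True, model_name="openai/whisper-large-v3")
#     status.model_dump()
#     # -> {'model_loaded': True, 'model_name': 'openai/whisper-large-v3'}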
|
|
class STTResponse(BaseModel):
    """Response model for Whisper speech-to-text transcription."""

    model_config = ConfigDict(
        # model_name also collides with the protected "model_" namespace.
        protected_namespaces=(),
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2,
            }
        },
    )

    text: str = Field(..., description="Transcribed text from the input audio")
    model_name: str = Field(..., description="STT model used for inference")
    language: Optional[str] = Field(None, description="Detected language")
    duration_seconds: Optional[float] = Field(
        None,
        description="Approximate audio duration in seconds",
    )
|
|
class TTSRequest(BaseModel):
    """Text input for TTS conversion."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "Hello, welcome to our AI system."
            }
        }
    )

    text: str = Field(
        ...,
        min_length=1,
        max_length=500,
        description="Text that will be converted into speech",
    )
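
# Sketch of the length bounds with hypothetical inputs: values outside
# 1..500 characters are rejected with a ValidationError.
#
#     TTSRequest(text="")          # ValidationError: string too short
#     TTSRequest(text="x" * 501)   # ValidationError: string too long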
|
|
class TTSResponse(BaseModel):
    """Metadata response for TTS generation."""

    model_config = ConfigDict(
        # model_name collides with the protected "model_" namespace as well.
        protected_namespaces=(),
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark",
            }
        },
    )

    message: str = Field(..., description="Human-readable status message")
    audio_format: str = Field(..., description="Audio container format, e.g. 'wav'")
    length_seconds: Optional[float] = Field(
        None, description="Generated audio length in seconds"
    )
    model_name: str = Field(..., description="TTS model used for generation")
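
# ---------------------------------------------------------------------------
# Minimal usage sketch when the module is run directly. The values below are
# illustrative assumptions taken from the schema examples, not real output.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    stt = STTResponse(
        text="hello how are you",
        model_name="openai/whisper-large-v3",
        language="en",
        duration_seconds=3.2,
    )
    print(stt.model_dump())

    req = TTSRequest(text="Hello, welcome to our AI system.")
    print(req.model_dump_json())

    tts = TTSResponse(
        message="Audio generated successfully",
        audio_format="wav",
        length_seconds=2.5,
        model_name="suno/bark",
    )
    print(tts.model_dump())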