"""Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints"""
from pydantic import BaseModel, Field, ConfigDict
from typing import Optional
class MyModel(BaseModel):
model_loaded: bool
model_name: str
model_config = {
"protected_namespaces": ()
}
# ================================
# SPEECH TO TEXT
# ================================
class STTResponse(BaseModel):
    """Response model for Whisper speech → text"""
    model_config = ConfigDict(
        # "model_name" also falls in the protected "model_" namespace,
        # so clear it here as well.
        protected_namespaces=(),
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2
            }
        }
    )

    text: str = Field(..., description="Transcribed text from the input audio")
    model_name: str = Field(..., description="STT model used for inference")
    language: Optional[str] = Field(None, description="Detected language")
    duration_seconds: Optional[float] = Field(
        None,
        description="Approximate audio duration in seconds"
    )

# ================================
# TEXT TO SPEECH
# ================================
class TTSRequest(BaseModel):
    """Text input for TTS conversion"""
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "Hello, welcome to our AI system."
            }
        }
    )

    text: str = Field(
        ..., min_length=1, max_length=500,
        description="Text that will be converted into speech"
    )

class TTSResponse(BaseModel):
    """Metadata response for TTS generation"""
    model_config = ConfigDict(
        protected_namespaces=(),
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark"
            }
        }
    )

    message: str
    audio_format: str
    length_seconds: Optional[float] = None
    model_name: str
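

# ================================
# USAGE SKETCH (illustrative)
# ================================
# A minimal sketch of how these schemas might be exercised in a quick local
# check, not part of the API itself. The values below are taken from the
# json_schema_extra examples above; any resemblance to real endpoint output
# is assumed, not guaranteed.
if __name__ == "__main__":
    from pydantic import ValidationError

    # Build an STT response the way a Whisper endpoint might return it.
    stt = STTResponse(
        text="hello how are you",
        model_name="openai/whisper-large-v3",
        language="en",
        duration_seconds=3.2,
    )
    print(stt.model_dump_json(indent=2))

    # TTSRequest enforces 1-500 characters, so an empty string is rejected.
    try:
        TTSRequest(text="")
    except ValidationError as exc:
        print(f"Rejected empty TTS text: {len(exc.errors())} validation error(s)")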