File size: 2,055 Bytes
c7fc3b6
 
 
 
 
 
8d87b19
 
 
 
 
 
 
 
 
c7fc3b6
 
 
 
8d87b19
 
c7fc3b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""Pydantic schemas for Speech-to-Text and Text-to-Speech endpoints"""

from pydantic import BaseModel, Field, ConfigDict
from typing import Optional


class MyModel(BaseModel):
    """Schema with a model's name and a flag for whether it is loaded."""

    # Built with ConfigDict, like the other schemas in this module, so the
    # config keys are type-checked. Clearing ``protected_namespaces`` lets the
    # ``model_``-prefixed fields below coexist with the ``model_`` prefix that
    # Pydantic reserves for its own attributes, without a warning.
    model_config = ConfigDict(protected_namespaces=())

    model_loaded: bool
    model_name: str

    
# ================================
# SPEECH TO TEXT
# ================================



class STTResponse(BaseModel):
    """Response model for Whisper speech → text"""
    model_config = ConfigDict(
        # ``model_name`` begins with the ``model_`` prefix that Pydantic
        # reserves for its own attributes; clearing the protected namespaces
        # (as MyModel does) avoids the namespace-conflict warning on versions
        # that still protect that prefix.
        protected_namespaces=(),
        # Sample payload attached to the generated JSON schema.
        json_schema_extra={
            "example": {
                "text": "hello how are you",
                "model_name": "openai/whisper-large-v3",
                "language": "en",
                "duration_seconds": 3.2,
            }
        },
    )

    # Required fields.
    text: str = Field(..., description="Transcribed text from the input audio")
    model_name: str = Field(..., description="STT model used for inference")

    # Optional fields; both default to None when omitted.
    language: Optional[str] = Field(None, description="Detected language")
    duration_seconds: Optional[float] = Field(
        None,
        description="Approximate audio duration in seconds",
    )


# ================================
# TEXT TO SPEECH
# ================================

class TTSRequest(BaseModel):
    """Text input for TTS conversion"""

    # Required; length is validated to be between 1 and 500 characters.
    text: str = Field(
        ...,
        min_length=1,
        max_length=500,
        description="Text that will be converted into speech",
    )

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {"text": "Hello, welcome to our AI system."},
        },
    )


class TTSResponse(BaseModel):
    """Metadata response for TTS generation"""
    model_config = ConfigDict(
        # ``model_name`` begins with the ``model_`` prefix that Pydantic
        # reserves for its own attributes; clearing the protected namespaces
        # (as MyModel does) avoids the namespace-conflict warning on versions
        # that still protect that prefix.
        protected_namespaces=(),
        # Sample payload attached to the generated JSON schema.
        json_schema_extra={
            "example": {
                "message": "Audio generated successfully",
                "audio_format": "wav",
                "length_seconds": 2.5,
                "model_name": "suno/bark",
            }
        },
    )

    # Required fields.
    message: str
    audio_format: str
    # Optional; defaults to None when omitted.
    length_seconds: Optional[float] = None
    # Required, even though it is declared after an optional field.
    model_name: str