| import logging |
| from typing import Generator |
|
|
| import numpy as np |
|
|
| from modules.api.impl.handler.AudioHandler import AudioHandler |
| from modules.api.impl.model.audio_model import AdjustConfig |
| from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig |
| from modules.api.impl.model.enhancer_model import EnhancerConfig |
| from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full |
| from modules.normalization import text_normalize |
| from modules.speaker import Speaker |
| from modules.synthesize_audio import synthesize_audio |
| from modules.synthesize_stream import synthesize_stream |
| from modules.utils.audio import apply_normalize, apply_prosody_to_audio_data |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class TTSHandler(AudioHandler):
    """Audio handler that synthesizes speech for a single piece of text.

    The handler stores the text, the speaker and four configuration objects
    and offers two ways of producing audio:

    * ``enqueue`` synthesizes the whole text, optionally runs the enhancer,
      then applies the prosody adjustment and optional normalization.
    * ``enqueue_stream`` yields audio chunk by chunk and applies the prosody
      adjustment and optional normalization to every chunk; the enhancer is
      not applied in this mode.
    """

    def __init__(
        self,
        text_content: str,
        spk: Speaker,
        tts_config: ChatTTSConfig,
        infer_config: InferConfig,
        adjust_config: AdjustConfig,
        enhancer_config: EnhancerConfig,
    ):
        """Type-check and store the synthesis inputs, then call ``validate``.

        Args:
            text_content: Text to synthesize; it is passed through
                ``text_normalize`` before synthesis.
            spk: Speaker handed to the synthesizer.
            tts_config: Supplies ``temperature``, ``top_p``, ``top_k``,
                ``prompt1``, ``prompt2`` and ``prefix``.
            infer_config: Supplies ``seed``, ``batch_size``,
                ``spliter_threshold`` and ``eos``.
            adjust_config: Supplies ``speed_rate``, ``pitch``,
                ``volume_gain_db``, ``normalize`` and ``headroom``.
            enhancer_config: Supplies ``enabled``, ``nfe``, ``solver``,
                ``lambd`` and ``tau``.

        Raises:
            AssertionError: If an argument is not of the expected type.
        """
        # NOTE: ``assert`` is kept so callers keep seeing AssertionError, but
        # these checks are skipped when Python runs with ``-O``.
        assert isinstance(text_content, str), "text_content should be str"
        assert isinstance(spk, Speaker), "spk should be Speaker"
        assert isinstance(
            tts_config, ChatTTSConfig
        ), "tts_config should be ChatTTSConfig"
        assert isinstance(
            infer_config, InferConfig
        ), "infer_config should be InferConfig"
        assert isinstance(
            adjust_config, AdjustConfig
        ), "adjust_config should be AdjustConfig"
        assert isinstance(
            enhancer_config, EnhancerConfig
        ), "enhancer_config should be EnhancerConfig"

        self.text_content = text_content
        self.spk = spk
        self.tts_config = tts_config
        self.infer_config = infer_config
        # NOTE: the attribute name is misspelled ("adjest"); it is kept as-is
        # because it is visible to code outside this class.
        self.adjest_config = adjust_config
        self.enhancer_config = enhancer_config

        self.validate()

    def validate(self):
        """Validation hook called at the end of ``__init__``; currently a no-op."""
        pass

    def _synthesis_kwargs(self) -> dict:
        """Return the synthesizer keyword arguments shared by both entry points."""
        tts_config = self.tts_config
        infer_config = self.infer_config
        return {
            "spk": self.spk,
            "temperature": tts_config.temperature,
            "top_P": tts_config.top_p,
            "top_K": tts_config.top_k,
            "prompt1": tts_config.prompt1,
            "prompt2": tts_config.prompt2,
            "prefix": tts_config.prefix,
            "infer_seed": infer_config.seed,
            "spliter_threshold": infer_config.spliter_threshold,
            "end_of_sentence": infer_config.eos,
        }

    def _apply_adjustments(
        self, audio_data: np.ndarray, sample_rate: int
    ) -> tuple[np.ndarray, int]:
        """Apply the prosody adjustment and optional normalization to one buffer."""
        adjust_config = self.adjest_config

        audio_data = apply_prosody_to_audio_data(
            audio_data=audio_data,
            rate=adjust_config.speed_rate,
            pitch=adjust_config.pitch,
            volume=adjust_config.volume_gain_db,
            sr=sample_rate,
        )

        if adjust_config.normalize:
            # apply_normalize hands back (sample_rate, audio), in that order.
            sample_rate, audio_data = apply_normalize(
                audio_data=audio_data,
                headroom=adjust_config.headroom,
                sr=sample_rate,
            )

        return audio_data, sample_rate

    def enqueue(self) -> tuple[np.ndarray, int]:
        """Synthesize the whole text and return ``(audio_data, sample_rate)``.

        Steps, in order: normalize the text, synthesize it, run the enhancer
        when ``enhancer_config.enabled`` is set, then apply the prosody
        adjustment and optional normalization.
        """
        text = text_normalize(self.text_content)
        enhancer_config = self.enhancer_config

        # Batch synthesis takes one extra argument on top of the shared ones.
        sample_rate, audio_data = synthesize_audio(
            text,
            batch_size=self.infer_config.batch_size,
            **self._synthesis_kwargs(),
        )

        if enhancer_config.enabled:
            # The enhancer hands back (audio, sample_rate), in that order.
            audio_data, sample_rate = apply_audio_enhance_full(
                audio_data=audio_data,
                sr=sample_rate,
                nfe=enhancer_config.nfe,
                solver=enhancer_config.solver,
                lambd=enhancer_config.lambd,
                tau=enhancer_config.tau,
            )

        return self._apply_adjustments(audio_data, sample_rate)

    def enqueue_stream(self) -> Generator[tuple[np.ndarray, int], None, None]:
        """Yield ``(audio_chunk, sample_rate)`` pairs as they are synthesized.

        Every chunk gets the prosody adjustment and optional normalization.
        The enhancer is not applied here; a warning is logged if it is
        enabled. As this is a generator, nothing (including that warning)
        runs until iteration starts.
        """
        text = text_normalize(self.text_content)

        if self.enhancer_config.enabled:
            logger.warning(
                "enhancer_config is enabled, but it is not supported in stream mode"
            )

        chunks = synthesize_stream(text, **self._synthesis_kwargs())

        for sr, wav in chunks:
            yield self._apply_adjustments(wav, sr)
|
|