#!/usr/bin/env python3
"""Smoke test for CosyVoice text-to-speech.

Waits for the CosyVoice-300M model files to appear on disk, then runs
zero-shot synthesis in Portuguese, Chinese and English and, when the SFT
model is present, one SFT synthesis with the first available speaker.
Generated audio is written to ``output_*.wav`` in the current directory.

Paths are relative, so the script is expected to be run from the CosyVoice
repository root. The process exits with status 1 when the model files never
show up or when any test step raises.
"""
import os
import sys
import time
import traceback

MATCHA_TTS_PATH = 'third_party/Matcha-TTS'
ZERO_SHOT_MODEL_DIR = 'pretrained_models/CosyVoice-300M'
SFT_MODEL_DIR = 'pretrained_models/CosyVoice-300M-SFT'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'

# Files that must exist before the zero-shot model is loaded.
REQUIRED_FILES = [
    'pretrained_models/CosyVoice-300M/flow.pt',
    'pretrained_models/CosyVoice-300M/speech_tokenizer_v1.onnx',
    'pretrained_models/CosyVoice-300M/campplus.onnx',
]

MAX_WAIT_SECONDS = 300  # 5 minutes
POLL_INTERVAL_SECONDS = 10

# (output file stem, language label, text to synthesize, prompt text)
# NOTE(review): the prompt texts below do not look like transcripts of
# asset/zero_shot_prompt.wav; zero-shot cloning presumably expects the
# transcript of the prompt audio here -- confirm against the CosyVoice docs.
ZERO_SHOT_CASES = [
    (
        "portuguese",
        "Portuguese",
        "Olá, este é um teste de síntese de voz em português brasileiro.",
        "Testando a conversão de texto para fala.",
    ),
    (
        "chinese",
        "Chinese",
        "你好,我是通义生成式语音大模型,很高兴为您服务。",
        "希望这个测试能够成功。",
    ),
    (
        "english",
        "English",
        "Hello, this is a test of the CosyVoice text-to-speech system. It supports multiple languages.",
        "Testing voice synthesis.",
    ),
]


def wait_for_model_files(
    paths: list[str],
    max_wait: float = MAX_WAIT_SECONDS,
    poll_interval: float = POLL_INTERVAL_SECONDS,
) -> list[str]:
    """Poll until every path in *paths* exists or *max_wait* seconds pass.

    Returns an empty list when all files were found, otherwise the list of
    paths still missing at the time of the timeout.
    """
    print("Checking for required model files...")
    # Monotonic clock: elapsed time must not jump if the wall clock is adjusted.
    started = time.monotonic()
    while True:
        missing = [path for path in paths if not os.path.exists(path)]
        if not missing:
            print("All required files found!")
            return []

        elapsed = time.monotonic() - started
        if elapsed > max_wait:
            print(f"Timeout waiting for files. Missing: {missing}")
            return missing

        print(f"Waiting for downloads to complete... ({int(elapsed)}s elapsed)")
        time.sleep(poll_interval)


def save_speech(results, stem: str, sample_rate: int) -> list[str]:
    """Write each synthesized chunk in *results* to ``output_<stem>_<i>.wav``.

    *results* is an iterable of dicts carrying the audio under the
    ``'tts_speech'`` key. Returns the list of file names written.
    """
    # Imported lazily so a missing torchaudio is reported by main()'s handler.
    import torchaudio

    written = []
    for index, chunk in enumerate(results):
        output_file = f'output_{stem}_{index}.wav'
        torchaudio.save(output_file, chunk['tts_speech'], sample_rate)
        print(f"✓ Saved: {output_file}")
        written.append(output_file)
    return written


def run_zero_shot_tests() -> None:
    """Load CosyVoice-300M and synthesize every entry of ZERO_SHOT_CASES."""
    from cosyvoice.cli.cosyvoice import CosyVoice
    from cosyvoice.utils.file_utils import load_wav

    print("\n1. Testing Zero-Shot Voice Cloning")
    print("-" * 40)

    cosyvoice = CosyVoice(ZERO_SHOT_MODEL_DIR, load_jit=False, load_trt=False, fp16=False)

    if not os.path.exists(PROMPT_WAV):
        # Say so explicitly instead of announcing generations that never run.
        print(f"Prompt audio '{PROMPT_WAV}' not found; skipping zero-shot tests.")
        return

    prompt_speech_16k = load_wav(PROMPT_WAV, 16000)
    for position, (stem, language, text, prompt_text) in enumerate(ZERO_SHOT_CASES):
        # Every announcement but the first is preceded by a blank line.
        separator = "" if position == 0 else "\n"
        print(f"{separator}Generating {language} speech: '{text}'")
        results = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech_16k, stream=False)
        save_speech(results, stem, cosyvoice.sample_rate)


def run_sft_test() -> None:
    """Synthesize one sentence with the SFT model, if it is installed.

    Failures here are reported but not propagated: the SFT model is optional
    and may still be downloading.
    """
    from cosyvoice.cli.cosyvoice import CosyVoice

    print("\n2. Testing SFT Mode (if available)")
    print("-" * 40)

    if not os.path.exists(os.path.join(SFT_MODEL_DIR, 'cosyvoice.yaml')):
        print("SFT model not found; skipping.")
        return

    try:
        cosyvoice_sft = CosyVoice(SFT_MODEL_DIR, load_jit=False, load_trt=False, fp16=False)

        speakers = cosyvoice_sft.list_available_spks()
        print(f"Available speakers: {speakers[:5]}...")  # Show first 5

        if speakers:
            text = "Este é um teste usando o modelo SFT com falantes pré-definidos."
            speaker = speakers[0]
            print(f"\nGenerating with speaker '{speaker}': '{text}'")
            results = cosyvoice_sft.inference_sft(text, speaker, stream=False)
            save_speech(results, "sft", cosyvoice_sft.sample_rate)
    except Exception as e:
        print(f"SFT model not ready yet: {e}")


def list_generated_files(directory: str = '.') -> None:
    """Print every ``output_*.wav`` file in *directory* with its size in KB."""
    print("\nGenerated audio files:")
    # Sorted so the listing is stable across runs and file systems.
    for name in sorted(os.listdir(directory)):
        if name.startswith('output_') and name.endswith('.wav'):
            size = os.path.getsize(os.path.join(directory, name)) / 1024
            print(f"  - {name} ({size:.1f} KB)")


def main() -> None:
    """Run the full smoke test; exit with status 1 on timeout or error."""
    print("=== CosyVoice Text-to-Speech Test ===\n")

    # Add Matcha-TTS to path
    sys.path.append(MATCHA_TTS_PATH)

    if wait_for_model_files(REQUIRED_FILES):
        sys.exit(1)

    try:
        run_zero_shot_tests()
        run_sft_test()

        print("\n=== Test completed successfully! ===")
        list_generated_files()
    except Exception as e:
        print(f"\nError during test: {e}")
        traceback.print_exc()
        # Surface the failure to the caller (shell, CI) via the exit status.
        sys.exit(1)


if __name__ == "__main__":
    main()