Update README.md
README.md CHANGED
@@ -34,4 +34,50 @@ The model can synthesize speech up to **90 minutes** long with up to **4 distinc
| VibeVoice-7B | 32K | ~45 min | [HF link](https://huggingface.co/microsoft/VibeVoice-7B) |
| VibeVoice-AcousticTokenizer | - | - | [HF link](https://huggingface.co/microsoft/VibeVoice-AcousticTokenizer) |
| VibeVoice-SemanticTokenizer | - | - | This model |

# Usage

Below is an example of encoding audio to extract semantic features:
```python
import torch
from transformers import AutoFeatureExtractor, VibeVoiceSemanticTokenizerModel
from transformers.audio_utils import load_audio_librosa

model_id = "bezzam/VibeVoice-SemanticTokenizer"
sampling_rate = 24000

# load audio
audio = load_audio_librosa(
    "https://hf.co/datasets/bezzam/vibevoice_samples/resolve/main/voices/en-Alice_woman.wav",
    sampling_rate=sampling_rate,
)

# load model
device = "cuda" if torch.cuda.is_available() else "cpu"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = VibeVoiceSemanticTokenizerModel.from_pretrained(
    model_id,
    device_map=device,
).eval()

# preprocess audio
inputs = feature_extractor(
    audio,
    sampling_rate=sampling_rate,
    padding=True,
    pad_to_multiple_of=3200,
    return_attention_mask=False,
    return_tensors="pt",
).to(device)
print("Input audio shape:", inputs.input_features.shape)
# Input audio shape: torch.Size([1, 1, 224000])

# encode
with torch.no_grad():
    encoded_outputs = model.encode(inputs.input_features)
print("Latent shape:", encoded_outputs.latents.shape)
# Latent shape: torch.Size([1, 70, 128])
```
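The latent length follows from the padding: the tokenizer appears to produce one frame per 3200 input samples (inferred from `pad_to_multiple_of=3200` and the printed shapes above, not stated explicitly here), so the 224,000-sample padded input yields 70 frames of 128-dimensional features, i.e. 7.5 frames per second at 24 kHz. A minimal sanity-check sketch under that assumption:

```python
# Assumed relationship (inferred from the shapes above, not documented here):
# one latent frame per 3200 input samples, so latent length = padded samples / 3200.
hop = 3200  # assumed downsampling factor
num_samples = inputs.input_features.shape[-1]   # 224000 after padding
expected_frames = num_samples // hop            # 224000 / 3200 = 70
assert encoded_outputs.latents.shape[1] == expected_frames
print(f"{expected_frames} frames at {sampling_rate / hop} Hz")
# 70 frames at 7.5 Hz
```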