|
|
<!DOCTYPE html>
<html lang="en">
<meta charset="utf-8">
<!-- Without a viewport declaration, phones lay the page out on a ~980px virtual
     canvas and the stylesheet's max-width: 720px breakpoint never matches. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>VibeVoice-Realtime TTS Demo</title>
|
|
<style>
  /* ---------- Design tokens ---------- */
  :root {
    --bg: #f5f7fc;
    --surface: #ffffff;
    --accent: #5562ff;
    --accent-strong: #3f4dff;
    --text-primary: #1f2742;
    --text-muted: #5d6789;
    --border: rgba(85, 98, 255, 0.18);
    --shadow: 0 18px 45px rgba(31, 39, 66, 0.08);
  }

  /* Small explanatory note rendered under the text panel. */
  .helper-text {
    font-size: 12px;
    color: #8a93b5;
  }

  * {
    box-sizing: border-box;
  }

  /* ---------- Page shell ---------- */
  body {
    margin: 0;
    background: var(--bg);
    font-family: 'Inter', 'Segoe UI', Roboto, Helvetica, sans-serif;
    color: var(--text-primary);
    display: flex;
    justify-content: center;
    padding: 48px 20px;
  }

  .app-shell {
    width: min(960px, 100%);
    background: var(--surface);
    border-radius: 20px;
    padding: 36px 40px 44px;
    box-shadow: var(--shadow);
    display: flex;
    flex-direction: column;
    gap: 28px;
  }

  h1 {
    margin: 0;
    text-align: center;
    font-size: 30px;
    font-weight: 700;
    letter-spacing: 0.01em;
  }

  /* ---------- Generic panel / field layout ---------- */
  .panel {
    display: flex;
    flex-direction: column;
    gap: 10px;
  }

  .field {
    display: flex;
    flex-direction: column;
    gap: 8px;
  }

  .field-label {
    font-weight: 600;
    font-size: 15px;
    color: var(--text-primary);
  }

  /* ---------- Prompt textarea ---------- */
  .text-input {
    width: 100%;
    min-height: 140px;
    max-height: 240px;
    border: 1px solid rgba(31, 39, 66, 0.14);
    border-radius: 12px;
    padding: 14px 16px;
    font-size: 15px;
    line-height: 1.6;
    font-family: inherit;
    background: #f9faff;
    transition: border-color 0.2s, box-shadow 0.2s;
    resize: vertical;
  }

  .text-input:focus {
    /* Transparent rather than removed: the box-shadow ring below is dropped in
       forced-colors (high contrast) mode, where this outline becomes the
       visible focus indicator instead. */
    outline: 2px solid transparent;
    border-color: var(--accent);
    box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
    background: #fff;
  }

  /* ---------- Streaming preview ---------- */
  #streamingPreviewContainer {
    border-radius: 14px;
    border: 1px solid var(--border);
    background: linear-gradient(135deg, #eef2ff 0%, #f7f9ff 100%);
    padding: 18px 20px;
    box-shadow: inset 0 1px 2px rgba(85, 98, 255, 0.12);
  }

  #streamingPreviewHeader {
    font-weight: 600;
    color: var(--text-primary);
    display: flex;
    align-items: center;
    gap: 10px;
    font-size: 14px;
    margin-bottom: 8px;
  }

  #streamingPreviewNote {
    font-weight: 400;
    font-size: 12px;
    color: var(--text-muted);
  }

  #streamingPreview {
    min-height: 70px;
    padding: 10px 12px;
    border-radius: 10px;
    background: rgba(255, 255, 255, 0.9);
    border: 1px solid rgba(85, 98, 255, 0.25);
    font-family: 'Courier New', Courier, monospace;
    font-size: 14px;
    line-height: 1.5;
    color: var(--text-primary);
    white-space: pre-wrap;
  }

  /* Blinking caret appended while the script is feeding words into the preview. */
  #streamingPreview.streaming-active::after {
    content: "";
    display: inline-block;
    width: 2px;
    height: 1.1em;
    background: var(--accent);
    margin-left: 2px;
    animation: previewCaret 0.9s steps(1) infinite;
    vertical-align: bottom;
  }

  @keyframes previewCaret {
    0%, 50% {
      opacity: 1;
    }
    51%, 100% {
      opacity: 0;
    }
  }

  /* ---------- Controls ---------- */
  .control-panel {
    display: flex;
    flex-direction: column;
    gap: 18px;
  }

  .inline-field {
    display: flex;
    flex-direction: column;
    gap: 6px;
  }

  .select-control {
    width: 220px;
    border: 1px solid rgba(31, 39, 66, 0.14);
    border-radius: 10px;
    padding: 8px 12px;
    font-size: 14px;
    font-family: inherit;
    background: #fbfcff;
    color: var(--text-primary);
    transition: border-color 0.2s, box-shadow 0.2s;
  }

  .select-control:focus {
    /* Same forced-colors consideration as .text-input:focus. */
    outline: 2px solid transparent;
    border-color: var(--accent);
    box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
    background: #fff;
  }

  .control-row {
    display: flex;
    align-items: center;
    flex-wrap: wrap;
    gap: 20px 28px;
  }

  .range-control {
    display: flex;
    align-items: center;
    gap: 12px;
    font-size: 14px;
    color: var(--text-primary);
  }

  .range-control input[type="range"] {
    width: 200px;
    accent-color: var(--accent);
  }

  .range-value {
    font-weight: 600;
    color: var(--text-primary);
    min-width: 42px;
    text-align: right;
  }

  /* ---------- Primary (Start/Stop) button ---------- */
  #playback {
    background: var(--accent);
    color: #fff;
    border: none;
    padding: 10px 24px;
    border-radius: 999px;
    cursor: pointer;
    font-weight: 600;
    font-size: 14px;
    box-shadow: 0 8px 16px rgba(85, 98, 255, 0.25);
    transition: transform 0.15s, box-shadow 0.15s, background 0.15s;
  }

  #playback:hover {
    transform: translateY(-1px);
    box-shadow: 0 10px 20px rgba(85, 98, 255, 0.28);
  }

  #playback:active {
    transform: translateY(0);
  }

  /* Applied by the script while a session is running. */
  #playback.playing {
    background: var(--accent-strong);
  }

  /* ---------- Secondary buttons ---------- */
  .secondary-btn {
    border: 1px solid rgba(31, 39, 66, 0.18);
    background: #f1f3ff;
    color: var(--text-primary);
    padding: 8px 18px;
    border-radius: 999px;
    cursor: pointer;
    font-size: 13px;
    font-weight: 500;
    transition: background 0.15s, border-color 0.15s;
  }

  .secondary-btn:hover {
    background: #e6e9ff;
    border-color: rgba(31, 39, 66, 0.26);
  }

  .secondary-btn:disabled {
    opacity: 0.55;
    cursor: not-allowed;
  }

  /* ---------- Metrics ---------- */
  .metrics {
    display: flex;
    flex-wrap: wrap;
    gap: 16px 32px;
    font-size: 14px;
    color: var(--text-muted);
  }

  .metrics span {
    display: flex;
    align-items: baseline;
    gap: 6px;
  }

  .metrics span strong {
    color: var(--text-primary);
    font-weight: 600;
  }

  .metric-unit {
    color: var(--text-muted);
    font-size: 13px;
  }

  /* ---------- Runtime log ---------- */
  #logOutput {
    max-height: 260px;
    overflow-y: auto;
    background: #f7f9ff;
    color: var(--text-primary);
    padding: 16px 18px;
    border: 1px solid rgba(31, 39, 66, 0.12);
    border-radius: 12px;
    font-size: 13px;
    line-height: 1.6;
    box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.06);
    font-family: 'Fira Code', 'Courier New', Courier, monospace;
    margin-top: 0;
  }

  /* ---------- Narrow screens ---------- */
  @media (max-width: 720px) {
    .app-shell {
      padding: 28px 20px 36px;
      gap: 24px;
    }

    .select-control {
      width: 100%;
    }

    .control-row {
      flex-direction: column;
      align-items: flex-start;
      gap: 16px;
    }

    #playback {
      width: 100%;
      text-align: center;
    }
  }
</style>
|
|
<body> |
|
|
<div class="app-shell"> |
|
|
<!-- Page title -->
<h1>VibeVoice-Realtime TTS Demo</h1>

<!-- Text entry, plus the preview area the script fills token by token while a session runs -->
<section class="panel">
  <label class="field">
    <span class="field-label">Text</span>
    <textarea
      id="prompt"
      class="text-input"
      rows="4"
    >Enter your text here and click "Start" to instantly hear the VibeVoice-Realtime TTS output audio.</textarea>
  </label>

  <div id="streamingPreviewContainer">
    <div id="streamingPreviewHeader">
      <span>Streaming Input Text</span>
    </div>
    <!-- Keep the placeholder on a single line: #streamingPreview is styled with white-space: pre-wrap -->
    <div id="streamingPreview" aria-live="polite">This area will display the streaming input text in real time.</div>
  </div>
</section>

<!-- Usage notes displayed beneath the text panel -->
<span class="helper-text">This demo requires the full text to be provided upfront. The model then receives the text via streaming input during synthesis.<br>
For non-punctuation special characters, applying text normalization before processing often yields better results.</span>
|
|
|
|
|
<!-- Generation controls: voice preset, CFG scale, inference steps, and the Start/Save actions -->
<section class="panel control-panel">
  <div class="inline-field">
    <!-- A real <label for> (instead of a plain <span>) gives the select an accessible
         name and lets a click on the caption focus the control. -->
    <label class="field-label" for="voiceSelect">Speaker</label>
    <select id="voiceSelect" class="select-control">
      <!-- Placeholder option; the script replaces it once /config has been fetched -->
      <option value="">Loading...</option>
    </select>
  </div>

  <div class="control-row">
    <label class="range-control">
      <span>CFG</span>
      <input id="cfgScale" type="range" min="1" max="3" step="0.05" value="1.5">
      <span class="range-value" id="cfgValue">1.5</span>
    </label>
    <label class="range-control">
      <span>Inference Steps</span>
      <input id="inferenceSteps" type="range" min="1" max="20" step="1" value="5">
      <span class="range-value" id="stepsValue">5</span>
    </label>
    <button id="resetControls" type="button" class="secondary-btn">Reset Controls</button>
  </div>

  <div class="control-row">
    <!-- Explicit type: a bare <button> defaults to "submit" if this markup is ever placed inside a form -->
    <button id="playback" type="button">Start</button>
    <button id="saveAudio" type="button" class="secondary-btn" disabled>Save</button>
  </div>
</section>
|
|
|
|
|
<!-- Live counters; the script writes the two <strong> values -->
<section class="panel">
  <div class="metrics">
    <span>Model Generated Audio<strong id="modelGenerated">0.00</strong><span class="metric-unit">s</span></span>
    <span>Audio Played<strong id="playbackElapsed">0.00</strong><span class="metric-unit">s</span></span>
  </div>
</section>

<!-- Log view; the script re-renders the full <pre> text on every new entry -->
<section class="panel">
  <span class="field-label">Runtime Logs</span>
  <pre id="logOutput"></pre>
</section>
|
|
</div> |
|
|
|
|
|
|
|
|
<script> |
|
|
(() => { |
|
|
// --- Audio configuration ---
const SAMPLE_RATE = 24_000; // Hz; used for the AudioContext, the WAV header and elapsed-time math
const BUFFER_SIZE = 2048; // frames handed to each ScriptProcessor callback
const PREBUFFER_SEC = 0.1; // seconds of audio queued before playback is allowed to begin

// --- Live session state ---
let audioCtx = null; // AudioContext for the current session, or null
let scriptNode = null; // ScriptProcessor node feeding the speakers, or null
let socket = null; // WebSocket of the current session, or null
let buffer = new Float32Array(0); // decoded samples waiting to be played
let isPlaying = false; // true between start() and stop()
let hasStartedPlayback = false; // set once the prebuffer threshold has been met
let silentFrameCount = 0; // consecutive all-zero callbacks seen after the socket closed

// --- DOM references ---
const promptInput = document.getElementById('prompt');
const streamingPreview = document.getElementById('streamingPreview');
const controlBtn = document.getElementById('playback');
const cfgSelect = document.getElementById('cfgScale');
const stepsSelect = document.getElementById('inferenceSteps');
const voiceSelect = document.getElementById('voiceSelect');
const cfgValueLabel = document.getElementById('cfgValue');
const stepsValueLabel = document.getElementById('stepsValue');
const modelGeneratedLabel = document.getElementById('modelGenerated');
const playbackElapsedLabel = document.getElementById('playbackElapsed');
const logOutput = document.getElementById('logOutput');
const resetBtn = document.getElementById('resetControls');
const saveBtn = document.getElementById('saveAudio');

// --- Metrics, logging and recording state ---
let playbackTimer = null; // interval id that refreshes the "Audio Played" label
let lastPlaybackElapsed = 0; // last value shown in the "Audio Played" label (seconds)
let playbackSamples = 0; // samples written to the output since playback started
let modelGeneratedTotal = 0; // last value shown in the "Model Generated Audio" label (seconds)
let firstBrowserChunkLogged = false; // ensures the "first chunk" line is logged once per session
let playbackStartedLogged = false; // ensures the "started to play" line is logged once per session
const logEntries = []; // log records currently rendered in #logOutput
let logSequence = 0; // monotonically increasing tie-breaker for log ordering
let recordedChunks = []; // raw ArrayBuffers received over the socket, in arrival order
let recordedSamples = 0; // total 16-bit samples contained in recordedChunks
let recordingComplete = false; // gates the Save button together with recordedSamples
let downloadUrl = null; // object URL of the most recent WAV export, or null
|
|
|
|
|
/** Releases the object URL of the previous WAV export, if one is outstanding. */
const revokeDownloadUrl = () => {
  if (!downloadUrl) {
    return;
  }
  URL.revokeObjectURL(downloadUrl);
  downloadUrl = null;
};
|
|
|
|
|
/** Enables the Save button only when audio has been received and the recording is marked complete. */
const updateSaveButtonState = () => {
  if (saveBtn) {
    const hasAudio = recordedSamples !== 0;
    saveBtn.disabled = !(hasAudio && recordingComplete);
  }
};
|
|
|
|
|
/** Discards the recorded audio and any pending download URL, then refreshes the Save button. */
const clearRecordedChunks = () => {
  recordingComplete = false;
  recordedSamples = 0;
  recordedChunks = [];
  revokeDownloadUrl();
  updateSaveButtonState();
};
|
|
|
|
|
/**
 * Assembles the recorded PCM chunks into a mono, 16-bit WAV file.
 *
 * @returns {Blob|null} An `audio/wav` blob, or null when nothing has been recorded.
 */
const createWavBlob = () => {
  if (!recordedSamples) {
    return null;
  }

  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;
  const dataBytes = recordedSamples * BYTES_PER_SAMPLE;

  const wavBuffer = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const view = new DataView(wavBuffer);
  const writeString = (offset, str) => {
    for (let i = 0; i < str.length; i += 1) {
      view.setUint8(offset + i, str.charCodeAt(i));
    }
  };

  // RIFF container header (every multi-byte field is little-endian).
  writeString(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true); // remaining file size after this field
  writeString(8, 'WAVE');

  // "fmt " sub-chunk: uncompressed PCM, one channel, 16 bits per sample.
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 1, true); // audio format 1 = PCM
  view.setUint16(22, 1, true); // channel count
  view.setUint32(24, SAMPLE_RATE, true);
  view.setUint32(28, SAMPLE_RATE * BYTES_PER_SAMPLE, true); // byte rate
  view.setUint16(32, BYTES_PER_SAMPLE, true); // block align
  view.setUint16(34, 16, true); // bits per sample

  // "data" sub-chunk.
  writeString(36, 'data');
  view.setUint32(40, dataBytes, true);

  // Copy raw bytes instead of going through Int16Array: the chunks are decoded
  // elsewhere as little-endian, so a byte copy keeps that order on any host, and
  // an odd-length chunk no longer throws a RangeError. A trailing odd byte is
  // dropped, matching how recordedSamples counts whole samples only.
  const wavBytes = new Uint8Array(wavBuffer);
  let writeOffset = HEADER_BYTES;
  recordedChunks.forEach(chunk => {
    const usableBytes = chunk.byteLength - (chunk.byteLength % BYTES_PER_SAMPLE);
    wavBytes.set(new Uint8Array(chunk, 0, usableBytes), writeOffset);
    writeOffset += usableBytes;
  });

  return new Blob([wavBuffer], { type: 'audio/wav' });
};
|
|
|
|
|
/** Mirrors the CFG slider position into its label, with three decimals. */
const updateCfgDisplay = () => {
  const cfg = Number(cfgSelect.value);
  cfgValueLabel.textContent = cfg.toFixed(3);
};

/** Mirrors the inference-steps slider position into its label. */
const updateStepsDisplay = () => {
  const steps = Number(stepsSelect.value);
  stepsValueLabel.textContent = steps.toString();
};

// Keep both labels in sync while dragging, and paint them once at load.
cfgSelect.addEventListener('input', updateCfgDisplay);
stepsSelect.addEventListener('input', updateStepsDisplay);
updateCfgDisplay();
updateStepsDisplay();
|
|
|
|
|
// Builds a left-zero-padding formatter for the given minimum width.
const zeroPad = width => value => value.toString().padStart(width, '0');

/** Pads to at least two characters with leading zeros. */
const pad2 = zeroPad(2);

/** Pads to at least three characters with leading zeros. */
const pad3 = zeroPad(3);

/**
 * Formats the current local time as `YYYY-MM-DD HH:MM:SS.mmm`.
 *
 * @returns {string}
 */
const formatLocalTimestamp = () => {
  const now = new Date();
  const datePart = [now.getFullYear(), pad2(now.getMonth() + 1), pad2(now.getDate())].join('-');
  const timePart = [pad2(now.getHours()), pad2(now.getMinutes()), pad2(now.getSeconds())].join(':');
  return `${datePart} ${timePart}.${pad3(now.getMilliseconds())}`;
};
|
|
|
|
|
/**
 * Renders a duration with two decimals.
 *
 * @param {*} raw Value coerced with Number().
 * @returns {string} The formatted value, or '0.00' when the input is not a finite number.
 */
const formatSeconds = raw => {
  const seconds = Number(raw);
  if (!Number.isFinite(seconds)) {
    return '0.00';
  }
  return seconds.toFixed(2);
};
|
|
|
|
|
/**
 * Converts a log timestamp into a Date.
 *
 * Strings containing `YYYY-MM-DD HH:MM:SS.mmm` have their first space replaced
 * with `T` before parsing; anything else is handed to the Date constructor as is.
 *
 * @param {string} [value] Timestamp text; a falsy value means "now".
 * @returns {Date} Always a valid Date. Unparseable input falls back to the current
 *   time so callers that sort by getTime() never see NaN.
 */
const parseTimestamp = value => {
  if (!value) {
    return new Date();
  }
  const spaceSeparated = /\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}/.test(value);
  const parsed = new Date(spaceSeparated ? value.replace(' ', 'T') : value);
  // An Invalid Date would make the log sort comparator return NaN.
  return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
};
|
|
|
|
|
/**
 * Stores and displays the amount of audio the model has generated.
 * Non-finite input is ignored; negative values are clamped to zero.
 */
const setModelGenerated = value => {
  const seconds = Number(value);
  if (Number.isFinite(seconds)) {
    modelGeneratedTotal = Math.max(0, seconds);
    modelGeneratedLabel.textContent = formatSeconds(modelGeneratedTotal);
  }
};
|
|
|
|
|
/** Stores and displays the played duration, clamped to the range [0, modelGeneratedTotal]. */
const setPlaybackElapsed = value => {
  const nonNegative = Math.max(0, value);
  lastPlaybackElapsed = Math.min(modelGeneratedTotal, nonNegative);
  playbackElapsedLabel.textContent = formatSeconds(lastPlaybackElapsed);
};
|
|
|
|
|
// --- Streaming-preview pacing and state ---
const STREAMING_WPM = 180; // preview speed in tokens (words) per minute
const STREAMING_INTERVAL_MS = 60000 / STREAMING_WPM; // delay between two preview tokens
let previewTimeoutId = null; // pending setTimeout handle for the next token, or null
let previewTokens = []; // prompt split into word-plus-trailing-whitespace tokens
let previewIndex = 0; // index of the next token to reveal
let previewActive = false; // true while a session is running; lets prompt edits restart the preview
|
|
|
|
|
/** Cancels the pending preview tick, if there is one. */
const clearPreviewTimer = () => {
  if (!previewTimeoutId) {
    return;
  }
  clearTimeout(previewTimeoutId);
  previewTimeoutId = null;
};
|
|
|
|
|
/** Shows a static message in the preview area and removes the blinking caret. */
const setPreviewIdle = message => {
  if (streamingPreview) {
    streamingPreview.classList.remove('streaming-active');
    streamingPreview.textContent = message;
  }
};
|
|
|
|
|
/**
 * Reveals the next preview token and re-arms itself until every token is shown.
 * The caret class is removed once the token list is exhausted.
 */
const schedulePreviewTick = () => {
  if (!streamingPreview) {
    return;
  }

  const exhausted = previewIndex >= previewTokens.length;
  if (exhausted) {
    streamingPreview.classList.remove('streaming-active');
    return;
  }

  streamingPreview.classList.add('streaming-active');

  const token = previewTokens[previewIndex];
  streamingPreview.textContent += token;
  previewIndex += 1;

  previewTimeoutId = setTimeout(schedulePreviewTick, STREAMING_INTERVAL_MS);
};
|
|
|
|
|
/** Restarts the preview from the first token using the current prompt text. */
const updateStreamingPreview = () => {
  if (!streamingPreview) {
    return;
  }

  clearPreviewTimer();
  previewIndex = 0;

  const text = (promptInput?.value || '').trimEnd();
  streamingPreview.textContent = '';

  // Each token is one run of non-whitespace plus the whitespace that follows it.
  const tokens = text.match(/\S+\s*/g);
  previewTokens = tokens || [];

  schedulePreviewTick();
};
|
|
|
|
|
/** Empties the log view and its backing list, and resets the generated-audio counter. */
const clearLogs = () => {
  if (logOutput) {
    logOutput.textContent = '';
  }
  logEntries.length = 0;

  // setModelGenerated(0) stores the zero and repaints the label.
  modelGeneratedTotal = 0;
  setModelGenerated(0);
};
|
|
|
|
|
/**
 * Adds a log line and re-renders the log view.
 *
 * Entries are kept ordered by timestamp (arrival order breaks ties), capped at
 * the newest 400, and the view is scrolled to the bottom afterwards.
 *
 * @param {string} message Text of the entry.
 * @param {string} [timestamp] Timestamp text; defaults to the current local time.
 */
const appendLog = (message, timestamp) => {
  if (!logOutput) {
    return;
  }

  const stamp = timestamp || formatLocalTimestamp();
  const when = parseTimestamp(stamp);
  logSequence += 1;
  logEntries.push({
    timestamp: stamp,
    date: when,
    message,
    seq: logSequence,
  });

  const byTimeThenArrival = (left, right) => {
    const delta = left.date.getTime() - right.date.getTime();
    if (delta === 0) {
      return left.seq - right.seq;
    }
    return delta;
  };
  logEntries.sort(byTimeThenArrival);

  const overflow = logEntries.length - 400;
  if (overflow > 0) {
    logEntries.splice(0, overflow);
  }

  const lines = [];
  for (const item of logEntries) {
    lines.push(`[${item.timestamp}] ${item.message}`);
  }
  logOutput.textContent = lines.join('\n');
  logOutput.scrollTop = logOutput.scrollHeight;
};
|
|
|
|
|
/** Save-button handler: builds the WAV file and triggers a browser download through a temporary link. */
const handleSaveClick = () => {
  if (!recordedSamples) {
    appendLog('[Frontend] Save requested but no audio received yet');
    return;
  }

  const wavBlob = createWavBlob();
  if (!wavBlob) {
    appendLog('[Error] Failed to assemble WAV data for download');
    return;
  }

  // Only one export URL is kept alive at a time.
  revokeDownloadUrl();
  downloadUrl = URL.createObjectURL(wavBlob);

  // ':' and '.' are swapped for '-' in the ISO timestamp used for the file name.
  const fileStamp = new Date().toISOString().replace(/[:.]/g, '-');
  const anchor = document.createElement('a');
  anchor.href = downloadUrl;
  anchor.download = `vibevoice_realtime_audio_${fileStamp}.wav`;

  document.body.appendChild(anchor);
  anchor.click();
  document.body.removeChild(anchor);

  appendLog('[Frontend] Audio download triggered');
};
|
|
|
|
|
/** Stops the periodic "Audio Played" refresh, if it is running. */
const stopPlaybackTimer = () => {
  if (!playbackTimer) {
    return;
  }
  clearInterval(playbackTimer);
  playbackTimer = null;
};

/** (Re)starts the 250 ms refresh that converts played samples into seconds. */
const startPlaybackTimer = () => {
  stopPlaybackTimer();
  const refreshElapsed = () => {
    setPlaybackElapsed(playbackSamples / SAMPLE_RATE);
  };
  playbackTimer = setInterval(refreshElapsed, 250);
};
|
|
|
|
|
/**
 * Fetches `/config` and fills the speaker dropdown with the returned voices.
 *
 * The dropdown stays disabled with a single placeholder entry when the list is
 * empty or the request fails; otherwise `default_voice` is preselected when it
 * is one of the listed voices.
 */
const loadVoices = async () => {
  const makeOption = (value, label) => {
    const el = document.createElement('option');
    el.value = value;
    el.textContent = label;
    return el;
  };

  // Leaves the (already emptied) select with one disabled placeholder entry.
  const showPlaceholder = label => {
    voiceSelect.appendChild(makeOption('', label));
    voiceSelect.disabled = true;
  };

  try {
    voiceSelect.disabled = true;

    const response = await fetch('/config');
    if (!response.ok) {
      throw new Error(`Failed to fetch config: ${response.status}`);
    }

    const data = await response.json();
    const voices = Array.isArray(data.voices) ? data.voices : [];

    voiceSelect.innerHTML = '';
    if (voices.length === 0) {
      showPlaceholder('No voices available');
      appendLog('[Error] No voice presets available');
      return;
    }

    for (const voice of voices) {
      voiceSelect.appendChild(makeOption(voice, voice));
    }

    const preferred = data.default_voice;
    if (preferred && voices.includes(preferred)) {
      voiceSelect.value = preferred;
    }

    voiceSelect.disabled = false;
    appendLog(`[Frontend] Loaded ${voices.length} voice presets`);
  } catch (err) {
    console.error('Failed to load voices', err);
    voiceSelect.innerHTML = '';
    showPlaceholder('Load failed');
    appendLog('[Error] Failed to load voice presets');
  }
};

loadVoices();
|
|
|
|
|
// "Reset Controls": restore both sliders to their defaults and repaint the labels.
const handleResetClick = () => {
  cfgSelect.value = '1.5';
  stepsSelect.value = '5';
  updateCfgDisplay();
  updateStepsDisplay();
  appendLog('[Frontend] Controls reset to defaults (CFG=1.5, Steps=5)');
};
resetBtn.addEventListener('click', handleResetClick);

// Editing the prompt during a session restarts the preview from the first token.
if (promptInput) {
  const handlePromptInput = () => {
    if (!previewActive) {
      return;
    }
    updateStreamingPreview();
  };
  promptInput.addEventListener('input', handlePromptInput);
}
|
|
|
|
|
/**
 * Handles a text frame received from the backend.
 *
 * Frames that are not JSON, or whose `type` is not `log`, are echoed to the log
 * verbatim. Known `log` events are turned into log lines; `model_progress`
 * only updates the generated-audio counter and is not logged.
 *
 * @param {string} raw Text payload of the WebSocket message.
 */
const handleLogMessage = raw => {
  let payload;
  try {
    payload = JSON.parse(raw);
  } catch (err) {
    appendLog(`[Error] Failed to parse log message: ${raw}`);
    return;
  }

  if (!payload || payload.type !== 'log') {
    appendLog(`[Log] ${raw}`);
    return;
  }

  const { event, timestamp } = payload;
  // `??` also covers an explicit `data: null`, which a destructuring default would let through.
  const data = payload.data ?? {};

  switch (event) {
    case 'backend_request_received':
      // The request details carried in `data` are not part of the log line.
      appendLog(`[Backend] Received request`, timestamp);
      break;
    case 'backend_first_chunk_sent':
      appendLog('[Backend] Sent first audio chunk', timestamp);
      break;
    case 'model_progress':
      if (typeof data.generated_sec !== 'undefined') {
        const generated = Number(data.generated_sec);
        if (Number.isFinite(generated)) {
          setModelGenerated(generated);
        }
      }
      // Progress events update the counter only.
      return;
    case 'generation_error':
      appendLog(`[Error] Generation error: ${data.message || 'Unknown error'}`, timestamp);
      break;
    case 'backend_error':
      appendLog(`[Error] Backend error: ${data.message || 'Unknown error'}`, timestamp);
      break;
    case 'client_disconnected':
      appendLog('[Frontend] Client disconnected', timestamp);
      break;
    case 'backend_stream_complete':
      appendLog('[Backend] Backend finished', timestamp);
      // The whole stream has been sent, so the recording can be saved.
      recordingComplete = true;
      updateSaveButtonState();
      break;
    default:
      appendLog(`[Log] Event ${event}`, timestamp);
      break;
  }
};
|
|
|
|
|
/** Sets the main button's caption and its `playing` class from the current isPlaying flag. */
const updateButtonLabel = () => {
  const caption = isPlaying ? 'Stop' : 'Start';
  controlBtn.textContent = caption;
  controlBtn.classList.toggle('playing', isPlaying);
};
|
|
|
|
|
/** Appends decoded samples to the end of the playback queue. */
const appendAudio = chunk => {
  const queuedLength = buffer.length;
  const combined = new Float32Array(queuedLength + chunk.length);
  combined.set(buffer, 0);
  combined.set(chunk, queuedLength);
  buffer = combined;
};
|
|
|
|
|
/**
 * Removes up to `frameCount` samples from the front of the playback queue.
 *
 * @param {number} frameCount Number of samples requested.
 * @returns {Float32Array} Exactly `frameCount` samples, zero-padded at the end
 *   when the queue holds fewer.
 */
const pullAudio = frameCount => {
  const queued = buffer.length;

  // More queued than requested: hand out the head and keep the tail.
  if (queued > frameCount) {
    const head = buffer.subarray(0, frameCount);
    buffer = buffer.subarray(frameCount);
    return head;
  }

  // Nothing queued: pure silence, queue left untouched.
  if (queued === 0) {
    return new Float32Array(frameCount);
  }

  // The queue is drained by this call.
  const remainder = buffer;
  buffer = new Float32Array(0);
  if (remainder.length === frameCount) {
    return remainder;
  }
  const padded = new Float32Array(frameCount);
  padded.set(remainder, 0);
  return padded;
};
|
|
|
|
|
/** Closes the session socket if it is open or still connecting, then drops the reference. */
const closeSocket = () => {
  const state = socket?.readyState;
  const closable = state === WebSocket.OPEN || state === WebSocket.CONNECTING;
  if (closable) {
    socket.close();
  }
  socket = null;
};
|
|
|
|
|
/**
 * Empties the playback queue and clears the per-session flags.
 *
 * @param {boolean} [resetSamples=true] When true, the played-sample counter and
 *   the "Audio Played" label are also reset to zero.
 */
const resetPlaybackFlags = (resetSamples = true) => {
  buffer = new Float32Array(0);

  if (resetSamples) {
    playbackSamples = 0;
    setPlaybackElapsed(0);
  }

  // Once-per-session markers.
  playbackStartedLogged = false;
  firstBrowserChunkLogged = false;
  hasStartedPlayback = false;
  silentFrameCount = 0;
};
|
|
|
|
|
/** Disconnects the processor node, closes the AudioContext, and clears both references. */
const teardownAudio = () => {
  if (scriptNode) {
    // Detach the callback first so no further audio callback runs against a
    // half-torn-down session.
    scriptNode.onaudioprocess = null;
    try {
      scriptNode.disconnect();
    } catch (err) {
      console.warn('disconnect error', err);
    }
  }

  if (audioCtx) {
    try {
      // close() reports failure through its returned Promise, which a plain
      // try/catch does not observe; the guard covers implementations that
      // return nothing.
      const closing = audioCtx.close();
      if (closing && typeof closing.catch === 'function') {
        closing.catch(err => console.warn('audioCtx.close error', err));
      }
    } catch (err) {
      console.warn('audioCtx.close error', err);
    }
  }

  audioCtx = null;
  scriptNode = null;
};
|
|
|
|
|
/**
 * Ends the current session: socket, audio graph, flags and timer.
 *
 * @param {boolean} [resetSamples=true] Forwarded to resetPlaybackFlags.
 */
const resetState = (resetSamples = true) => {
  // Network first, then the audio graph, then the bookkeeping.
  closeSocket();
  teardownAudio();
  resetPlaybackFlags(resetSamples);

  stopPlaybackTimer();
  isPlaying = false;
};
|
|
|
|
|
/**
 * Builds a fresh AudioContext + ScriptProcessor pair that plays samples from
 * the shared `buffer` queue.
 *
 * Playback is held back (silence is output) until the queue holds
 * PREBUFFER_SEC worth of samples or the socket is no longer open. After the
 * socket has closed and the queue is empty, four consecutive all-zero
 * callbacks end the session via stop().
 */
const createAudioChain = () => {
  teardownAudio();
  resetPlaybackFlags();
  audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE });
  // No input channels, one (mono) output channel.
  scriptNode = audioCtx.createScriptProcessor(BUFFER_SIZE, 0, 1);

  // Uses the context's actual rate rather than the requested SAMPLE_RATE.
  const minBufferSamples = Math.floor(audioCtx.sampleRate * PREBUFFER_SEC);

  scriptNode.onaudioprocess = event => {
    const output = event.outputBuffer.getChannelData(0);
    const needPrebuffer = !hasStartedPlayback;
    // A missing socket counts as closed.
    const socketClosed = !socket || socket.readyState === WebSocket.CLOSED || socket.readyState === WebSocket.CLOSING;

    if (needPrebuffer) {
      // Start once enough audio is queued, or immediately when no more can arrive.
      if (buffer.length >= minBufferSamples || socketClosed) {
        hasStartedPlayback = true;
        if (!playbackStartedLogged) {
          playbackStartedLogged = true;
          appendLog('[Frontend] Browser started to play audio');
          startPlaybackTimer();
        }
      } else {
        // Still prebuffering: emit silence and leave the queue untouched.
        output.fill(0);
        return;
      }
    }

    // pullAudio always returns output.length samples, zero-padded if the queue runs short.
    const chunk = pullAudio(output.length);
    output.set(chunk);

    if (hasStartedPlayback) {
      // Counts every frame sent to the output, including zero padding.
      playbackSamples += output.length;
    }

    // End-of-stream detection: nothing queued, nothing incoming, and this frame was silent.
    if (socketClosed && buffer.length === 0 && chunk.every(sample => sample === 0)) {
      silentFrameCount += 1;
      if (silentFrameCount >= 4) {
        stop();
      }
    } else {
      silentFrameCount = 0;
    }
  };

  scriptNode.connect(audioCtx.destination);
};
|
|
|
|
|
/**
 * Starts a new synthesis session.
 *
 * Resets logs, counters, recording and audio state, opens the `/stream`
 * WebSocket with the current text and control values, and wires its handlers.
 * Does nothing when a session is already running.
 */
const start = () => {
  if (isPlaying) {
    return;
  }

  const textValue = promptInput?.value || '';
  const cfgValue = Number(cfgSelect.value);
  const stepsValue = Number(stepsSelect.value);
  const voiceValue = voiceSelect.value || '';
  const cfgUsable = Number.isFinite(cfgValue);
  const stepsUsable = Number.isFinite(stepsValue);

  clearLogs();
  const cfgDisplay = cfgUsable ? cfgValue.toFixed(3) : 'default';
  const stepsDisplay = stepsUsable ? stepsValue : 'default';
  appendLog(`[Frontend] Start button clicked, CFG=${cfgDisplay}, Steps=${stepsDisplay}, Speaker=${voiceValue || 'default'}`);
  setModelGenerated(0);
  setPlaybackElapsed(0);

  resetState(true);
  clearRecordedChunks();
  isPlaying = true;
  previewActive = true;
  updateStreamingPreview();
  updateButtonLabel();
  createAudioChain();

  // Same finiteness test as the log line above, so the request and the log
  // always agree on whether a value was sent.
  const params = new URLSearchParams();
  params.set('text', textValue);
  if (cfgUsable) {
    params.set('cfg', cfgValue.toFixed(3));
  }
  if (stepsUsable) {
    params.set('steps', stepsValue.toString());
  }
  if (voiceValue) {
    params.set('voice', voiceValue);
  }
  const wsUrl = `${location.origin.replace(/^http/, 'ws')}/stream?${params.toString()}`;

  const ws = new WebSocket(wsUrl);
  ws.binaryType = 'arraybuffer';
  socket = ws;

  // A socket closed by a previous stop() delivers its remaining events
  // asynchronously, possibly after the next start() has installed a new one.
  // Handlers therefore act only while their own socket is the active one;
  // otherwise a late close event would null out the new session's socket and
  // late audio would leak into its buffer and recording.
  const isActive = () => socket === ws;

  ws.onmessage = event => {
    if (!isActive()) {
      return;
    }
    if (typeof event.data === 'string') {
      handleLogMessage(event.data);
      return;
    }
    if (!(event.data instanceof ArrayBuffer)) {
      return;
    }

    // Binary frames carry little-endian signed 16-bit PCM; a trailing odd byte is ignored.
    const rawBuffer = event.data;
    const view = new DataView(rawBuffer);
    const floatChunk = new Float32Array(Math.floor(view.byteLength / 2));
    for (let i = 0; i < floatChunk.length; i += 1) {
      floatChunk[i] = view.getInt16(i * 2, true) / 32768;
    }

    appendAudio(floatChunk);
    recordedChunks.push(rawBuffer);
    recordedSamples += floatChunk.length;
    updateSaveButtonState();

    if (!firstBrowserChunkLogged) {
      firstBrowserChunkLogged = true;
      appendLog('[Frontend] Received first audio chunk');
    }
  };

  ws.onerror = event => {
    if (!isActive()) {
      return;
    }
    console.error('WebSocket error', event);
    // WebSocket error events carry no description; interpolating the event
    // itself would print "[object Event]".
    const detail = event?.message || 'connection error (see browser console)';
    appendLog(`[Error] WebSocket error: ${detail}`);
    stop();
  };

  ws.onclose = () => {
    if (!isActive()) {
      return;
    }
    socket = null;
    if (recordedSamples > 0) {
      recordingComplete = true;
      updateSaveButtonState();
    }
  };
};
|
|
|
|
|
/**
 * Stops the current session while keeping the played-sample counter.
 *
 * When no session is running this only tears down any leftover resources and
 * repaints the button. Otherwise it also logs the stop, marks the recording
 * complete when audio was received, and halts the streaming preview.
 */
const stop = () => {
  const wasPlaying = isPlaying;
  resetState(false);

  if (wasPlaying) {
    setPlaybackElapsed(Math.min(lastPlaybackElapsed, modelGeneratedTotal));
    appendLog('[Frontend] Playback stopped');

    if (recordedSamples > 0) {
      recordingComplete = true;
      updateSaveButtonState();
    }

    previewActive = false;
    clearPreviewTimer();
    streamingPreview?.classList.remove('streaming-active');
  }

  updateButtonLabel();
};
|
|
|
|
|
// The main button toggles between starting and stopping a session.
const handleControlClick = () => {
  if (!isPlaying) {
    start();
    return;
  }
  stop();
};
controlBtn.addEventListener('click', handleControlClick);

saveBtn?.addEventListener('click', handleSaveClick);

// Paint the initial button states.
updateButtonLabel();
updateSaveButtonState();

// Release the socket, audio graph, preview timer and export URL when leaving the page.
const handleBeforeUnload = () => {
  resetState();
  clearPreviewTimer();
  revokeDownloadUrl();
};
window.addEventListener('beforeunload', handleBeforeUnload);
|
|
})(); |
|
|
</script> |
|
|
</body> |
|
|
</html> |
|
|
|