import { useState } from "react";
import { useModel } from "./hooks/useModel";
import Waveform from "./components/Waveform";
import PulseBars from "./components/PulseBars";
// Demo presets for the in-browser ACE-Step music generator.
// Each entry supplies one generation request:
//   name/emoji — label and icon for the preset card
//   duration   — requested output length in seconds
//   caption    — style / key / BPM description fed to the text encoder
//   lyrics     — section-tagged lyric text ([verse]/[chorus]), or
//                "[instrumental]" for a vocal-free track
const PRESETS = [
  {
    name: "Pop Ballad",
    emoji: "💗",
    duration: 60,
    caption:
      "A gentle pop ballad with piano and soft vocals, key of C major, 80 BPM, emotional and dreamy",
    // Built line-by-line; join("\n") yields the exact same string as the
    // original escaped literal (the "" element is the blank separator line).
    lyrics: [
      "[verse]",
      "Underneath the stars tonight",
      "We dance beneath the pale moonlight",
      "Every moment feels so right",
      "Holding you so close and tight",
      "",
      "[chorus]",
      "This is where I want to be",
      "Right here with you next to me",
      "Let the world just fade away",
      "In your arms I want to stay",
    ].join("\n"),
  },
  {
    name: "Rock Anthem",
    emoji: "🎸",
    duration: 60,
    caption:
      "An energetic rock anthem with electric guitars and powerful drums, key of E minor, 140 BPM, aggressive and intense",
    lyrics: [
      "[verse]",
      "Fire burning in my veins",
      "Breaking free from all these chains",
      "Nothing left to hold me back",
      "Riding down the beaten track",
      "",
      "[chorus]",
      "We are the ones who rise",
      "With thunder in our eyes",
      "We'll never be denied",
      "We're burning up the sky",
    ].join("\n"),
  },
  {
    name: "Lo-fi Chill",
    emoji: "☕",
    duration: 20,
    caption:
      "A relaxing lo-fi hip hop beat with jazz piano samples and vinyl crackle, key of F major, 75 BPM, mellow and nostalgic",
    lyrics: "[instrumental]",
  },
];
// Renders `children` only when the browser exposes the WebGPU API
// (navigator.gpu); otherwise shows a fallback notice naming supported
// browsers.
// NOTE(review): the JSX element markup in the return below appears to have
// been stripped during extraction — only the text content survives. Restore
// the original tags from version control; do not ship as-is.
function WebGPUGate({ children }) {
// typeof guard keeps the check safe where `navigator` is undefined
// (e.g. non-browser / SSR contexts); !! coerces the capability to a boolean.
const supported = typeof navigator !== "undefined" && !!navigator.gpu;
if (supported) return children;
return (
🎹
WebGPU not available
This demo needs WebGPU to run ACE-Step in your browser. Try Chrome 113+, Edge 113+, or Safari 26+ on desktop.
);
}
// Pre-load screen shown before the models are downloaded.
// Props:
//   onLoad   — handler that starts the model load (exact wiring not visible;
//              the JSX markup appears stripped — TODO confirm against VCS)
//   status   — load-state string; "loading" switches the UI to progress mode
//   message  — progress text shown while loading
//   progress — truthy while a progress value is available
//   error    — error text; when set it is shown instead of the loader
// NOTE(review): JSX element markup in the return appears stripped — only the
// text content remains. Also, "~8 GB" in the copy below contradicts the
// "~2 GB" download figures stated twice later in this file — confirm the
// real model size before release.
function LoadGate({ onLoad, status, message, progress, error }) {
// Only the exact string "loading" counts as in-progress.
const loading = status === "loading";
return (
🎹
Load models
Loads ~8 GB of ONNX models. Everything runs in your browser — your prompts never leave this device.
Built with{" "}
🤗 Transformers.js
{" + "}
ONNX Runtime Web
.
{error ? (
{error}
) : loading ? (
{message && (
{message}
)}
{progress && }
) : (
)}
);
}
// Clickable card for one PRESETS entry.
// Props: preset (a preset object), active (selection flag), onClick (handler).
// NOTE(review): the returned JSX has been lost entirely in this extraction —
// as written the component renders nothing; restore the markup from version
// control.
function PresetCard({ preset, active, onClick }) {
return (
);
}
// Inline status line shown only while a generation is in flight.
// Returns null unless `status` is exactly "generating"; then shows `message`
// (or a default) plus an expected-duration hint.
// NOTE(review): JSX element markup in the return appears stripped — only the
// text content remains; restore from version control.
function GenerationStatus({ status, message }) {
if (status !== "generating") return null;
return (
{message || "Generating…"}
this takes 1–4 min
);
}
function OutputCard({ audioUrl, audioInfo }) {
if (!audioUrl) return null;
return (
)}
>
)}
{/* About / methodology */}
How it works & known limitations
Pipeline
Text encoder (Qwen3-Embedding-0.6B, fp16) turns the caption into conditioning hidden states; the same model provides token embeddings for the lyric path.
5 Hz LM (ACE-Step acestep-5Hz-lm-0.6B, 4-bit MatMulNBits) writes a short chain-of-thought, then emits ~50 audio codes per 10 s of output.
FSQ → detokenizer expands the codes into 25 Hz acoustic features used as cross-attention hints.
DiT decoder (2B parameters, fp16) runs 8 Euler flow-matching steps (shift=3.0) over a random latent conditioned on text, lyrics, and hints.
Oobleck VAE (fp16) decodes the 25 Hz latent into stereo 48 kHz audio.
Why it runs in the browser
Everything executes on-device via onnxruntime-web with the WebGPU execution provider. Two Web Workers keep the LM and the diffusion+VAE graphs in separate WASM heaps so neither hits the 4 GB single-heap limit. Total download is ~2 GB (cached in the browser after the first load).
Methodology notes
Compared stage-by-stage against the PyTorch fp32 reference: every tensor agrees to within 0.2% relative L2, and the generated waveforms sound identical.
FP16 DiT is exported natively (model.half() + dynamo). An earlier fp32→fp16 conversion with post-hoc Cast insertion produced a 25 Hz helicopter artifact, now resolved.
4-bit quantization is MatMulNBits with block_size=64, asymmetric, accuracy_level=1 (fp32 accumulate).
Known limitations
First load is slow. ~2 GB of weights must be fetched and cached; subsequent runs start fast.
Vocals need ≥60 s. The 0.6B LM often refuses to emit lyric-aligned audio codes for short durations — instrumentals work at any length.
Turbo quality ceiling. We run 8 diffusion steps (shift=3.0). More steps nudge quality up but aren't supported by the turbo weights we ship.
Condition-encoder drift. The ONNX condition_encoder has a small drift (~0.4 max_diff) vs PyTorch on real inputs — inaudible today but a known residual we haven't closed.
WebGPU only. No fallback path; the demo gates on WebGPU support (Chrome/Edge 113+, Safari 26+ desktop).
Memory. Two workers each hold ~1–2 GB; low-RAM devices may hit std::bad_alloc during model creation.
No seed control. Each generation uses a fresh RNG, so re-runs with the same prompt will differ.