Spaces: fffiloni / echomimic-v2 (Running)
Update app.py
app.py (CHANGED)
@@ -19,6 +19,28 @@ from datetime import datetime
 from torchao.quantization import quantize_, int8_weight_only
 import gc
 
+import tempfile
+from pydub import AudioSegment
+
+def cut_audio_to_5_seconds(audio_path):
+    try:
+        # Load the audio file
+        audio = AudioSegment.from_file(audio_path)
+
+        # Trim to a maximum of 5 seconds (5000 milliseconds)
+        trimmed_audio = audio[:5000]
+
+        # Create a temporary directory
+        temp_dir = tempfile.mkdtemp()
+        output_path = os.path.join(temp_dir, "trimmed_audio.wav")
+
+        # Export the trimmed audio
+        trimmed_audio.export(output_path, format="wav")
+
+        return output_path
+    except Exception as e:
+        return f"An error occurred while trying to trim audio: {str(e)}"
+
 import requests
 import tarfile
 
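
Note on the new helper: pydub's AudioSegment is indexed in milliseconds, so audio[:5000] keeps at most the first five seconds, and the result is written to a fresh temporary directory. A minimal usage sketch (the sample path is hypothetical, and pydub needs ffmpeg available to decode non-WAV inputs):

    from pydub import AudioSegment

    # Hypothetical input file; any format ffmpeg can decode should work.
    trimmed_path = cut_audio_to_5_seconds("assets/halfbody_demo/audio/sample.wav")

    # len() on an AudioSegment is its duration in milliseconds, so the
    # trimmed clip should never exceed 5000 ms.
    clip = AudioSegment.from_file(trimmed_path)
    assert len(clip) <= 5000
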
@@ -78,6 +100,8 @@ snapshot_download(
     local_dir="./pretrained_weights/sd-image-variations-diffusers"
 )
 
+is_shared_ui = True if "fffiloni/echomimic-v2" in os.environ['SPACE_ID'] else False
+
 # Download and place the Whisper model in the "audio_processor" folder
 def download_whisper_model():
     url = "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt"
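
Note: SPACE_ID is the environment variable Hugging Face Spaces sets to the repo id of the running Space, so this flag is only True on the original fffiloni/echomimic-v2 deployment. A hedged variant, not the committed code: outside Spaces, os.environ['SPACE_ID'] raises KeyError, which a lookup with a default avoids.

    import os

    # Sketch of a lookup that also works for local runs or duplicated Spaces
    # where SPACE_ID is different or unset (assumption: same intent as the commit).
    is_shared_ui = "fffiloni/echomimic-v2" in os.environ.get("SPACE_ID", "")
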
@@ -118,7 +142,7 @@ elif ffmpeg_path not in os.getenv('PATH'):
     os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
 
 
-def generate(image_input, audio_input, pose_input, width, height, length, steps, sample_rate, cfg, fps, context_frames, context_overlap, quantization_input, seed):
+def generate(image_input, audio_input, pose_input, width, height, length, steps, sample_rate, cfg, fps, context_frames, context_overlap, quantization_input, seed, progress=gr.Progress(track_tqdm=True)):
     gc.collect()
     torch.cuda.empty_cache()
     torch.cuda.ipc_collect()
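
Note: adding progress=gr.Progress(track_tqdm=True) to the handler lets Gradio mirror any tqdm loops that run inside generate() as a progress bar in the UI. A minimal, self-contained sketch of the mechanism (toy function, not part of app.py):

    import time
    import gradio as gr
    from tqdm import tqdm

    # Any tqdm loop inside the handler is reflected in the Gradio progress bar
    # because the progress parameter was created with track_tqdm=True.
    def slow_echo(text, progress=gr.Progress(track_tqdm=True)):
        for _ in tqdm(range(10), desc="working"):
            time.sleep(0.1)
        return text

    demo = gr.Interface(fn=slow_echo, inputs="text", outputs="text")

    if __name__ == "__main__":
        demo.launch()
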
@@ -216,6 +240,10 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
         seed = random.randint(100, 1000000)
     generator = torch.manual_seed(seed)
 
+    if is_shared_ui:
+        audio_input = cut_audio_to_5_seconds(audio_input)
+        print(f"Trimmed audio saved at: {audio_input}")
+
     inputs_dict = {
         "refimg": image_input,
         "audio": audio_input,
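
Note: on the shared Space the uploaded audio is shortened before inference. Because cut_audio_to_5_seconds() returns an error string instead of raising, a failed trim would be passed on as if it were a file path; a hedged variant that surfaces the failure in the UI instead (sketch only, not the committed code):

    import os
    import gradio as gr

    def trim_for_shared_ui(audio_input, is_shared_ui):
        # Keep full-length audio outside the shared deployment.
        if not is_shared_ui:
            return audio_input
        trimmed = cut_audio_to_5_seconds(audio_input)
        if not os.path.isfile(trimmed):
            # The helper returned its error message rather than a path.
            raise gr.Error(f"Audio preprocessing failed: {trimmed}")
        return trimmed
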
@@ -289,25 +317,36 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
 
 with gr.Blocks() as demo:
     gr.Markdown("""
-
-    <h2 style="font-size: 30px;text-align: center;">EchoMimicV2</h2>
-    </div>
-    <div style="text-align: center;">
-        <a href="https://github.com/antgroup/echomimic_v2">🌐 Github</a> |
-        <a href="https://arxiv.org/abs/2411.10061">📜 arXiv </a>
-    </div>
-    <div style="text-align: center; font-weight: bold; color: red;">
-        ⚠️ This demonstration is for academic research and experiential use only.
-    </div>
+    # EchoMimicV2
 
+    ⚠️ This demonstration is for academic research and experiential use only.
     """)
+    gr.HTML("""
+    <div style="display:flex;column-gap:4px;">
+        <a href="https://github.com/antgroup/echomimic_v2">
+            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+        </a>
+        <a href="https://antgroup.github.io/ai/echomimic_v2/">
+            <img src='https://img.shields.io/badge/Project-Page-green'>
+        </a>
+        <a href="https://arxiv.org/abs/2411.10061">
+            <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
+        </a>
+        <a href="https://huggingface.co/spaces/fffiloni/echomimic-v2?duplicate=true">
+            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
+        </a>
+        <a href="https://huggingface.co/fffiloni">
+            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
+        </a>
+    </div>
+    """)
     with gr.Column():
         with gr.Row():
             with gr.Column():
                 with gr.Group():
                     image_input = gr.Image(label="Image Input (Auto Scaling)", type="filepath")
-                    audio_input = gr.Audio(label="Audio Input", type="filepath")
-                    pose_input = gr.Textbox(label="Pose Input (Directory Path)", placeholder="Please enter the directory path for pose data.", value="assets/halfbody_demo/pose/01")
+                    audio_input = gr.Audio(label="Audio Input - max 5 seconds on shared UI", type="filepath")
+                    # pose_input = gr.Textbox(label="Pose Input (Directory Path)", placeholder="Please enter the directory path for pose data.", value="assets/halfbody_demo/pose/01")
                 with gr.Accordion("Advanced Settings", open=False):
                     with gr.Row():
                         width = gr.Number(label="Width (multiple of 16, recommended: 768)", value=768)
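
Note: the header becomes a Markdown title plus an HTML badge row, and the pose directory textbox is commented out while generate() still expects a pose_input argument, so the demo has to supply that value some other way (not visible in this diff). A hypothetical way to pin it without showing the field (sketch only):

    import gradio as gr

    # Hidden component keeps the callback signature intact while fixing the
    # pose directory to the bundled demo data (assumed default from the old UI).
    pose_input = gr.Textbox(value="assets/halfbody_demo/pose/01", visible=False)
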
@@ -352,4 +391,4 @@ with gr.Blocks() as demo:
 
 if __name__ == "__main__":
     demo.queue()
-    demo.launch(
+    demo.launch(show_api=False, show_error=True, ssr_mode=False)
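
Note on the launch flags: show_api=False hides the "Use via API" footer, show_error=True surfaces Python exceptions as error messages in the browser, and ssr_mode=False opts out of the server-side rendering mode introduced in Gradio 5, which some Spaces disable for stability.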