lulavc committed
Commit 8ba4a62 · verified · 1 Parent(s): 564ce51

Upload app.py with huggingface_hub

Files changed (1):
  1. app.py +49 -63
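
The commit message indicates app.py was pushed programmatically rather than through the web editor. A minimal sketch of how such a commit is typically produced with the huggingface_hub client; the repo id and token handling are placeholders, not taken from this Space:

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_file(
        path_or_fileobj="app.py",            # local file to push
        path_in_repo="app.py",               # destination path inside the repo
        repo_id="your-username/your-space",  # placeholder Space id
        repo_type="space",
        commit_message="Upload app.py with huggingface_hub",
    )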
app.py CHANGED
@@ -382,13 +382,8 @@ API_TIMEOUT = 90.0
 API_MAX_RETRIES = 2
 MAX_DESCRIPTION_LENGTH = 6000 # For GLM prompt generation - doubled for very detailed descriptions
 
-# Enable optimized backends (SDPA uses FlashAttention when available)
-torch.backends.cuda.enable_flash_sdp(True)
-torch.backends.cuda.enable_mem_efficient_sdp(True)
-torch.backends.cudnn.benchmark = True
-# Enable TF32 for better performance on Ampere+ GPUs
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
+# Backend settings will be applied when GPU is available (inside @spaces.GPU functions)
+# Don't set them here to avoid CUDA initialization at module load time
 
 # Singleton clients with timeout and retry
 _deepseek_client: Optional[OpenAI] = None
@@ -861,64 +856,15 @@ MAX_SEQ_LEN = 65536 # 2048x2048 -> 256x256 -> 65,536
 # which requires special handling in torch.export. Enable with ENABLE_AOTI=true once fixed.
 ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "false").lower() == "true"
 
-logger.info("Loading Z-Image-Turbo pipeline...")
+logger.info("Loading Z-Image-Turbo pipeline (CPU)...")
 
 pipe_t2i = DiffusionPipeline.from_pretrained(
     "Tongyi-MAI/Z-Image-Turbo",
     torch_dtype=torch.bfloat16,
 )
-pipe_t2i.to("cuda")
+# Don't move to CUDA here - ZeroGPU requires GPU ops inside @spaces.GPU functions
 
-# Enable FlashAttention-3 via kernels library (H100/H200 Hopper GPUs)
-try:
-    pipe_t2i.transformer.set_attention_backend("_flash_3_hub")
-    logger.info("FlashAttention-3 enabled via kernels library")
-except Exception as e:
-    logger.warning(f"FA3 not available, using default SDPA attention: {e}")
-
-# =============================================================================
-# APPLY AOTI COMPILATION
-# =============================================================================
-
-if ENABLE_AOTI:
-    try:
-        # Use the corrected compile function that handles positional args properly
-        @spaces.GPU(duration=1500)
-        def _compile_wrapper():
-            return compile_transformer_aoti(
-                pipe=pipe_t2i,
-                example_prompt="example prompt for compilation",
-                height=1024,
-                width=1024,
-                num_inference_steps=1,
-                inductor_configs=INDUCTOR_CONFIGS,
-                min_seq_len=MIN_SEQ_LEN,
-                max_seq_len=MAX_SEQ_LEN,
-            )
-
-        compiled_transformer = _compile_wrapper()
-        if compiled_transformer is not None:
-            spaces.aoti_apply(compiled_transformer, pipe_t2i.transformer)
-            logger.info("AoTI transformer applied successfully (1.3x-1.8x speedup expected)")
-        else:
-            logger.warning("Using non-compiled transformer (no AoTI speedup)")
-    except Exception as e:
-        logger.error(f"AoTI application failed: {type(e).__name__}: {str(e)}")
-        logger.warning("Using non-compiled transformer")
-else:
-    logger.info("AoTI compilation disabled via ENABLE_AOTI=false")
-
-# Enable torch.compile for VAE decoder (keep existing optimization)
-try:
-    pipe_t2i.vae.decode = torch.compile(
-        pipe_t2i.vae.decode,
-        mode="reduce-overhead",
-    )
-    logger.info("torch.compile enabled for VAE decoder")
-except Exception as e:
-    logger.warning(f"VAE torch.compile failed: {e}")
-
-# Create image-to-image pipeline (shares compiled transformer)
+# Create image-to-image pipeline (shares components)
 pipe_i2i = ZImageImg2ImgPipeline(
     transformer=pipe_t2i.transformer,
     vae=pipe_t2i.vae,
@@ -927,10 +873,46 @@ pipe_i2i = ZImageImg2ImgPipeline(
     scheduler=pipe_t2i.scheduler,
 )
 
-if ENABLE_AOTI:
-    logger.info("Pipelines ready! (TF32 + FA3 + AoTI Transformer + VAE compile)")
-else:
-    logger.info("Pipelines ready! (TF32 + FA3 + VAE compile) - AoTI disabled")
+# Track if pipelines have been moved to GPU and optimized
+_gpu_initialized = False
+
+def _ensure_gpu():
+    """Move pipelines to GPU and apply optimizations (called inside @spaces.GPU functions)."""
+    global _gpu_initialized
+    if _gpu_initialized:
+        return
+
+    # Enable optimized backends
+    torch.backends.cuda.enable_flash_sdp(True)
+    torch.backends.cuda.enable_mem_efficient_sdp(True)
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    # Move to GPU
+    pipe_t2i.to("cuda")
+
+    # Enable FlashAttention-3 via kernels library (H100/H200 Hopper GPUs)
+    try:
+        pipe_t2i.transformer.set_attention_backend("_flash_3_hub")
+        logger.info("[GPU] FlashAttention-3 enabled via kernels library")
+    except Exception as e:
+        logger.warning(f"[GPU] FA3 not available, using default SDPA attention: {e}")
+
+    # Enable torch.compile for VAE decoder
+    try:
+        pipe_t2i.vae.decode = torch.compile(
+            pipe_t2i.vae.decode,
+            mode="reduce-overhead",
+        )
+        logger.info("[GPU] torch.compile enabled for VAE decoder")
+    except Exception as e:
+        logger.warning(f"[GPU] VAE torch.compile failed: {e}")
+
+    _gpu_initialized = True
+    logger.info("[GPU] Pipelines ready! (TF32 + FA3 + VAE compile)")
+
+logger.info("Pipelines loaded on CPU - will move to GPU on first generation")
 
 STYLES = ["None", "Photorealistic", "Cinematic", "Anime", "Digital Art",
           "Oil Painting", "Watercolor", "3D Render", "Fantasy", "Sci-Fi"]
@@ -1130,6 +1112,7 @@ def transform_with_polish(input_image: Optional[Image.Image], prompt: str, style
 @spaces.GPU(duration=120)
 def generate(full_prompt: str, polished_display: str, ratio: str, steps: int, seed: int, randomize: bool, progress=gr.Progress(track_tqdm=True)) -> Tuple[Optional[Image.Image], int]:
     """Generate image from text prompt."""
+    _ensure_gpu()
     if randomize:
         seed = torch.randint(0, 2**32 - 1, (1,)).item()
     seed = int(seed)
@@ -1159,6 +1142,7 @@ def generate(full_prompt: str, polished_display: str, ratio: str, steps: int, se
 @spaces.GPU(duration=90)
 def transform(input_image: Optional[Image.Image], full_prompt: str, polished_display: str, strength: float, steps: int, seed: int, randomize: bool, progress=gr.Progress(track_tqdm=True)) -> Tuple[Optional[Image.Image], int]:
     """Transform image using prompt guidance."""
+    _ensure_gpu()
     if input_image is None:
         return None, 0
 
@@ -1207,6 +1191,7 @@ def transform(input_image: Optional[Image.Image], full_prompt: str, polished_dis
 def mcp_generate(prompt: str, style: str = "None", ratio: str = "1:1 Square (1024x1024)",
                  steps: int = 9, seed: int = 42, randomize: bool = True) -> Tuple[Optional[Image.Image], int]:
     """MCP-friendly image generation. Takes prompt directly and handles polish internally."""
+    _ensure_gpu()
     if randomize:
         seed = torch.randint(0, 2**32 - 1, (1,)).item()
     seed = int(seed)
@@ -1241,6 +1226,7 @@ def mcp_transform(image: Optional[Image.Image], prompt: str, style: str = "None"
                   strength: float = 0.6, steps: int = 9, seed: int = 42,
                   randomize: bool = True) -> Tuple[Optional[Image.Image], int]:
     """MCP-friendly image transformation. Takes all parameters directly."""
+    _ensure_gpu()
    if image is None:
        return None, 0
 
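
The change above boils down to one pattern: on a ZeroGPU Space, CUDA must not be touched at import time, so device moves and backend flags are deferred into a lazy initializer that runs inside @spaces.GPU-decorated functions. A minimal, self-contained sketch of that pattern using the same spaces, torch, and diffusers packages; the generate signature and result handling are simplified for illustration and are not the Space's actual API:

    import spaces
    import torch
    from diffusers import DiffusionPipeline

    # Module level: CPU-only work (downloads, weight loading) is safe here.
    pipe = DiffusionPipeline.from_pretrained(
        "Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16
    )

    _gpu_ready = False

    def _ensure_gpu():
        """Runs once, inside a @spaces.GPU context, before the first generation."""
        global _gpu_ready
        if _gpu_ready:
            return
        torch.backends.cuda.matmul.allow_tf32 = True  # backend flags only once a GPU exists
        pipe.to("cuda")
        _gpu_ready = True

    @spaces.GPU(duration=120)
    def generate(prompt: str):
        _ensure_gpu()  # first call pays the move-to-GPU cost; later calls skip it
        return pipe(prompt).images[0]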