lulavc committed
Commit 8ba4a62 · verified · 1 Parent(s): 564ce51

Upload app.py with huggingface_hub

Files changed (1):
  1. app.py +49 -63
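
The commit message indicates app.py was pushed programmatically rather than through the web editor. A minimal sketch of how such a commit is typically produced with the huggingface_hub client; the repo id and token handling are placeholders, not taken from this Space:

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
    api.upload_file(
        path_or_fileobj="app.py",            # local file to push
        path_in_repo="app.py",               # destination path inside the repo
        repo_id="your-username/your-space",  # placeholder Space id
        repo_type="space",
        commit_message="Upload app.py with huggingface_hub",
    )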
app.py CHANGED
@@ -382,13 +382,8 @@ API_TIMEOUT = 90.0
 API_MAX_RETRIES = 2
 MAX_DESCRIPTION_LENGTH = 6000 # For GLM prompt generation - doubled for very detailed descriptions
 
-# Enable optimized backends (SDPA uses FlashAttention when available)
-torch.backends.cuda.enable_flash_sdp(True)
-torch.backends.cuda.enable_mem_efficient_sdp(True)
-torch.backends.cudnn.benchmark = True
-# Enable TF32 for better performance on Ampere+ GPUs
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
+# Backend settings will be applied when GPU is available (inside @spaces.GPU functions)
+# Don't set them here to avoid CUDA initialization at module load time
 
 # Singleton clients with timeout and retry
 _deepseek_client: Optional[OpenAI] = None
@@ -861,64 +856,15 @@ MAX_SEQ_LEN = 65536 # 2048x2048 -> 256x256 -> 65,536
 # which requires special handling in torch.export. Enable with ENABLE_AOTI=true once fixed.
 ENABLE_AOTI = os.environ.get("ENABLE_AOTI", "false").lower() == "true"
 
-logger.info("Loading Z-Image-Turbo pipeline...")
+logger.info("Loading Z-Image-Turbo pipeline (CPU)...")
 
 pipe_t2i = DiffusionPipeline.from_pretrained(
     "Tongyi-MAI/Z-Image-Turbo",
     torch_dtype=torch.bfloat16,
 )
-pipe_t2i.to("cuda")
+# Don't move to CUDA here - ZeroGPU requires GPU ops inside @spaces.GPU functions
 
-# Enable FlashAttention-3 via kernels library (H100/H200 Hopper GPUs)
-try:
-    pipe_t2i.transformer.set_attention_backend("_flash_3_hub")
-    logger.info("FlashAttention-3 enabled via kernels library")
-except Exception as e:
-    logger.warning(f"FA3 not available, using default SDPA attention: {e}")
-
-# =============================================================================
-# APPLY AOTI COMPILATION
-# =============================================================================
-
-if ENABLE_AOTI:
-    try:
-        # Use the corrected compile function that handles positional args properly
-        @spaces.GPU(duration=1500)
-        def _compile_wrapper():
-            return compile_transformer_aoti(
-                pipe=pipe_t2i,
-                example_prompt="example prompt for compilation",
-                height=1024,
-                width=1024,
-                num_inference_steps=1,
-                inductor_configs=INDUCTOR_CONFIGS,
-                min_seq_len=MIN_SEQ_LEN,
-                max_seq_len=MAX_SEQ_LEN,
-            )
-
-        compiled_transformer = _compile_wrapper()
-        if compiled_transformer is not None:
-            spaces.aoti_apply(compiled_transformer, pipe_t2i.transformer)
-            logger.info("AoTI transformer applied successfully (1.3x-1.8x speedup expected)")
-        else:
-            logger.warning("Using non-compiled transformer (no AoTI speedup)")
-    except Exception as e:
-        logger.error(f"AoTI application failed: {type(e).__name__}: {str(e)}")
-        logger.warning("Using non-compiled transformer")
-else:
-    logger.info("AoTI compilation disabled via ENABLE_AOTI=false")
-
-# Enable torch.compile for VAE decoder (keep existing optimization)
-try:
-    pipe_t2i.vae.decode = torch.compile(
-        pipe_t2i.vae.decode,
-        mode="reduce-overhead",
-    )
-    logger.info("torch.compile enabled for VAE decoder")
-except Exception as e:
-    logger.warning(f"VAE torch.compile failed: {e}")
-
-# Create image-to-image pipeline (shares compiled transformer)
+# Create image-to-image pipeline (shares components)
 pipe_i2i = ZImageImg2ImgPipeline(
     transformer=pipe_t2i.transformer,
     vae=pipe_t2i.vae,
@@ -927,10 +873,46 @@ pipe_i2i = ZImageImg2ImgPipeline(
     scheduler=pipe_t2i.scheduler,
 )
 
-if ENABLE_AOTI:
-    logger.info("Pipelines ready! (TF32 + FA3 + AoTI Transformer + VAE compile)")
-else:
-    logger.info("Pipelines ready! (TF32 + FA3 + VAE compile) - AoTI disabled")
+# Track if pipelines have been moved to GPU and optimized
+_gpu_initialized = False
+
+def _ensure_gpu():
+    """Move pipelines to GPU and apply optimizations (called inside @spaces.GPU functions)."""
+    global _gpu_initialized
+    if _gpu_initialized:
+        return
+
+    # Enable optimized backends
+    torch.backends.cuda.enable_flash_sdp(True)
+    torch.backends.cuda.enable_mem_efficient_sdp(True)
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+    # Move to GPU
+    pipe_t2i.to("cuda")
+
+    # Enable FlashAttention-3 via kernels library (H100/H200 Hopper GPUs)
+    try:
+        pipe_t2i.transformer.set_attention_backend("_flash_3_hub")
+        logger.info("[GPU] FlashAttention-3 enabled via kernels library")
+    except Exception as e:
+        logger.warning(f"[GPU] FA3 not available, using default SDPA attention: {e}")
+
+    # Enable torch.compile for VAE decoder
+    try:
+        pipe_t2i.vae.decode = torch.compile(
+            pipe_t2i.vae.decode,
+            mode="reduce-overhead",
+        )
+        logger.info("[GPU] torch.compile enabled for VAE decoder")
+    except Exception as e:
+        logger.warning(f"[GPU] VAE torch.compile failed: {e}")
+
+    _gpu_initialized = True
+    logger.info("[GPU] Pipelines ready! (TF32 + FA3 + VAE compile)")
+
+logger.info("Pipelines loaded on CPU - will move to GPU on first generation")
 
 STYLES = ["None", "Photorealistic", "Cinematic", "Anime", "Digital Art",
           "Oil Painting", "Watercolor", "3D Render", "Fantasy", "Sci-Fi"]
@@ -1130,6 +1112,7 @@ def transform_with_polish(input_image: Optional[Image.Image], prompt: str, style
 @spaces.GPU(duration=120)
 def generate(full_prompt: str, polished_display: str, ratio: str, steps: int, seed: int, randomize: bool, progress=gr.Progress(track_tqdm=True)) -> Tuple[Optional[Image.Image], int]:
     """Generate image from text prompt."""
+    _ensure_gpu()
     if randomize:
         seed = torch.randint(0, 2**32 - 1, (1,)).item()
     seed = int(seed)
@@ -1159,6 +1142,7 @@ def generate(full_prompt: str, polished_display: str, ratio: str, steps: int, se
 @spaces.GPU(duration=90)
 def transform(input_image: Optional[Image.Image], full_prompt: str, polished_display: str, strength: float, steps: int, seed: int, randomize: bool, progress=gr.Progress(track_tqdm=True)) -> Tuple[Optional[Image.Image], int]:
     """Transform image using prompt guidance."""
+    _ensure_gpu()
     if input_image is None:
         return None, 0
 
@@ -1207,6 +1191,7 @@ def transform(input_image: Optional[Image.Image], full_prompt: str, polished_dis
 def mcp_generate(prompt: str, style: str = "None", ratio: str = "1:1 Square (1024x1024)",
                  steps: int = 9, seed: int = 42, randomize: bool = True) -> Tuple[Optional[Image.Image], int]:
     """MCP-friendly image generation. Takes prompt directly and handles polish internally."""
+    _ensure_gpu()
     if randomize:
         seed = torch.randint(0, 2**32 - 1, (1,)).item()
     seed = int(seed)
@@ -1241,6 +1226,7 @@ def mcp_transform(image: Optional[Image.Image], prompt: str, style: str = "None"
                   strength: float = 0.6, steps: int = 9, seed: int = 42,
                   randomize: bool = True) -> Tuple[Optional[Image.Image], int]:
     """MCP-friendly image transformation. Takes all parameters directly."""
+    _ensure_gpu()
    if image is None:
        return None, 0
 
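
The change above boils down to one pattern: on a ZeroGPU Space, CUDA must not be touched at import time, so device moves and backend flags are deferred into a lazy initializer that runs inside @spaces.GPU-decorated functions. A minimal, self-contained sketch of that pattern using the same spaces, torch, and diffusers packages; the generate signature and result handling are simplified for illustration and are not the Space's actual API:

    import spaces
    import torch
    from diffusers import DiffusionPipeline

    # Module level: CPU-only work (downloads, weight loading) is safe here.
    pipe = DiffusionPipeline.from_pretrained(
        "Tongyi-MAI/Z-Image-Turbo", torch_dtype=torch.bfloat16
    )

    _gpu_ready = False

    def _ensure_gpu():
        """Runs once, inside a @spaces.GPU context, before the first generation."""
        global _gpu_ready
        if _gpu_ready:
            return
        torch.backends.cuda.matmul.allow_tf32 = True  # backend flags only once a GPU exists
        pipe.to("cuda")
        _gpu_ready = True

    @spaces.GPU(duration=120)
    def generate(prompt: str):
        _ensure_gpu()  # first call pays the move-to-GPU cost; later calls skip it
        return pipe(prompt).images[0]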