lulavc committed on
Commit
67fd625
·
verified ·
1 Parent(s): 779976d

Enable FlashAttention-3 + torch.compile (AoTI) for H200

Browse files
Files changed (1) hide show
  1. app.py +19 -1
app.py CHANGED
@@ -711,6 +711,24 @@ pipe_t2i = DiffusionPipeline.from_pretrained(
711
  )
712
  pipe_t2i.to("cuda")
713
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
714
  # Note: ZImagePipeline custom pipeline doesn't support VAE slicing/tiling optimization
715
 
716
  pipe_i2i = ZImageImg2ImgPipeline(
@@ -721,7 +739,7 @@ pipe_i2i = ZImageImg2ImgPipeline(
721
  scheduler=pipe_t2i.scheduler,
722
  )
723
 
724
- logger.info("Pipelines ready! (TF32 + SDPA optimizations enabled)")
725
 
726
  STYLES = ["None", "Photorealistic", "Cinematic", "Anime", "Digital Art",
727
  "Oil Painting", "Watercolor", "3D Render", "Fantasy", "Sci-Fi"]
 
711
  )
712
  pipe_t2i.to("cuda")
713
 
714
+ # Enable FlashAttention-3 on Hopper GPUs (H100/H200) via kernels library
715
+ try:
716
+ pipe_t2i.transformer.set_attention_backend("_flash_3_hub")
717
+ logger.info("FlashAttention-3 enabled via kernels library")
718
+ except Exception as e:
719
+ logger.warning(f"FA3 not available, using default attention: {e}")
720
+
721
+ # Enable torch.compile with AoTI (Ahead of Time Inductor) for faster inference
722
+ try:
723
+ pipe_t2i.transformer = torch.compile(
724
+ pipe_t2i.transformer,
725
+ mode="max-autotune", # Best performance on H200
726
+ fullgraph=True,
727
+ )
728
+ logger.info("torch.compile (AoTI) enabled for transformer")
729
+ except Exception as e:
730
+ logger.warning(f"torch.compile not available: {e}")
731
+
732
  # Note: ZImagePipeline custom pipeline doesn't support VAE slicing/tiling optimization
733
 
734
  pipe_i2i = ZImageImg2ImgPipeline(
 
739
  scheduler=pipe_t2i.scheduler,
740
  )
741
 
742
+ logger.info("Pipelines ready! (TF32 + FA3 + AoTI optimizations enabled)")
743
 
744
  STYLES = ["None", "Photorealistic", "Cinematic", "Anime", "Digital Art",
745
  "Oil Painting", "Watercolor", "3D Render", "Fantasy", "Sci-Fi"]