JS6969 committed on
Commit
a85a2ba
·
verified ·
1 Parent(s): e69a823

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -71
app.py CHANGED
@@ -1,5 +1,5 @@
1
  # =====================================================================
2
- # ForgeCaptions - Gradio app for single & batch image captioning
3
  # =====================================================================
4
 
5
  # ------------------------------
@@ -17,6 +17,7 @@ import gradio as gr
17
  from PIL import Image
18
  import torch
19
  from transformers import LlavaForConditionalGeneration, AutoProcessor
 
20
 
21
  # Optional: Liger kernel (ignored if missing)
22
  try:
@@ -25,14 +26,6 @@ except Exception:
25
  def apply_liger_kernel_to_llama(*args, **kwargs):
26
  pass
27
 
28
- # Try Spaces; we’ll use explicit @spaces.GPU() when available
29
- try:
30
- import spaces
31
- HAS_SPACES = True
32
- except Exception:
33
- spaces = None
34
- HAS_SPACES = False
35
-
36
 
37
  # ------------------------------
38
  # 1) Paths & small constants
@@ -80,8 +73,8 @@ _DTYPE = torch.float32
80
 
81
  def get_model():
82
  """
83
- Create/reuse the model. IMPORTANT: call ONLY inside GPU-decorated functions on Spaces.
84
- Avoids CUDA init in main process (Stateless GPU rule).
85
  """
86
  global _MODEL, _DEVICE, _DTYPE
87
  if _MODEL is None:
@@ -94,7 +87,7 @@ def get_model():
94
  low_cpu_mem_usage=True,
95
  device_map=0,
96
  )
97
- # Try to enable Liger on the LLM submodule (best-effort)
98
  try:
99
  lm = getattr(_MODEL, "language_model", None) or getattr(_MODEL, "model", None)
100
  if lm is not None:
@@ -230,7 +223,6 @@ def load_settings() -> dict:
230
  except Exception:
231
  cfg = {}
232
 
233
- # Defaults
234
  defaults = {
235
  "dataset_name": "forgecaptions",
236
  "temperature": 0.6,
@@ -254,7 +246,6 @@ def load_settings() -> dict:
254
  for k, v in defaults.items():
255
  cfg.setdefault(k, v)
256
 
257
- # Normalize
258
  styles = cfg.get("styles") or []
259
  if not isinstance(styles, list):
260
  styles = [styles]
@@ -440,7 +431,7 @@ def final_instruction(style_list: List[str], extra_opts: List[str], name_value:
440
 
441
 
442
  # ------------------------------
443
- # 8) GPU caption functions
444
  # ------------------------------
445
  def _build_inputs(im: Image.Image, instr: str, dtype) -> Dict[str, Any]:
446
  convo = [
@@ -453,59 +444,37 @@ def _build_inputs(im: Image.Image, instr: str, dtype) -> Dict[str, Any]:
453
  inputs["pixel_values"] = inputs["pixel_values"].to(dtype)
454
  return inputs
455
 
456
- # ---- caption_single (explicit @spaces.GPU() on Spaces) ----
457
- if HAS_SPACES:
458
- @spaces.GPU()
459
- @torch.no_grad()
460
- def caption_single(img: Image.Image, instr: str) -> str:
461
- if img is None:
462
- return "No image provided."
463
- s = load_settings()
464
- im = resize_for_model(img, int(s.get("max_side", 896)))
465
- cap = caption_once_core(im, instr, s)
466
- return cap
467
- else:
468
- @torch.no_grad()
469
- def caption_single(img: Image.Image, instr: str) -> str:
470
- if img is None:
471
- return "No image provided."
472
- s = load_settings()
473
- im = resize_for_model(img, int(s.get("max_side", 896)))
474
- cap = caption_once_core(im, instr, s)
475
- return cap
476
-
477
- # ---- run_batch (explicit @spaces.GPU() on Spaces) ----
478
- if HAS_SPACES:
479
- @spaces.GPU()
480
- @torch.no_grad()
481
- def run_batch(
482
- files: List[Any],
483
- session_rows: List[dict],
484
- instr_text: str,
485
- temp: float,
486
- top_p: float,
487
- max_tokens: int,
488
- max_side: int,
489
- time_budget_s: float | None = None,
490
- progress: gr.Progress = gr.Progress(track_tqdm=True),
491
- ) -> Tuple[List[dict], list, list, str, List[str], int, int]:
492
- return run_batch_core(files, session_rows, instr_text, temp, top_p, max_tokens, max_side, time_budget_s, progress)
493
- else:
494
- @torch.no_grad()
495
- def run_batch(
496
- files: List[Any],
497
- session_rows: List[dict],
498
- instr_text: str,
499
- temp: float,
500
- top_p: float,
501
- max_tokens: int,
502
- max_side: int,
503
- time_budget_s: float | None = None,
504
- progress: gr.Progress = gr.Progress(track_tqdm=True),
505
- ) -> Tuple[List[dict], list, list, str, List[str], int, int]:
506
- return run_batch_core(files, session_rows, instr_text, temp, top_p, max_tokens, max_side, time_budget_s, progress)
507
-
508
- # ---- shared core routines used by both CPU and GPU-decorated wrappers ----
509
  def caption_once_core(im: Image.Image, instr: str, settings: dict) -> str:
510
  cap = caption_once(
511
  im, instr,
@@ -862,9 +831,8 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
862
  gr.Markdown(
863
  "### 🔷 Shape Aliases\n"
864
  "Replace literal **shape tokens** in captions with a preferred **name**.\n\n"
865
- "**How to use:**\n"
866
- "- Left column = a single token **or** comma/pipe-separated synonyms, e.g. `diamond, rhombus | lozenge`\n"
867
- "- Right column = replacement name, e.g. `family-emblem`\n"
868
  "Matches are case-insensitive, catches simple plurals, and also matches `*-shaped` / `* shaped` variants."
869
  )
870
 
 
1
  # =====================================================================
2
+ # ForgeCaptions - Gradio app for single & batch image captioning (Spaces-only)
3
  # =====================================================================
4
 
5
  # ------------------------------
 
17
  from PIL import Image
18
  import torch
19
  from transformers import LlavaForConditionalGeneration, AutoProcessor
20
+ import spaces # Spaces-only
21
 
22
  # Optional: Liger kernel (ignored if missing)
23
  try:
 
26
  def apply_liger_kernel_to_llama(*args, **kwargs):
27
  pass
28
 
 
 
 
 
 
 
 
 
29
 
30
  # ------------------------------
31
  # 1) Paths & small constants
 
73
 
74
  def get_model():
75
  """
76
+ Create/reuse the model.
77
+ IMPORTANT: call ONLY inside @spaces.GPU() functions on Spaces (ZeroGPU stateless rule).
78
  """
79
  global _MODEL, _DEVICE, _DTYPE
80
  if _MODEL is None:
 
87
  low_cpu_mem_usage=True,
88
  device_map=0,
89
  )
90
+ # Best-effort Liger on the LLM submodule
91
  try:
92
  lm = getattr(_MODEL, "language_model", None) or getattr(_MODEL, "model", None)
93
  if lm is not None:
 
223
  except Exception:
224
  cfg = {}
225
 
 
226
  defaults = {
227
  "dataset_name": "forgecaptions",
228
  "temperature": 0.6,
 
246
  for k, v in defaults.items():
247
  cfg.setdefault(k, v)
248
 
 
249
  styles = cfg.get("styles") or []
250
  if not isinstance(styles, list):
251
  styles = [styles]
 
431
 
432
 
433
  # ------------------------------
434
+ # 8) GPU caption functions (Spaces-only)
435
  # ------------------------------
436
  def _build_inputs(im: Image.Image, instr: str, dtype) -> Dict[str, Any]:
437
  convo = [
 
444
  inputs["pixel_values"] = inputs["pixel_values"].to(dtype)
445
  return inputs
446
 
447
+ @spaces.GPU()
448
+ @torch.no_grad()
449
+ def caption_single(img: Image.Image, instr: str) -> str:
450
+ if img is None:
451
+ return "No image provided."
452
+ s = load_settings()
453
+ im = resize_for_model(img, int(s.get("max_side", 896)))
454
+ cap = caption_once_core(im, instr, s)
455
+ return cap
456
+
457
+ @spaces.GPU()
458
+ @torch.no_grad()
459
+ def run_batch(
460
+ files: List[Any],
461
+ session_rows: List[dict],
462
+ instr_text: str,
463
+ temp: float,
464
+ top_p: float,
465
+ max_tokens: int,
466
+ max_side: int,
467
+ time_budget_s: float | None = None,
468
+ progress: gr.Progress = gr.Progress(track_tqdm=True),
469
+ ) -> Tuple[List[dict], list, list, str, List[str], int, int]:
470
+ return run_batch_core(files, session_rows, instr_text, temp, top_p, max_tokens, max_side, time_budget_s, progress)
471
+
472
+ # Optional tiny probe to satisfy strict scanners (not called)
473
+ @spaces.GPU()
474
+ def _gpu_probe() -> str:
475
+ return "ok"
476
+
477
+ # ---- shared core routines used by both GPU functions ----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  def caption_once_core(im: Image.Image, instr: str, settings: dict) -> str:
479
  cap = caption_once(
480
  im, instr,
 
831
  gr.Markdown(
832
  "### 🔷 Shape Aliases\n"
833
  "Replace literal **shape tokens** in captions with a preferred **name**.\n\n"
834
+ "- Left column = a single token **or** comma/pipe-separated synonyms (e.g., `diamond, rhombus | lozenge`)\n"
835
+ "- Right column = replacement name (e.g., `family-emblem`)\n"
 
836
  "Matches are case-insensitive, catches simple plurals, and also matches `*-shaped` / `* shaped` variants."
837
  )
838