Spaces:

JS6969
/

ForgeCaptions

Sleeping

App Files Community

JS6969 committed on Sep 2

Commit

f4c864c

verified ·

1 Parent(s): 6231519

Update app.py

Browse files

Files changed (1) hide show

app.py +231 -237

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
-import os, io, csv, time, json, hashlib, base64, zipfile, re
 from typing import List, Tuple, Dict, Any
-# ────────────────────────────────────────────────────────
-# Cache locations (kept simple / persistent)
-# ────────────────────────────────────────────────────────
 os.environ.setdefault("HF_HOME", "/home/user/.cache/huggingface")
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)
@@ -12,17 +12,13 @@ from PIL import Image
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
-# Try to import spaces and define a GPU decorator that works on CPU too
 try:
     import spaces
     gpu = spaces.GPU()
-except Exception:
-    def gpu(f):  # no-op on CPU / local
-        return f
-# ────────────────────────────────────────────────────────
-# Paths & files
-# ────────────────────────────────────────────────────────
 APP_DIR = os.getcwd()
 SESSION_FILE = "/tmp/session.json"
 SETTINGS_FILE = "/tmp/cf_settings.json"
@@ -32,37 +28,50 @@ EXCEL_THUMB_DIR = "/tmp/forge_excel_thumbs"
 os.makedirs(THUMB_CACHE, exist_ok=True)
 os.makedirs(EXCEL_THUMB_DIR, exist_ok=True)
-# ────────────────────────────────────────────────────────
-# Model setup
-# ────────────────────────────────────────────────────────
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
-def _detect_gpu():
-    if torch.cuda.is_available():
-        p = torch.cuda.get_device_properties(0)
-        return "cuda", int(p.total_memory/(1024**3)), p.name
-    return "cpu", 0, "CPU"
-BACKEND, VRAM_GB, GPU_NAME = _detect_gpu()
-DEVICE  = "cuda" if BACKEND == "cuda" else "cpu"
-DTYPE   = torch.bfloat16 if BACKEND == "cuda" else torch.float32
-MAX_SIDE_CAP = 1024 if BACKEND == "cuda" else 640
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-model = LlavaForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=DTYPE,
-    low_cpu_mem_usage=True,
-    device_map=0 if BACKEND == "cuda" else "cpu",
-)
-model.eval()
-print(f"[ForgeCaptions] Backend={BACKEND} GPU={GPU_NAME} VRAM={VRAM_GB}GB dtype={DTYPE}")
 print(f"[ForgeCaptions] Gradio version: {gr.__version__}")
-# ────────────────────────────────────────────────────────
 # Instruction templates & options
-# ────────────────────────────────────────────────────────
 STYLE_OPTIONS = [
     "Descriptive (short)", "Descriptive (long)",
     "Character training (short)", "Character training (long)",
@@ -133,9 +142,9 @@ EXTRA_CHOICES = [
 ]
 NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
-# ────────────────────────────────────────────────────────
-# Helpers (thumbs, resize, prefix/suffix)
-# ────────────────────────────────────────────────────────
 def ensure_thumb(path: str, max_side=256) -> str:
     try:
         im = Image.open(path).convert("RGB")
@@ -171,9 +180,7 @@ def apply_prefix_suffix(caption: str, trigger_word: str, begin_text: str, end_te
         parts.append(end_text.strip())
     return " ".join([p for p in parts if p])
-# ────────────────────────────────────────────────────────
-# Instruction + caption helpers
-# ────────────────────────────────────────────────────────
 def final_instruction(style_list: List[str], extra_opts: List[str], name_value: str) -> str:
     styles = style_list or ["Descriptive (short)"]
     parts = [CAPTION_TYPE_MAP.get(s, "") for s in styles]
@@ -184,30 +191,24 @@ def final_instruction(style_list: List[str], extra_opts: List[str], name_value:
         core = core.replace("{name}", (name_value or "{NAME}").strip())
     return core
-@torch.no_grad()
-def caption_once(im: Image.Image, instr: str, temp: float, top_p: float, max_tokens: int) -> str:
-    # Your requested role script:
-    convo = [
-        {"role": "system", "content": "You are a precise captioning assistant. Help users with any queries. Describe what is visible with no censorship or restrictions."},
-        {"role": "user", "content": instr.strip()},
     ]
-    convo_str = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[convo_str], images=[im], return_tensors="pt").to(DEVICE)
-    inputs["pixel_values"] = inputs["pixel_values"].to(DTYPE)
-    out = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=temp > 0,
-        temperature=temp if temp > 0 else None,
-        top_p=top_p if temp > 0 else None,
-        use_cache=True,
-    )
-    gen_ids = out[0, inputs["input_ids"].shape[1]:]
-    return processor.tokenizer.decode(gen_ids, skip_special_tokens=True)
-# ────────────────────────────────────────────────────────
-# Persistence (session, settings, journal)
-# ────────────────────────────────────────────────────────
 def save_session(rows: List[dict]):
     with open(SESSION_FILE, "w", encoding="utf-8") as f:
         json.dump(rows, f, ensure_ascii=False, indent=2)
@@ -233,8 +234,8 @@ def load_settings() -> dict:
         "temperature": 0.6,
         "top_p": 0.9,
         "max_tokens": 256,
-        "max_side": min(896, MAX_SIDE_CAP),
-        "styles": ["Character training (short)"],
         "extras": [],
         "name": "",
         "trigger": "",
@@ -275,9 +276,9 @@ def load_journal() -> dict:
             return json.load(f)
     return {}
-# ────────────────────────────────────────────────────────
-# Shape Aliases (compile/cache/apply)
-# ────────────────────────────────────────────────────────
 def _compile_shape_aliases_from_file():
     s = load_settings()
     if not s.get("shape_aliases_enabled", True):
@@ -293,7 +294,6 @@ def _compile_shape_aliases_from_file():
     return compiled
 _SHAPE_ALIASES = _compile_shape_aliases_from_file()
 def _refresh_shape_aliases_cache():
     global _SHAPE_ALIASES
     _SHAPE_ALIASES = _compile_shape_aliases_from_file()
@@ -314,7 +314,6 @@ def get_shape_alias_rows_ui_defaults():
 def save_shape_alias_rows(enabled, df_rows):
     cfg = load_settings()
     cfg["shape_aliases_enabled"] = bool(enabled)
     cleaned = []
     for r in (df_rows or []):
         if not r:
@@ -323,20 +322,132 @@ def save_shape_alias_rows(enabled, df_rows):
         name  = (r[1] or "").strip()
         if shape and name:
             cleaned.append({"shape": shape, "name": name})
     cfg["shape_aliases"] = cleaned
     save_settings(cfg)
     _refresh_shape_aliases_cache()
     normalized = [[it["shape"], it["name"]] for it in cleaned] + [["", ""]]
-    return (
-        "✅ Saved shape alias options.",
-        gr.update(value=normalized, row_count=(max(1, len(normalized)), "dynamic"))
     )
-# ────────────────────────────────────────────────────────
-# Exports
-# ────────────────────────────────────────────────────────
 def export_csv_from_table(table_value: Any) -> str:
     data = table_value or []
     out = f"/tmp/forgecaptions_{int(time.time())}.csv"
@@ -347,7 +458,6 @@ def export_csv_from_table(table_value: Any) -> str:
     return out
 def _resize_for_excel(path: str, px: int) -> str:
-    """Create a temp resized copy for Excel embedding."""
     try:
         im = Image.open(path).convert("RGB")
     except Exception:
@@ -371,7 +481,6 @@ def export_excel_with_thumbs(table_value: Any, session_rows: List[dict], thumb_p
     except Exception as e:
         raise RuntimeError("Excel export requires 'openpyxl' in requirements.txt.") from e
-    # Respect user edits (table wins)
     caption_by_file = {}
     for row in (table_value or []):
         if not row:
@@ -387,9 +496,7 @@ def export_excel_with_thumbs(table_value: Any, session_rows: List[dict], thumb_p
     ws.column_dimensions["B"].width = 42
     ws.column_dimensions["C"].width = 100
-    # px→points (~0.75 pt per screen px @ ~96dpi)
-    row_h = int(thumb_px * 0.75)
     r_i = 2
     for r in (session_rows or []):
         fn = r.get("filename","")
@@ -411,72 +518,6 @@ def export_excel_with_thumbs(table_value: Any, session_rows: List[dict], thumb_p
     wb.save(out)
     return out
-# Rows<->Table helpers
-def _rows_to_table(rows: List[dict]) -> list:
-    return [[r.get("filename",""), r.get("caption","")] for r in (rows or [])]
-def _table_to_rows(table_value: Any, rows: List[dict]) -> List[dict]:
-    tbl = table_value or []
-    new = []
-    for i, r in enumerate(rows or []):
-        r = dict(r)
-        if i < len(tbl) and len(tbl[i]) >= 2:
-            r["filename"] = str(tbl[i][0]) if tbl[i][0] is not None else r.get("filename","")
-            r["caption"]  = str(tbl[i][1]) if tbl[i][1] is not None else r.get("caption","")
-        new.append(r)
-    return new
-# ────────────────────────────────────────────────────────
-# Batch captioning (GPU) + sync
-# ────────────────────────────────────────────────────────
-@gpu
-@torch.no_grad()
-def run_batch(
-    files: List[Any],
-    session_rows: List[dict],
-    instr_text: str,
-    temp: float,
-    top_p: float,
-    max_tokens: int,
-    max_side: int,
-) -> Tuple[List[dict], list, list, str]:
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    session_rows = session_rows or []
-    files = files or []
-    if not files:
-        gallery_pairs = [
-            ((r.get("thumb_path") or r.get("path")), r.get("caption",""))
-            for r in session_rows if (r.get("thumb_path") or r.get("path"))
-        ]
-        return session_rows, gallery_pairs, _rows_to_table(session_rows), f"Saved • {time.strftime('%H:%M:%S')}"
-    for f in files:
-        path = f if isinstance(f, str) else getattr(f, "name", None) or getattr(f, "path", None)
-        if not path or not os.path.exists(path):
-            continue
-        try:
-            im = Image.open(path).convert("RGB")
-        except Exception:
-            continue
-        im = resize_for_model(im, max_side)
-        cap = caption_once(im, instr_text, temp, top_p, max_tokens)
-        cap = apply_shape_aliases(cap)
-        s = load_settings()
-        cap = apply_prefix_suffix(cap, s.get("trigger",""), s.get("begin",""), s.get("end",""))
-        filename = os.path.basename(path)
-        thumb = ensure_thumb(path, 256)
-        session_rows.append({"filename": filename, "caption": cap, "path": path, "thumb_path": thumb})
-    save_session(session_rows)
-    gallery_pairs = [
-        ((r.get("thumb_path") or r.get("path")), r.get("caption",""))
-        for r in session_rows if (r.get("thumb_path") or r.get("path"))
-    ]
-    return session_rows, gallery_pairs, _rows_to_table(session_rows), f"Saved • {time.strftime('%H:%M:%S')}"
 def sync_table_to_session(table_value: Any, session_rows: List[dict]) -> Tuple[List[dict], list, str]:
     session_rows = _table_to_rows(table_value, session_rows or [])
     save_session(session_rows)
@@ -486,61 +527,28 @@ def sync_table_to_session(table_value: Any, session_rows: List[dict]) -> Tuple[L
     ]
     return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
-# Tiny GPU warmup for HF Spaces detection
-@gpu
-@torch.no_grad()
-def _gpu_startup_warm():
-    try:
-        im = Image.new("RGB", (64, 64), (127, 127, 127))
-        _ = caption_once(im, "Warm up.", temp=0.0, top_p=1.0, max_tokens=8)
-        print("[ForgeCaptions] GPU warmup complete")
-    except Exception as e:
-        print("[ForgeCaptions] GPU warmup skipped:", e)
-# ────────────────────────────────────────────────────────
 # UI
-# ────────────────────────────────────────────────────────
 BASE_CSS = """
 :root{--galleryW:50%;--tableW:50%;}
 .gradio-container{max-width:100%!important}
-.cf-hero{
-  display:flex; align-items:center; justify-content:center; gap:16px;
-  margin:4px 0 12px; text-align:center;
-}
 .cf-hero > div { text-align:center; }
 .cf-logo{height:calc(3.25rem + 3 * 1.1rem + 18px);width:auto;object-fit:contain}
 .cf-title{margin:0;font-size:3.25rem;line-height:1;letter-spacing:.2px}
 .cf-sub{margin:6px 0 0;font-size:1.1rem;color:#cfd3da}
-.cf-row{display:flex;gap:12px}
-.cf-col-gallery{flex:0 0 var(--galleryW)}
-.cf-col-table{flex:0 0 var(--tableW)}
-/* Shared scroll look */
 .cf-scroll{max-height:70vh; overflow-y:auto; border:1px solid #e6e6e6; border-radius:10px; padding:8px}
-/* Uniform sizes */
 #cfGal .grid > div { height: 96px; }
 """
-def logo_b64_img() -> str:
-    candidates = [
-        os.path.join(APP_DIR, "forgecaptions-logo.png"),
-        os.path.join(APP_DIR, "captionforge-logo.png"),
-        "/home/user/app/forgecaptions-logo.png",
-        "forgecaptions-logo.png",
-        "captionforge-logo.png",
-    ]
-    for p in candidates:
-        if os.path.exists(p):
-            with open(p, "rb") as f:
-                b64 = base64.b64encode(f.read()).decode("ascii")
-            return f"<img src='data:image/png;base64,{b64}' alt='ForgeCaptions' class='cf-logo'>"
-    return ""
 with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
-    # Ensure HF GPU detection runs once UI starts
     demo.load(_gpu_startup_warm, inputs=None, outputs=None)
     settings = load_settings()
-    settings["styles"] = [s for s in settings.get("styles", []) if s in STYLE_OPTIONS] or ["Character training (short)"]
     gr.HTML(value=f"""
 <div class="cf-hero">
@@ -552,43 +560,45 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
     <div class="cf-sub">CSV / Excel export</div>
   </div>
 </div>
-<hr>
-""")
-    # ===== Controls (top)
     with gr.Group():
         with gr.Row():
             with gr.Column(scale=2):
-                style_checks = gr.CheckboxGroup(
-                    choices=STYLE_OPTIONS,
-                    value=settings.get("styles", ["Character training (short)"]),
-                    label="Caption style (choose one or combine)"
-                )
-                with gr.Accordion("Extra options", open=True):
                     extra_opts = gr.CheckboxGroup(
                         choices=[NAME_OPTION] + EXTRA_CHOICES,
                         value=settings.get("extras", []),
                         label=None
                     )
-                with gr.Accordion("Name & Prefix/Suffix", open=True):
                     name_input = gr.Textbox(label="Person / Character Name", value=settings.get("name", ""))
                     trig       = gr.Textbox(label="Trigger word", value=settings.get("trigger",""))
                     add_start  = gr.Textbox(label="Add text to start", value=settings.get("begin",""))
                     add_end    = gr.Textbox(label="Add text to end", value=settings.get("end",""))
             with gr.Column(scale=1):
-                instruction_preview = gr.Textbox(label="Model Instructions", lines=12)
-                dataset_name = gr.Textbox(label="Dataset name (used for export file titles)", value=settings.get("dataset_name", "forgecaptions"))
-                max_side   = gr.Slider(256, MAX_SIDE_CAP, settings.get("max_side", min(896, MAX_SIDE_CAP)), step=32, label="Max side (resize)")
-                excel_thumb_px = gr.Slider(64, 256, value=settings.get("excel_thumb_px", 128), step=8, label="Excel thumbnail size (px)")
-                gr.Markdown("Generation (settings): temperature 0.6 • top-p 0.9 • max_tokens 256")
-    # Persist options + live instruction
     def _refresh_instruction(styles, extra, name_value, trigv, begv, endv, excel_px, ms):
-        instr = final_instruction(styles or ["Character training (short)"], extra or [], name_value)
         cfg = load_settings()
         cfg.update({
-            "styles": styles or ["Character training (short)"],
             "extras": extra or [],
             "name": name_value,
             "trigger": trigv, "begin": begv, "end": endv,
@@ -599,16 +609,14 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         return instr
     for comp in [style_checks, extra_opts, name_input, trig, add_start, add_end, excel_thumb_px, max_side]:
-        comp.change(
-            _refresh_instruction,
-            inputs=[style_checks, extra_opts, name_input, trig, add_start, add_end, excel_thumb_px, max_side],
-            outputs=[instruction_preview]
-        )
-    demo.load(lambda s,e,n: final_instruction(s or ["Character training (short)"], e or [], n),
               inputs=[style_checks, extra_opts, name_input], outputs=[instruction_preview])
-    # ===== Shape Aliases (improved UX: add row / clear / save)
     with gr.Accordion("Shape Aliases", open=False):
         gr.Markdown(
             "### 🔷 Shape Aliases\n"
@@ -622,7 +630,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
             value=init_rows,
             col_count=(2, "fixed"),
             row_count=(max(1, len(init_rows)), "dynamic"),
-            datatype=["str", "str"],
             type="array",
             interactive=True
         )
@@ -640,28 +648,14 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         clear_btn.click(_clear_rows, outputs=[alias_table])
         save_btn.click(save_shape_alias_rows, inputs=[enable_aliases, alias_table], outputs=[save_status, alias_table])
-    # ===== Tabs (Single + Batch)
     with gr.Tabs():
         with gr.Tab("Single"):
             input_image_single = gr.Image(type="pil", label="Input Image", height=512, width=512)
             single_caption_btn = gr.Button("Caption")
             single_caption_out = gr.Textbox(label="Caption (single)")
-            def _caption_single(img, instr):
-                if img is None:
-                    return "No image provided."
-                s = load_settings()
-                im = resize_for_model(img, int(s.get("max_side", MAX_SIDE_CAP)))
-                t = s.get("temperature", 0.6)
-                p = s.get("top_p", 0.9)
-                m = s.get("max_tokens", 256)
-                cap = caption_once(im, instr, t, p, m)
-                cap = apply_shape_aliases(cap)
-                cap = apply_prefix_suffix(cap, s.get("trigger",""), s.get("begin",""), s.get("end",""))
-                return cap
             single_caption_btn.click(
-                _caption_single,
                 inputs=[input_image_single, instruction_preview],
                 outputs=[single_caption_out]
             )
@@ -671,7 +665,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
                 input_files = gr.File(label="Drop images", file_types=["image"], file_count="multiple", type="filepath")
             run_button = gr.Button("Caption batch", variant="primary")
-    # ===== Results + Table (kept in the same place)
     rows_state  = gr.State(load_session())
     autosave_md = gr.Markdown("Ready.")
@@ -705,10 +699,10 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
             export_xlsx_btn = gr.Button("Export Excel (.xlsx) with thumbnails")
             xlsx_file       = gr.File(label="Excel file", visible=False)
-    # Initial gallery render
     def _initial_gallery(rows):
         rows = rows or []
-        return [((r.get("thumb_path") or r.get("path")), r.get("caption","")) for r in rows if (r.get("thumb_path") or r.get("path"))]
     demo.load(_initial_gallery, inputs=[rows_state], outputs=[gallery])
     # Scroll sync
@@ -756,7 +750,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
 </script>
 """)
-    # Run batch
     def _run_click(files, rows, instr, ms):
         s = load_settings()
         t = s.get("temperature", 0.6)
@@ -771,14 +765,14 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         outputs=[rows_state, gallery, table, autosave_md]
     )
-    # Table edits sync → rows + gallery
     table.change(
         sync_table_to_session,
         inputs=[table, rows_state],
         outputs=[rows_state, gallery, autosave_md]
     )
-    # Exports (use slider value)
     export_csv_btn.click(
         lambda tbl: (export_csv_from_table(tbl), gr.update(visible=True)),
         inputs=[table], outputs=[csv_file, csv_file]
@@ -788,7 +782,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         inputs=[table, rows_state, excel_thumb_px], outputs=[xlsx_file, xlsx_file]
     )
-# Launch (disable experimental SSR to reduce churn)
 if __name__ == "__main__":
     demo.queue(max_size=64).launch(
         server_name="0.0.0.0",

+import os, io, csv, time, json, base64, re
 from typing import List, Tuple, Dict, Any
+# ---------------------------------------------------------------------
+# Caching
+# ---------------------------------------------------------------------
 os.environ.setdefault("HF_HOME", "/home/user/.cache/huggingface")
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
+# ── HF Spaces GPU decorator (no-op on CPU/local) ─────────────────────
 try:
     import spaces
     gpu = spaces.GPU()
+except Exception:  # local/CPU
+    def gpu(f): return f
 APP_DIR = os.getcwd()
 SESSION_FILE = "/tmp/session.json"
 SETTINGS_FILE = "/tmp/cf_settings.json"
 os.makedirs(THUMB_CACHE, exist_ok=True)
 os.makedirs(EXCEL_THUMB_DIR, exist_ok=True)
+# ---------------------------------------------------------------------
+# Model identifiers
+# ---------------------------------------------------------------------
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+# Load the processor on CPU (safe in stateless env)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+# Lazy GPU/CPU model (created inside GPU worker only)
+_MODEL = None
+_DEVICE = "cpu"
+_DTYPE = torch.float32
+def get_model():
+    """Create/reuse model; only call this from inside @gpu functions."""
+    global _MODEL, _DEVICE, _DTYPE
+    if _MODEL is None:
+        if torch.cuda.is_available():
+            _DEVICE = "cuda"
+            _DTYPE = torch.bfloat16
+            _MODEL = LlavaForConditionalGeneration.from_pretrained(
+                MODEL_PATH,
+                torch_dtype=_DTYPE,
+                low_cpu_mem_usage=True,
+                device_map=0,  # GPU:0 (inside GPU worker process)
+            )
+        else:
+            _DEVICE = "cpu"
+            _DTYPE = torch.float32
+            _MODEL = LlavaForConditionalGeneration.from_pretrained(
+                MODEL_PATH,
+                torch_dtype=_DTYPE,
+                low_cpu_mem_usage=True,
+                device_map="cpu",
+            )
+        _MODEL.eval()
+        print(f"[ForgeCaptions] Model ready on {_DEVICE} dtype={_DTYPE}")
+    return _MODEL, _DEVICE, _DTYPE
 print(f"[ForgeCaptions] Gradio version: {gr.__version__}")
+# ---------------------------------------------------------------------
 # Instruction templates & options
+# ---------------------------------------------------------------------
 STYLE_OPTIONS = [
     "Descriptive (short)", "Descriptive (long)",
     "Character training (short)", "Character training (long)",
 ]
 NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
+# ---------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------
 def ensure_thumb(path: str, max_side=256) -> str:
     try:
         im = Image.open(path).convert("RGB")
         parts.append(end_text.strip())
     return " ".join([p for p in parts if p])
+# Instruction + caption
 def final_instruction(style_list: List[str], extra_opts: List[str], name_value: str) -> str:
     styles = style_list or ["Descriptive (short)"]
     parts = [CAPTION_TYPE_MAP.get(s, "") for s in styles]
         core = core.replace("{name}", (name_value or "{NAME}").strip())
     return core
+def logo_b64_img() -> str:
+    candidates = [
+        os.path.join(APP_DIR, "forgecaptions-logo.png"),
+        os.path.join(APP_DIR, "captionforge-logo.png"),
+        "/home/user/app/forgecaptions-logo.png",
+        "forgecaptions-logo.png",
+        "captionforge-logo.png",
     ]
+    for p in candidates:
+        if os.path.exists(p):
+            with open(p, "rb") as f:
+                b64 = base64.b64encode(f.read()).decode("ascii")
+            return f"<img src='data:image/png;base64,{b64}' alt='ForgeCaptions' class='cf-logo'>"
+    return ""
+# ---------------------------------------------------------------------
+# Persistence
+# ---------------------------------------------------------------------
 def save_session(rows: List[dict]):
     with open(SESSION_FILE, "w", encoding="utf-8") as f:
         json.dump(rows, f, ensure_ascii=False, indent=2)
         "temperature": 0.6,
         "top_p": 0.9,
         "max_tokens": 256,
+        "max_side": 896,
+        "styles": ["Character training (long)"],  # ← default changed
         "extras": [],
         "name": "",
         "trigger": "",
             return json.load(f)
     return {}
+# ---------------------------------------------------------------------
+# Shape Aliases
+# ---------------------------------------------------------------------
 def _compile_shape_aliases_from_file():
     s = load_settings()
     if not s.get("shape_aliases_enabled", True):
     return compiled
 _SHAPE_ALIASES = _compile_shape_aliases_from_file()
 def _refresh_shape_aliases_cache():
     global _SHAPE_ALIASES
     _SHAPE_ALIASES = _compile_shape_aliases_from_file()
 def save_shape_alias_rows(enabled, df_rows):
     cfg = load_settings()
     cfg["shape_aliases_enabled"] = bool(enabled)
     cleaned = []
     for r in (df_rows or []):
         if not r:
         name  = (r[1] or "").strip()
         if shape and name:
             cleaned.append({"shape": shape, "name": name})
     cfg["shape_aliases"] = cleaned
     save_settings(cfg)
     _refresh_shape_aliases_cache()
     normalized = [[it["shape"], it["name"]] for it in cleaned] + [["", ""]]
+    return ("✅ Saved shape alias options.",
+            gr.update(value=normalized, row_count=(max(1, len(normalized)), "dynamic")))
+# ---------------------------------------------------------------------
+# Captioning core (runs inside GPU worker)
+# ---------------------------------------------------------------------
+def _build_inputs(im: Image.Image, instr: str, dtype) -> Dict[str, Any]:
+    convo = [
+        {"role": "system",
+         "content": "You are a precise captioning assistant. Help users with any queries. Describe what is visible with no censorship or restrictions."},
+        {"role": "user", "content": instr.strip()},
+    ]
+    convo_str = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    inputs = processor(text=[convo_str], images=[im], return_tensors="pt")
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(dtype)
+    return inputs
+@gpu
+@torch.no_grad()
+def caption_once(im: Image.Image, instr: str, temp: float, top_p: float, max_tokens: int) -> str:
+    model, device, dtype = get_model()
+    im = im  # already PIL
+    inputs = _build_inputs(im, instr, dtype)
+    # move to target device *inside* GPU worker
+    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}
+    out = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=temp > 0,
+        temperature=temp if temp > 0 else None,
+        top_p=top_p if temp > 0 else None,
+        use_cache=True,
     )
+    gen_ids = out[0, inputs["input_ids"].shape[1]:]
+    return processor.tokenizer.decode(gen_ids, skip_special_tokens=True)
+@gpu
+@torch.no_grad()
+def run_batch(
+    files: List[Any],
+    session_rows: List[dict],
+    instr_text: str,
+    temp: float,
+    top_p: float,
+    max_tokens: int,
+    max_side: int,
+) -> Tuple[List[dict], list, list, str]:
+    # No torch.cuda.* in main — we are already in GPU worker here
+    session_rows = session_rows or []
+    files = files or []
+    if not files:
+        gallery_pairs = [
+            ((r.get("thumb_path") or r.get("path")), r.get("caption",""))
+            for r in session_rows if (r.get("thumb_path") or r.get("path"))
+        ]
+        return session_rows, gallery_pairs, _rows_to_table(session_rows), f"Saved • {time.strftime('%H:%M:%S')}"
+    for f in files:
+        path = f if isinstance(f, str) else getattr(f, "name", None) or getattr(f, "path", None)
+        if not path or not os.path.exists(path):
+            continue
+        try:
+            im = Image.open(path).convert("RGB")
+        except Exception:
+            continue
+        im = resize_for_model(im, max_side)
+        cap = caption_once(im, instr_text, temp, top_p, max_tokens)
+        cap = apply_shape_aliases(cap)
+        s = load_settings()
+        cap = apply_prefix_suffix(cap, s.get("trigger",""), s.get("begin",""), s.get("end",""))
+        filename = os.path.basename(path)
+        thumb = ensure_thumb(path, 256)
+        session_rows.append({"filename": filename, "caption": cap, "path": path, "thumb_path": thumb})
+    save_session(session_rows)
+    gallery_pairs = [
+        ((r.get("thumb_path") or r.get("path")), r.get("caption",""))
+        for r in session_rows if (r.get("thumb_path") or r.get("path"))
+    ]
+    return session_rows, gallery_pairs, _rows_to_table(session_rows), f"Saved • {time.strftime('%H:%M:%S')}"
+@gpu
+@torch.no_grad()
+def caption_single(img: Image.Image, instr: str) -> str:
+    if img is None:
+        return "No image provided."
+    s = load_settings()
+    im = resize_for_model(img, int(s.get("max_side", 896)))
+    cap = caption_once(im, instr, s.get("temperature",0.6), s.get("top_p",0.9), s.get("max_tokens",256))
+    cap = apply_shape_aliases(cap)
+    cap = apply_prefix_suffix(cap, s.get("trigger",""), s.get("begin",""), s.get("end",""))
+    return cap
+# tiny warmup so Spaces sees a GPU function at startup
+@gpu
+@torch.no_grad()
+def _gpu_startup_warm():
+    try:
+        im = Image.new("RGB", (64, 64), (127,127,127))
+        _ = caption_once(im, "Warm up.", temp=0.0, top_p=1.0, max_tokens=8)
+        print("[ForgeCaptions] GPU warmup complete")
+    except Exception as e:
+        print("[ForgeCaptions] GPU warmup skipped:", e)
+# ---------------------------------------------------------------------
+# Export helpers
+# ---------------------------------------------------------------------
+def _rows_to_table(rows: List[dict]) -> list:
+    return [[r.get("filename",""), r.get("caption","")] for r in (rows or [])]
+def _table_to_rows(table_value: Any, rows: List[dict]) -> List[dict]:
+    tbl = table_value or []
+    new = []
+    for i, r in enumerate(rows or []):
+        r = dict(r)
+        if i < len(tbl) and len(tbl[i]) >= 2:
+            r["filename"] = str(tbl[i][0]) if tbl[i][0] is not None else r.get("filename","")
+            r["caption"]  = str(tbl[i][1]) if tbl[i][1] is not None else r.get("caption","")
+        new.append(r)
+    return new
 def export_csv_from_table(table_value: Any) -> str:
     data = table_value or []
     out = f"/tmp/forgecaptions_{int(time.time())}.csv"
     return out
 def _resize_for_excel(path: str, px: int) -> str:
     try:
         im = Image.open(path).convert("RGB")
     except Exception:
     except Exception as e:
         raise RuntimeError("Excel export requires 'openpyxl' in requirements.txt.") from e
     caption_by_file = {}
     for row in (table_value or []):
         if not row:
     ws.column_dimensions["B"].width = 42
     ws.column_dimensions["C"].width = 100
+    row_h = int(int(thumb_px) * 0.75)  # px→pt-ish
     r_i = 2
     for r in (session_rows or []):
         fn = r.get("filename","")
     wb.save(out)
     return out
 def sync_table_to_session(table_value: Any, session_rows: List[dict]) -> Tuple[List[dict], list, str]:
     session_rows = _table_to_rows(table_value, session_rows or [])
     save_session(session_rows)
     ]
     return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
+# ---------------------------------------------------------------------
 # UI
+# ---------------------------------------------------------------------
 # Stylesheet passed to gr.Blocks(css=...): layout width variables, the centered
 # hero header (.cf-hero / .cf-logo / .cf-title / .cf-sub), the scrollable
 # .cf-scroll container, and a fixed cell height for the #cfGal grid.
 BASE_CSS = """
 :root{--galleryW:50%;--tableW:50%;}
 .gradio-container{max-width:100%!important}
+.cf-hero{display:flex; align-items:center; justify-content:center; gap:16px;
+  margin:4px 0 12px; text-align:center;}
 .cf-hero > div { text-align:center; }
 .cf-logo{height:calc(3.25rem + 3 * 1.1rem + 18px);width:auto;object-fit:contain}
 .cf-title{margin:0;font-size:3.25rem;line-height:1;letter-spacing:.2px}
 .cf-sub{margin:6px 0 0;font-size:1.1rem;color:#cfd3da}
 .cf-scroll{max-height:70vh; overflow-y:auto; border:1px solid #e6e6e6; border-radius:10px; padding:8px}
 #cfGal .grid > div { height: 96px; }
 """
 with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
+    # ensure Spaces sees a GPU function at start (without touching CUDA in main)
     demo.load(_gpu_startup_warm, inputs=None, outputs=None)
     settings = load_settings()
+    settings["styles"] = [s for s in settings.get("styles", []) if s in STYLE_OPTIONS] or ["Character training (long)"]
     gr.HTML(value=f"""
 <div class="cf-hero">
     <div class="cf-sub">CSV / Excel export</div>
   </div>
 </div>
+<hr>""")
+    # ── Controls
     with gr.Group():
         with gr.Row():
             with gr.Column(scale=2):
+                with gr.Accordion("Caption style (choose one or combine)", open=True):
+                    style_checks = gr.CheckboxGroup(
+                        choices=STYLE_OPTIONS,
+                        value=settings.get("styles", ["Character training (long)"]),
+                        label=None
+                    )
+                with gr.Accordion("Extra options", open=False):
                     extra_opts = gr.CheckboxGroup(
                         choices=[NAME_OPTION] + EXTRA_CHOICES,
                         value=settings.get("extras", []),
                         label=None
                     )
+                with gr.Accordion("Name & Prefix/Suffix", open=False):
                     name_input = gr.Textbox(label="Person / Character Name", value=settings.get("name", ""))
                     trig       = gr.Textbox(label="Trigger word", value=settings.get("trigger",""))
                     add_start  = gr.Textbox(label="Add text to start", value=settings.get("begin",""))
                     add_end    = gr.Textbox(label="Add text to end", value=settings.get("end",""))
             with gr.Column(scale=1):
+                with gr.Accordion("Model Instructions", open=False):
+                    instruction_preview = gr.Textbox(label=None, lines=12)
+                dataset_name = gr.Textbox(label="Dataset name (export title prefix)",
+                                          value=settings.get("dataset_name", "forgecaptions"))
+                max_side   = gr.Slider(256, 1024, settings.get("max_side", 896), step=32, label="Max side (resize)")
+                excel_thumb_px = gr.Slider(64, 256, value=settings.get("excel_thumb_px", 128),
+                                           step=8, label="Excel thumbnail size (px)")
+                gr.Markdown("Generation settings: temperature 0.6 • top-p 0.9 • max tokens 256")
     def _refresh_instruction(styles, extra, name_value, trigv, begv, endv, excel_px, ms):
+        instr = final_instruction(styles or ["Character training (long)"], extra or [], name_value)
         cfg = load_settings()
         cfg.update({
+            "styles": styles or ["Character training (long)"],
             "extras": extra or [],
             "name": name_value,
             "trigger": trigv, "begin": begv, "end": endv,
         return instr
     for comp in [style_checks, extra_opts, name_input, trig, add_start, add_end, excel_thumb_px, max_side]:
+        comp.change(_refresh_instruction,
+                    inputs=[style_checks, extra_opts, name_input, trig, add_start, add_end, excel_thumb_px, max_side],
+                    outputs=[instruction_preview])
+    demo.load(lambda s,e,n: final_instruction(s or ["Character training (long)"], e or [], n),
               inputs=[style_checks, extra_opts, name_input], outputs=[instruction_preview])
+    # ── Shape Aliases (improved)
     with gr.Accordion("Shape Aliases", open=False):
         gr.Markdown(
             "### 🔷 Shape Aliases\n"
             value=init_rows,
             col_count=(2, "fixed"),
             row_count=(max(1, len(init_rows)), "dynamic"),
+            datatype=["str","str"],
             type="array",
             interactive=True
         )
         clear_btn.click(_clear_rows, outputs=[alias_table])
         save_btn.click(save_shape_alias_rows, inputs=[enable_aliases, alias_table], outputs=[save_status, alias_table])
+    # ── Tabs: Single & Batch
     with gr.Tabs():
         with gr.Tab("Single"):
             input_image_single = gr.Image(type="pil", label="Input Image", height=512, width=512)
             single_caption_btn = gr.Button("Caption")
             single_caption_out = gr.Textbox(label="Caption (single)")
             single_caption_btn.click(
+                caption_single,
                 inputs=[input_image_single, instruction_preview],
                 outputs=[single_caption_out]
             )
                 input_files = gr.File(label="Drop images", file_types=["image"], file_count="multiple", type="filepath")
             run_button = gr.Button("Caption batch", variant="primary")
+    # ── Results + Table (same position)
     rows_state  = gr.State(load_session())
     autosave_md = gr.Markdown("Ready.")
             export_xlsx_btn = gr.Button("Export Excel (.xlsx) with thumbnails")
             xlsx_file       = gr.File(label="Excel file", visible=False)
     # Build the (image path, caption) pairs used to populate the gallery on load.
     # A row's "thumb_path" is preferred over its "path"; rows that have neither
     # are left out. A missing caption becomes "".
     def _initial_gallery(rows):
         rows = rows or []
+        return [((r.get("thumb_path") or r.get("path")), r.get("caption",""))
+                for r in rows if (r.get("thumb_path") or r.get("path"))]
     demo.load(_initial_gallery, inputs=[rows_state], outputs=[gallery])
     # Scroll sync
 </script>
 """)
+    # Batch run → rows + gallery + table
     def _run_click(files, rows, instr, ms):
         s = load_settings()
         t = s.get("temperature", 0.6)
         outputs=[rows_state, gallery, table, autosave_md]
     )
+    # Table edits sync
     table.change(
         sync_table_to_session,
         inputs=[table, rows_state],
         outputs=[rows_state, gallery, autosave_md]
     )
+    # Exports
     export_csv_btn.click(
         lambda tbl: (export_csv_from_table(tbl), gr.update(visible=True)),
         inputs=[table], outputs=[csv_file, csv_file]
         inputs=[table, rows_state, excel_thumb_px], outputs=[xlsx_file, xlsx_file]
     )
+# Launch (SSR off for stability on Spaces)
 if __name__ == "__main__":
     demo.queue(max_size=64).launch(
         server_name="0.0.0.0",