"""Gradio demo that runs a chat model and shows its analysis and answer separately.

The model is loaded lazily on the first request (inside the GPU-decorated
function) and cached in module globals for subsequent requests.
"""

import os
import time
from typing import Dict, List, Tuple

import gradio as gr
from transformers import pipeline
import spaces

# === Config (override via Space secrets/env vars) ===
MODEL_ID = os.environ.get("MODEL_ID", "openai/gpt-oss-20b")
# Currently unused; kept so an extra fixed system prompt can be wired in later.
STATIC_PROMPT = """"""
DEFAULT_MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", 512))
DEFAULT_TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.7))
DEFAULT_TOP_P = float(os.environ.get("TOP_P", 0.95))
DEFAULT_REPETITION_PENALTY = float(os.environ.get("REPETITION_PENALTY", 1.0))
ZGPU_DURATION = int(os.environ.get("ZGPU_DURATION", 120))  # seconds

_pipe = None  # cached pipeline
_tok = None  # tokenizer for parsing Harmony format


def _to_messages(policy: str, user_prompt: str) -> List[Dict[str, str]]:
    """Build the chat message list passed to the pipeline.

    Args:
        policy: Optional system text; skipped when empty or whitespace-only.
        user_prompt: The user's message; always included.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts, system first (if any).
    """
    messages: List[Dict[str, str]] = []
    # API clients may send null for an empty textbox; treat that as "".
    system_text = (policy or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    messages.append({"role": "user", "content": user_prompt or ""})
    return messages


def _parse_harmony_output(last, tokenizer):
    """Split a generated message into ``(analysis, content)``.

    Args:
        last: The final generated item: either a message dict or raw text.
        tokenizer: Tokenizer whose ``parse_response`` (when available) is used
            to parse raw text.

    Returns:
        A ``(analysis, content)`` tuple; either element may be ``None``.
    """
    if isinstance(last, dict) and ("content" in last or "thinking" in last):
        # NOTE(review): "thinking" is only populated if the pipeline already
        # split it out into the message dict -- confirm for the deployed model.
        return last.get("thinking"), last.get("content")

    if not last:
        # Nothing was generated (e.g. "generated_text" missing or empty).
        return None, None

    parse_response = getattr(tokenizer, "parse_response", None)
    if parse_response is None:
        # Tokenizer cannot parse responses: surface the raw text as the answer
        # instead of failing with AttributeError.
        return None, last if isinstance(last, str) else str(last)

    parsed = parse_response(last)
    if isinstance(parsed, dict):
        return parsed.get("thinking"), parsed.get("content")
    return None, str(last)


def _get_pipeline():
    """Return the cached text-generation pipeline, loading it on first use."""
    global _pipe, _tok
    if _pipe is None:
        _pipe = pipeline(
            task="text-generation",
            model=MODEL_ID,
            torch_dtype="auto",
            device_map="auto",
        )
        _tok = _pipe.tokenizer
    return _pipe


@spaces.GPU(duration=ZGPU_DURATION)
def generate_long_prompt(
    policy: str,
    prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> Tuple[str, str, str]:
    """Generate a response and return ``(analysis, answer, meta)`` for the UI.

    Args:
        policy: System text (may be empty).
        prompt: User message.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature; values <= 0 select greedy decoding.
        top_p: Nucleus sampling threshold (used only when sampling).
        repetition_penalty: Repetition penalty forwarded to generation.

    Returns:
        Analysis text, answer text (each with a placeholder when missing) and
        a one-line summary with the model id, elapsed time and token budget.
    """
    start = time.time()
    pipe = _get_pipeline()

    # Slider values can arrive as whole-number ints (or floats for the token
    # count); normalize them so generation-side type validation does not fail.
    max_new_tokens = max(1, int(max_new_tokens))
    temperature = float(temperature)
    top_p = float(top_p)
    repetition_penalty = float(repetition_penalty)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }
    if temperature > 0:
        generation_kwargs.update(
            do_sample=True, temperature=temperature, top_p=top_p
        )
    else:
        # The slider allows 0.0, which is invalid for sampling; decode greedily.
        generation_kwargs["do_sample"] = False

    outputs = pipe(_to_messages(policy, prompt), **generation_kwargs)

    res = outputs[0]
    last = res.get("generated_text", [])
    if isinstance(last, list) and last:
        # Chat-style output is a message list; the final entry is the reply.
        last = last[-1]

    analysis, content = _parse_harmony_output(last, _tok)
    elapsed = time.time() - start  # includes model load time on the first call
    meta = f"Model: {MODEL_ID} | Time: {elapsed:.1f}s | max_new_tokens={max_new_tokens}"
    return analysis or "(No analysis)", content or "(No answer)", meta


CUSTOM_CSS = "/** Simple styling **/\n.gradio-container {font-family: ui-sans-serif, system-ui, Inter, Roboto;}\ntextarea {font-family: ui-monospace, monospace;}\nfooter {display:none;}"

with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""# GPT‑OSS Harmony Demo\nProvide a **Policy**, a **Prompt**, and see both **Analysis** and **Answer** separately.""")
    with gr.Row():
        # Left column: inputs and generation settings.
        with gr.Column(scale=1):
            policy = gr.Textbox(label="Policy (system)", lines=20, placeholder="Enter the guiding rules and tone…")
            prompt = gr.Textbox(label="Prompt (user)", lines=10, placeholder="Enter your main prompt…")
            with gr.Accordion("Advanced settings", open=False):
                max_new_tokens = gr.Slider(16, 4096, value=DEFAULT_MAX_NEW_TOKENS, step=8, label="max_new_tokens")
                temperature = gr.Slider(0.0, 1.5, value=DEFAULT_TEMPERATURE, step=0.05, label="temperature")
                top_p = gr.Slider(0.0, 1.0, value=DEFAULT_TOP_P, step=0.01, label="top_p")
                repetition_penalty = gr.Slider(0.8, 2.0, value=DEFAULT_REPETITION_PENALTY, step=0.05, label="repetition_penalty")
            generate = gr.Button("Generate", variant="primary")
        # Right column: model outputs.
        with gr.Column(scale=1):
            analysis = gr.Textbox(label="Analysis (Harmony thinking)", lines=10)
            answer = gr.Textbox(label="Answer", lines=10)
            meta = gr.Markdown()

    generate.click(
        fn=generate_long_prompt,
        inputs=[policy, prompt, max_new_tokens, temperature, top_p, repetition_penalty],
        outputs=[analysis, answer, meta],
        concurrency_limit=1,
        api_name="generate",
    )

if __name__ == "__main__":
    demo.queue(max_size=32).launch()