import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"

# NOTE: This model requires transformers from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
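#
# A minimal requirements.txt for this Space might look like the sketch below
# (the pinned transformers commit comes from the note above; the other package
# names are assumptions about a typical ZeroGPU Gradio setup):
#   git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
#   torch
#   gradio
#   spaces
#   accelerate  # needed for device_map in from_pretrained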

# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

# Load model globally (will be moved to GPU by ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",  # Load on CPU first, ZeroGPU will move it to GPU
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    # attn_implementation="flash_attention_2"  # Uncomment if you have compatible GPU
)
print("Model loaded successfully!")

@spaces.GPU(duration=120)
def generate_response(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_new_tokens: int,
    temperature: float,
    min_p: float,
    repetition_penalty: float,
):
    """
    Generate a response using the LiquidAI LFM2-8B-A1B model.
    
    Args:
        message: The current user message
        history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}]
        system_message: System prompt to guide the model
        max_new_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        min_p: Minimum probability threshold
        repetition_penalty: Penalty for repetition
    
    Yields:
        Generated text tokens (streaming)
    """
    if not message.strip():
        yield "Please enter a message."
        return
    
    # Move model to the GPU allocated by ZeroGPU
    model.to("cuda")
    
    # Build conversation history
    messages = []
    
    # Add system message if provided
    if system_message.strip():
        messages.append({"role": "system", "content": system_message})
    
    # Add chat history
    messages.extend(history)
    
    # Add current user message
    messages.append({"role": "user", "content": message})
    
    # Prepare input
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)
    
    # Generate response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
    )
    
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
    # Stream the output
    streamer_output = ""
    for new_text in streamer:
        streamer_output += new_text
        yield streamer_output
    
    thread.join()

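# Quick local smoke test (assumption: run on a machine with a CUDA GPU; outside a
# ZeroGPU Space the @spaces.GPU decorator should have no effect, but model.to("cuda")
# still requires a GPU). Each yield is the cumulative text generated so far:
#
#   chunks = list(generate_response("Hello!", [], "You are a helpful assistant.", 64, 0.3, 0.15, 1.05))
#   print(chunks[-1])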

# Create Gradio ChatInterface
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="🌊 LiquidAI LFM2-8B Chat",
    description="""
    Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU. 
    This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.
    
    💡 **Tip:** The first response may take a moment as the GPU is allocated. The model excels at:
    - Instruction following
    - Math and reasoning
    - Multi-turn conversations
    - Agentic tasks and data extraction
    
    ⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
    """,
    theme=gr.themes.Soft(),
    examples=[
        ["What is C. elegans?"],
        ["Explain quantum entanglement in simple terms."],
        ["Write a short poem about artificial intelligence."],
        ["What are the main differences between Python and JavaScript?"],
        ["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ["Help me plan a 3-day itinerary for visiting Paris."],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant trained by Liquid AI.",
            label="System Message",
            info="Set the behavior and personality of the assistant"
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
            info="Maximum length of generated response"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature",
            info="Higher values make output more random (recommended: 0.3)"
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Min P",
            info="Minimum probability threshold for sampling (recommended: 0.15)"
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.05,
            step=0.05,
            label="Repetition Penalty",
            info="Penalty for repeating tokens (recommended: 1.05)"
        ),
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()