import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"
# NOTE: This model requires transformers from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
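# Other dependencies assumed for this Space (not pinned here): gradio, spaces, torch, accelerate.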
# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)
# Load model globally (will be moved to GPU by ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="cpu", # Load on CPU first, ZeroGPU will move it to GPU
torch_dtype=torch.bfloat16,
trust_remote_code=False,
# attn_implementation="flash_attention_2" # Uncomment if you have compatible GPU
)
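# Note: in bfloat16 the 8.3B-parameter checkpoint needs roughly 16-17 GB for weights alone.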
print("Model loaded successfully!")
@spaces.GPU(duration=120)
def generate_response(
message: str,
history: list[dict[str, str]],
system_message: str,
max_new_tokens: int,
temperature: float,
min_p: float,
repetition_penalty: float,
):
"""
    Generate a streaming response from the LiquidAI LFM2-8B-A1B model.

    Args:
message: The current user message
history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}]
system_message: System prompt to guide the model
max_new_tokens: Maximum tokens to generate
temperature: Sampling temperature
min_p: Minimum probability threshold
repetition_penalty: Penalty for repetition
Yields:
Generated text tokens (streaming)
"""
if not message.strip():
yield "Please enter a message."
return
# Move model to GPU (handled by ZeroGPU)
model.to("cuda")
# Build conversation history
messages = []
# Add system message if provided
if system_message.strip():
messages.append({"role": "system", "content": system_message})
    # Add chat history (already a list of {"role": ..., "content": ...} dicts)
    messages.extend(history)
# Add current user message
messages.append({"role": "user", "content": message})
    # Apply the model's chat template and tokenize the full conversation
input_ids = tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt",
tokenize=True,
).to(model.device)
# Generate response with streaming
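    # skip_prompt=True keeps the echoed prompt out of the stream; skip_special_tokens=True strips special tokens such as EOS.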
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
generation_kwargs = dict(
input_ids=input_ids,
streamer=streamer,
do_sample=True,
temperature=temperature,
min_p=min_p,
repetition_penalty=repetition_penalty,
max_new_tokens=max_new_tokens,
)
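    # Run generate() in a background thread so this function can consume the streamer as tokens arrive.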
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
# Stream the output
streamer_output = ""
for new_text in streamer:
streamer_output += new_text
yield streamer_output
thread.join()
# Create Gradio ChatInterface
demo = gr.ChatInterface(
generate_response,
type="messages",
    title="🌊 LiquidAI LFM2-8B-A1B Chat",
description="""
Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU.
This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.
💡 **Tip:** The first response may take a moment as the GPU is allocated. The model excels at:
- Instruction following
- Math and reasoning
- Multi-turn conversations
- Agentic tasks and data extraction
⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
""",
theme=gr.themes.Soft(),
examples=[
["What is C. elegans?"],
["Explain quantum entanglement in simple terms."],
["Write a short poem about artificial intelligence."],
["What are the main differences between Python and JavaScript?"],
["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
["Help me plan a 3-day itinerary for visiting Paris."],
],
additional_inputs=[
gr.Textbox(
value="You are a helpful assistant trained by Liquid AI.",
label="System Message",
info="Set the behavior and personality of the assistant"
),
gr.Slider(
minimum=64,
maximum=2048,
value=512,
step=64,
label="Max New Tokens",
info="Maximum length of generated response"
),
gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.3,
step=0.1,
label="Temperature",
info="Higher values make output more random (recommended: 0.3)"
),
gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.15,
step=0.05,
label="Min P",
info="Minimum probability threshold for sampling (recommended: 0.15)"
),
gr.Slider(
minimum=1.0,
maximum=2.0,
value=1.05,
step=0.05,
label="Repetition Penalty",
info="Penalty for repeating tokens (recommended: 1.05)"
),
],
cache_examples=False,
)
if __name__ == "__main__":
demo.launch()