import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"
# NOTE: This model requires transformers from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
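# A requirements.txt along these lines should work (the transformers pin comes from the
# note above; the remaining entries and the lack of version pins are assumptions):
#
#   git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
#   torch
#   accelerate
#   gradio
#   spaces
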
# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

# Load model globally (will be moved to GPU by ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",  # Load on CPU first, ZeroGPU will move it to GPU
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    # attn_implementation="flash_attention_2",  # Uncomment if you have compatible GPU
)
print("Model loaded successfully!")
@spaces.GPU(duration=120)
def generate_response(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_new_tokens: int,
    temperature: float,
    min_p: float,
    repetition_penalty: float,
):
"""
Generate a response using the LiquidAI LFM2-8B model
Args:
message: The current user message
history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}]
system_message: System prompt to guide the model
max_new_tokens: Maximum tokens to generate
temperature: Sampling temperature
min_p: Minimum probability threshold
repetition_penalty: Penalty for repetition
Yields:
Generated text tokens (streaming)
"""
    if not message.strip():
        yield "Please enter a message."
        return

    # Move model to GPU (handled by ZeroGPU)
    model.to("cuda")

    # Build conversation history
    messages = []

    # Add system message if provided
    if system_message.strip():
        messages.append({"role": "system", "content": system_message})

    # Add chat history
    for msg in history:
        messages.append(msg)

    # Add current user message
    messages.append({"role": "user", "content": message})

    # Prepare input
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    # Generate response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
    )
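
    # model.generate blocks until generation completes, so it runs in a background
    # thread while the streamer yields decoded text to the UI as tokens are produced.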
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the output
    streamer_output = ""
    for new_text in streamer:
        streamer_output += new_text
        yield streamer_output

    thread.join()

# Create Gradio ChatInterface
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="LiquidAI LFM2-8B Chat",
    description="""
    Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU.

    This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.

    💡 **Tip:** The first response may take a moment as the GPU is allocated. The model excels at:
    - Instruction following
    - Math and reasoning
    - Multi-turn conversations
    - Agentic tasks and data extraction

    ⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
    """,
    theme=gr.themes.Soft(),
    examples=[
        ["What is C. elegans?"],
        ["Explain quantum entanglement in simple terms."],
        ["Write a short poem about artificial intelligence."],
        ["What are the main differences between Python and JavaScript?"],
        ["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ["Help me plan a 3-day itinerary for visiting Paris."],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant trained by Liquid AI.",
            label="System Message",
            info="Set the behavior and personality of the assistant",
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
            info="Maximum length of generated response",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature",
            info="Higher values make output more random (recommended: 0.3)",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Min P",
            info="Minimum probability threshold for sampling (recommended: 0.15)",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.05,
            step=0.05,
            label="Repetition Penalty",
            info="Penalty for repeating tokens (recommended: 1.05)",
        ),
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()
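
# Minimal local smoke test (assumption: the dependencies listed in requirements.txt are installed):
#   pip install -r requirements.txt
#   python app.py
# On Hugging Face Spaces the app is started automatically and GPU allocation is
# handled by the @spaces.GPU decorator above.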