import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"

# NOTE: This model requires transformers installed from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
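
# For reference, a minimal requirements.txt for this Space could look like the
# sketch below. Only the pinned transformers commit comes from the note above;
# the other package names are assumptions based on what this app imports:
#
#   git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
#   torch
#   gradio
#   spaces
#   accelerate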

# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

# Load model globally (will be moved to the GPU by the ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",  # Load on CPU first; ZeroGPU moves it to the GPU at request time
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    # attn_implementation="flash_attention_2",  # Uncomment if you have a compatible GPU
)
print("Model loaded successfully!")

@spaces.GPU
def generate_response(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_new_tokens: int,
    temperature: float,
    min_p: float,
    repetition_penalty: float,
):
| """ | |
| Generate a response using the LiquidAI LFM2-8B model | |
| Args: | |
| message: The current user message | |
| history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}] | |
| system_message: System prompt to guide the model | |
| max_new_tokens: Maximum tokens to generate | |
| temperature: Sampling temperature | |
| min_p: Minimum probability threshold | |
| repetition_penalty: Penalty for repetition | |
| Yields: | |
| Generated text tokens (streaming) | |
| """ | |
    if not message.strip():
        yield "Please enter a message."
        return

    # Move the model to the GPU (allocated by ZeroGPU for this call)
    model.to("cuda")

    # Build the conversation history
    messages = []

    # Add the system message if provided
    if system_message.strip():
        messages.append({"role": "system", "content": system_message})

    # Add chat history
    for msg in history:
        messages.append(msg)

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Prepare input ids
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    # Generate the response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the accumulated output back to the UI
    streamer_output = ""
    for new_text in streamer:
        streamer_output += new_text
        yield streamer_output

    thread.join()

# Create the Gradio ChatInterface
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="LiquidAI LFM2-8B Chat",
    description="""
    Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU.

    This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.

    💡 **Tip:** The first response may take a moment while the GPU is allocated. The model excels at:
    - Instruction following
    - Math and reasoning
    - Multi-turn conversations
    - Agentic tasks and data extraction

    ⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
    """,
    theme=gr.themes.Soft(),
    examples=[
        ["What is C. elegans?"],
        ["Explain quantum entanglement in simple terms."],
        ["Write a short poem about artificial intelligence."],
        ["What are the main differences between Python and JavaScript?"],
        ["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ["Help me plan a 3-day itinerary for visiting Paris."],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant trained by Liquid AI.",
            label="System Message",
            info="Set the behavior and personality of the assistant",
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
            info="Maximum length of the generated response",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature",
            info="Higher values make output more random (recommended: 0.3)",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Min P",
            info="Minimum probability threshold for sampling (recommended: 0.15)",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.05,
            step=0.05,
            label="Repetition Penalty",
            info="Penalty for repeating tokens (recommended: 1.05)",
        ),
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()