import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"

# NOTE: This model requires transformers from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
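#
# A minimal requirements.txt for this Space might look like the sketch below
# (the pinned transformers commit comes from the note above; the other package
# names are assumptions about a typical ZeroGPU Gradio setup):
#   git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
#   torch
#   gradio
#   spaces
#   accelerate  # needed for device_map in from_pretrained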

# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

# Load model globally (will be moved to GPU by ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",  # Load on CPU first, ZeroGPU will move it to GPU
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    # attn_implementation="flash_attention_2"  # Uncomment if you have compatible GPU
)
print("Model loaded successfully!")

@spaces.GPU(duration=120)
def generate_response(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_new_tokens: int,
    temperature: float,
    min_p: float,
    repetition_penalty: float,
):
    """
    Generate a response using the LiquidAI LFM2-8B-A1B model.
    
    Args:
        message: The current user message
        history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}]
        system_message: System prompt to guide the model
        max_new_tokens: Maximum tokens to generate
        temperature: Sampling temperature
        min_p: Minimum probability threshold
        repetition_penalty: Penalty for repetition
    
    Yields:
        Generated text tokens (streaming)
    """
    if not message.strip():
        yield "Please enter a message."
        return
    
    # Move model to the GPU allocated by ZeroGPU
    model.to("cuda")
    
    # Build conversation history
    messages = []
    
    # Add system message if provided
    if system_message.strip():
        messages.append({"role": "system", "content": system_message})
    
    # Add chat history
    messages.extend(history)
    
    # Add current user message
    messages.append({"role": "user", "content": message})
    
    # Prepare input
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)
    
    # Generate response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
    )
    
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
    # Stream the output
    streamer_output = ""
    for new_text in streamer:
        streamer_output += new_text
        yield streamer_output
    
    thread.join()

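# Quick local smoke test (assumption: run on a machine with a CUDA GPU; outside a
# ZeroGPU Space the @spaces.GPU decorator should have no effect, but model.to("cuda")
# still requires a GPU). Each yield is the cumulative text generated so far:
#
#   chunks = list(generate_response("Hello!", [], "You are a helpful assistant.", 64, 0.3, 0.15, 1.05))
#   print(chunks[-1])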

# Create Gradio ChatInterface
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="🌊 LiquidAI LFM2-8B Chat",
    description="""
    Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU. 
    This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.
    
    💡 **Tip:** The first response may take a moment as the GPU is allocated. The model excels at:
    - Instruction following
    - Math and reasoning
    - Multi-turn conversations
    - Agentic tasks and data extraction
    
    ⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
    """,
    theme=gr.themes.Soft(),
    examples=[
        ["What is C. elegans?"],
        ["Explain quantum entanglement in simple terms."],
        ["Write a short poem about artificial intelligence."],
        ["What are the main differences between Python and JavaScript?"],
        ["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ["Help me plan a 3-day itinerary for visiting Paris."],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant trained by Liquid AI.",
            label="System Message",
            info="Set the behavior and personality of the assistant"
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
            info="Maximum length of generated response"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature",
            info="Higher values make output more random (recommended: 0.3)"
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Min P",
            info="Minimum probability threshold for sampling (recommended: 0.15)"
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.05,
            step=0.05,
            label="Repetition Penalty",
            info="Penalty for repeating tokens (recommended: 1.05)"
        ),
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()