import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread

# Model configuration
MODEL_ID = "LiquidAI/LFM2-8B-A1B"

# NOTE: This model requires transformers installed from source.
# Add this to your requirements.txt:
# git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
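
# For reference, a minimal requirements.txt for this Space could look like the
# sketch below. Only the pinned transformers commit comes from the note above;
# the other package names are assumptions based on what this app imports:
#
#   git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
#   torch
#   gradio
#   spaces
#   accelerate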

# Load tokenizer globally (doesn't need GPU)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=False)

# Load model globally (will be moved to the GPU by the ZeroGPU decorator)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cpu",  # Load on CPU first; ZeroGPU moves it to the GPU at request time
    torch_dtype=torch.bfloat16,
    trust_remote_code=False,
    # attn_implementation="flash_attention_2",  # Uncomment if you have a compatible GPU
)
print("Model loaded successfully!")

@spaces.GPU
def generate_response(
    message: str,
    history: list[dict[str, str]],
    system_message: str,
    max_new_tokens: int,
    temperature: float,
    min_p: float,
    repetition_penalty: float,
):
| """ | |
| Generate a response using the LiquidAI LFM2-8B model | |
| Args: | |
| message: The current user message | |
| history: Chat history in the format [{"role": "user"/"assistant", "content": "..."}] | |
| system_message: System prompt to guide the model | |
| max_new_tokens: Maximum tokens to generate | |
| temperature: Sampling temperature | |
| min_p: Minimum probability threshold | |
| repetition_penalty: Penalty for repetition | |
| Yields: | |
| Generated text tokens (streaming) | |
| """ | |
    if not message.strip():
        yield "Please enter a message."
        return

    # Move the model to the GPU (allocated by ZeroGPU for this call)
    model.to("cuda")

    # Build the conversation history
    messages = []

    # Add the system message if provided
    if system_message.strip():
        messages.append({"role": "system", "content": system_message})

    # Add chat history
    for msg in history:
        messages.append(msg)

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Prepare input ids
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    # Generate the response with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        min_p=min_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_new_tokens,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the accumulated output back to the UI
    streamer_output = ""
    for new_text in streamer:
        streamer_output += new_text
        yield streamer_output

    thread.join()

# Create the Gradio ChatInterface
demo = gr.ChatInterface(
    generate_response,
    type="messages",
    title="LiquidAI LFM2-8B Chat",
    description="""
    Chat with the **LiquidAI LFM2-8B-A1B** model using ZeroGPU.

    This is a hybrid MoE model with 8.3B total parameters and 1.5B active parameters, optimized for edge AI deployment.

    💡 **Tip:** The first response may take a moment while the GPU is allocated. The model excels at:
    - Instruction following
    - Math and reasoning
    - Multi-turn conversations
    - Agentic tasks and data extraction

    ⚠️ **Note:** This model is best suited for narrow use cases. It may not perform well on knowledge-intensive tasks.
    """,
    theme=gr.themes.Soft(),
    examples=[
        ["What is C. elegans?"],
        ["Explain quantum entanglement in simple terms."],
        ["Write a short poem about artificial intelligence."],
        ["What are the main differences between Python and JavaScript?"],
        ["Solve this math problem: If a train travels 120 miles in 2 hours, what is its average speed?"],
        ["Help me plan a 3-day itinerary for visiting Paris."],
    ],
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant trained by Liquid AI.",
            label="System Message",
            info="Set the behavior and personality of the assistant",
        ),
        gr.Slider(
            minimum=64,
            maximum=2048,
            value=512,
            step=64,
            label="Max New Tokens",
            info="Maximum length of the generated response",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label="Temperature",
            info="Higher values make output more random (recommended: 0.3)",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.05,
            label="Min P",
            info="Minimum probability threshold for sampling (recommended: 0.15)",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.05,
            step=0.05,
            label="Repetition Penalty",
            info="Penalty for repeating tokens (recommended: 1.05)",
        ),
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()