import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

BASE = "openlm-research/open_llama_3b"
LORA = "GilbertAkham/openlm-llama-lora-codetrans"

# ---- FIX HERE ----
# use_fast=False is a tokenizer argument; it must be passed here, not to the model
tokenizer = AutoTokenizer.from_pretrained(
    BASE,
    use_fast=False  # MUST be here, not on model
)

# Load the base model in 8-bit and let accelerate place it on available devices
model = AutoModelForCausalLM.from_pretrained(
    BASE,
    load_in_8bit=True,
    device_map="auto"
)
# ------------------

# Attach the LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(model, LORA)
model.eval()


def chat_fn(prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),  # Gradio sliders return floats
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)


demo = gr.Interface(
    fn=chat_fn,
    inputs=[
        gr.Textbox(lines=6, label="Prompt"),
        gr.Slider(16, 1024, 256, label="Max new tokens"),
    ],
    outputs="text",
    title="openlm-llama-LoRA codetrans",
)

demo.launch(share=True)
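
# Note (not part of the original app): recent transformers releases deprecate the bare
# load_in_8bit=True argument to from_pretrained in favour of an explicit quantization
# config. A minimal sketch of the equivalent loading call, assuming a transformers
# version with BitsAndBytesConfig and bitsandbytes installed:
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       BASE,
#       quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#       device_map="auto",
#   )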