```python
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

BASE = "openlm-research/open_llama_3b"
LORA = "GilbertAkham/openlm-llama-lora-codetrans"

# ---- FIX HERE ----
tokenizer = AutoTokenizer.from_pretrained(
    BASE,
    use_fast=False,  # MUST be passed to the tokenizer, not the model
)

# 8-bit loading requires the bitsandbytes package to be installed.
model = AutoModelForCausalLM.from_pretrained(
    BASE,
    load_in_8bit=True,
    device_map="auto",
)
# ------------------
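# Note: on recent transformers releases, load_in_8bit= on from_pretrained is
# deprecated in favor of an explicit BitsAndBytesConfig; the equivalent call
# would be roughly:
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       BASE,
#       quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#       device_map="auto",
#   )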

# Attach the LoRA adapter on top of the quantized base model.
model = PeftModel.from_pretrained(model, LORA)
model.eval()

def chat_fn(prompt, max_new_tokens=256):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),  # Gradio sliders pass floats
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)
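
# Note: decode(out[0]) returns the prompt plus the completion. If you only
# want the newly generated text, slice off the prompt tokens first:
#   new_tokens = out[0][inputs["input_ids"].shape[1]:]
#   return tokenizer.decode(new_tokens, skip_special_tokens=True)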

demo = gr.Interface(
    fn=chat_fn,
    inputs=[
        gr.Textbox(lines=6, label="Prompt"),
        gr.Slider(16, 1024, 256, label="Max new tokens"),
    ],
    outputs="text",
    title="openlm-llama-LoRA codetrans",
)

demo.launch(share=True)  # share=True only matters locally; Spaces ignores it
```
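
For a quick sanity check without the UI, you can call `chat_fn` directly before wiring up Gradio; a minimal sketch (the prompt below is just a placeholder, not from the original Space):

```python
# Hypothetical smoke test: run before demo.launch() when debugging locally.
print(chat_fn(
    "Translate this Python function to JavaScript:\n"
    "def add(a, b):\n"
    "    return a + b\n",
    max_new_tokens=64,
))
```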