import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import psutil
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
from huggingface_hub import login
import numpy as np
# Streamlit app configuration
st.set_page_config(page_title="DeepSeek Tuning App", layout="wide")
st.title("DeepSeek Model Tuning for RAM and Context Length")

# Sidebar for user inputs
st.sidebar.header("Configuration")
model_choice = st.sidebar.selectbox(
    "Select DeepSeek Model",
    ["deepseek-ai/deepseek-v2", "deepseek-ai/deepseek-coder-6.7b-instruct"],
    help="Select an available DeepSeek model."
)
context_length = st.sidebar.slider("Max Context Length", 1024, 16384, 4096, step=1024)
quantization = st.sidebar.checkbox("Enable 4-bit Quantization", value=True)
hf_token = st.sidebar.text_input("Hugging Face Token (optional)", type="password")
run_button = st.sidebar.button("Run Model")
# Function to get RAM usage
def get_ram_usage():
    return psutil.virtual_memory().percent
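# Note: psutil.virtual_memory().percent reports system-wide CPU RAM only; it does not
# capture GPU memory. As a minimal sketch (assuming a CUDA device is present), GPU usage
# could additionally be read with the standard PyTorch API, e.g.:
#   torch.cuda.memory_allocated() / 1e9  # GB currently allocated on the default GPU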
# Function to install and load the model
def load_model(model_name, quantize=False, token=None):
    try:
        if token:
            st.write("Logging in to Hugging Face with provided token...")
            login(token)
        st.write(f"Loading {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=token)
        if quantize and torch.cuda.is_available():
            # BitsAndBytesConfig is provided by transformers; bitsandbytes is the backend
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                quantization_config=bnb_config,
                device_map="auto",  # requires the accelerate package
                token=token
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
                device_map="auto",  # requires the accelerate package
                token=token
            )
        return model, tokenizer
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.write("Please verify the model name on https://huggingface.co/models or provide a valid token.")
        return None, None
# Function to tune and run inference
def run_inference(model, tokenizer, context_len):
    ram_usages = []
    inference_times = []
    prompt = "Write a detailed essay about artificial intelligence advancements." * (context_len // 50)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=context_len)
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")
    start_time = time.time()
    ram_before = get_ram_usage()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)
    ram_after = get_ram_usage()
    inference_time = time.time() - start_time
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    ram_usages.extend([ram_before, ram_after])
    inference_times.append(inference_time)
    return result, ram_usages, inference_times
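# Note: RAM is sampled only once before and once after a single generate() call, so the
# chart below shows two snapshots rather than peak usage during generation. Sampling
# get_ram_usage() at intervals (e.g. from a background thread) would be needed to report
# a peak figure; that extension is not implemented here.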
# Visualization function
def plot_results(ram_usages, inference_times, context_len):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    # RAM Usage Plot
    sns.barplot(x=["Before", "After"], y=ram_usages, ax=ax1)
    ax1.set_title(f"RAM Usage (%) - Context Length: {context_len}")
    ax1.set_ylabel("RAM Usage (%)")
    # Inference Time Plot
    sns.barplot(x=["Inference"], y=inference_times, ax=ax2)
    ax2.set_title("Inference Time (seconds)")
    ax2.set_ylabel("Time (s)")
    st.pyplot(fig)
# Main execution
if run_button:
    with st.spinner("Installing and tuning the model..."):
        # Install bitsandbytes if quantization is enabled
        if quantization and not os.path.exists("./bnb_installed"):
            st.write("Installing bitsandbytes for quantization...")
            os.system("pip install bitsandbytes")
            with open("./bnb_installed", "w") as f:
                f.write("installed")

        # Load model
        model, tokenizer = load_model(model_choice, quantization, hf_token if hf_token else None)
        if model is None or tokenizer is None:
            st.stop()

        # Tune for max RAM and context length
        st.write(f"Tuning {model_choice} with context length {context_length}...")

        # Run inference
        result, ram_usages, inference_times = run_inference(model, tokenizer, context_length)

    # Display results
    st.subheader("Generated Output")
    st.write(result)
    st.subheader("Performance Metrics")
    plot_results(ram_usages, inference_times, context_length)

    # Additional info
    st.write(f"Max Context Length Used: {context_length}")
    st.write(f"Quantization Enabled: {quantization}")
    st.write(f"Average RAM Usage: {np.mean(ram_usages):.2f}%")
    st.write(f"Inference Time: {inference_times[0]:.2f} seconds")
# Instructions for user
st.markdown("""
### Instructions
1. Select a DeepSeek model from the sidebar.
2. Adjust the context length (higher values use more RAM).
3. Enable quantization to reduce RAM usage (optional).
4. Provide a Hugging Face token if the model is private.
5. Click 'Run Model' to install, tune, and visualize results.

**Note:** Ensure the model name is correct and accessible.
""")