"""AI Document Assistant.

A Gradio chat application: upload documents (PDF, DOCX, TXT, CSV) and
chat about their contents with a Hugging Face Inference API model.
Requires the ``HF_TOKEN`` environment variable (Space repository secret).
"""

import json
import os
from datetime import datetime
from pathlib import Path

import gradio as gr
from huggingface_hub import InferenceClient

# Optional extraction libraries — each probed independently so the app
# still runs (with reduced functionality) when one is missing.
try:
    from pypdf import PdfReader
    PYPDF_AVAILABLE = True
except ImportError:
    PYPDF_AVAILABLE = False

try:
    import docx
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False

try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

# Model configurations (Inference API model ids offered in the dropdown)
MODELS = [
    "Qwen/Qwen2.5-72B-Instruct",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "google/gemma-2-9b-it",
    "microsoft/Phi-3-mini-4k-instruct",
]

SYSTEM_PROMPTS = {
    "Default": "You are a helpful, respectful and honest assistant.",
    "Document Analyzer": "You are an expert at analyzing documents. Provide detailed insights, summaries, and answer questions based on the provided document content.",
    "Code Expert": "You are an expert programmer. Analyze code, provide explanations, and suggest improvements.",
    "Data Scientist": "You are a data science expert. Analyze data files and provide insights with statistical analysis.",
    "Research Assistant": "You are a research assistant. Help analyze academic papers and documents, extract key findings.",
}


def extract_text_from_pdf(file_path):
    """Extract all page text from a PDF file.

    Args:
        file_path: Path to the PDF on disk.

    Returns:
        Extracted text with per-page headers, or an error/unavailable
        message string (this function never raises).
    """
    if not PYPDF_AVAILABLE:
        return "āŒ PDF extraction unavailable."
    try:
        reader = PdfReader(file_path)
        text = f"šŸ“„ PDF: {len(reader.pages)} pages\n\n"
        for page_num, page in enumerate(reader.pages, 1):
            page_text = page.extract_text()
            text += f"--- Page {page_num} ---\n{page_text}\n\n"
        return text
    except Exception as e:
        return f"āŒ Error reading PDF: {str(e)}"


def extract_text_from_docx(file_path):
    """Extract paragraph text from a DOCX file.

    Returns the non-empty paragraphs joined by blank lines, or an
    error/unavailable message string (never raises).
    """
    if not DOCX_AVAILABLE:
        return "āŒ DOCX extraction unavailable."
    try:
        doc = docx.Document(file_path)
        return "\n\n".join([p.text for p in doc.paragraphs if p.text.strip()])
    except Exception as e:
        return f"āŒ Error reading DOCX: {str(e)}"


def extract_text_from_txt(file_path):
    """Read a plain-text file, falling back from UTF-8 to Latin-1.

    Returns the file contents, or an error message string (never raises).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte, so this fallback cannot re-raise a
        # decode error on arbitrary binary-ish text.
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read()
    except Exception as e:
        return f"āŒ Error: {str(e)}"


def extract_text_from_csv(file_path):
    """Summarize a CSV file: shape, columns, preview, and statistics.

    Returns a human-readable summary, or an error/unavailable message
    string (never raises).
    """
    if not PANDAS_AVAILABLE:
        return "āŒ CSV extraction unavailable."
    try:
        df = pd.read_csv(file_path)
        text = f"šŸ“Š CSV: {len(df)} rows, {len(df.columns)} columns\n\n"
        text += f"Columns: {', '.join(df.columns)}\n\n"
        text += f"Preview (first 10 rows):\n{df.head(10).to_string()}\n\n"
        text += f"Statistics:\n{df.describe().to_string()}"
        return text
    except Exception as e:
        return f"āŒ Error: {str(e)}"


def process_files(files):
    """Extract text from each uploaded file, dispatching on extension.

    Args:
        files: List of file paths (as provided by the multimodal textbox).

    Returns:
        A single formatted string with all documents' contents, or ""
        when no files were given.
    """
    if not files:
        return ""
    content = "\n\n" + "="*50 + "\nšŸ“Ž UPLOADED DOCUMENTS\n" + "="*50 + "\n\n"
    for file_path in files:
        file_name = Path(file_path).name
        file_ext = Path(file_path).suffix.lower()
        content += f"\nšŸ“„ **{file_name}**\n\n"
        if file_ext == '.pdf':
            text = extract_text_from_pdf(file_path)
        elif file_ext in ['.docx', '.doc']:
            text = extract_text_from_docx(file_path)
        elif file_ext in ['.txt', '.md', '.py', '.json']:
            text = extract_text_from_txt(file_path)
        elif file_ext == '.csv':
            text = extract_text_from_csv(file_path)
        else:
            text = f"āš ļø Unsupported format: {file_ext}"
        content += text + "\n\n" + "-"*50 + "\n"
    return content


def convert_history_to_messages(history):
    """Convert tuple-format chat history to OpenAI-style message dicts.

    Args:
        history: List of ``[user_msg, assistant_msg]`` pairs; either side
            may be falsy and is then skipped.

    Returns:
        List of ``{"role": ..., "content": ...}`` dicts in turn order.
    """
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    return messages


def respond(message, history, system_message, max_tokens,
            temperature, top_p, model_id):
    """Main chat function (tuple-format history). Streams the reply.

    Generator: yields ``(history, None)`` pairs so Gradio updates the
    chatbot incrementally and clears the input box.
    """
    # Parse the (possibly multimodal) input up front so ``user_text`` is
    # always bound, including on the error paths below.
    if isinstance(message, dict):
        user_text = message.get("text", "")
        files = message.get("files", [])
    else:
        user_text = str(message) if message else ""
        files = []

    token = os.getenv("HF_TOKEN")
    if not token:
        # BUG FIX: this function is a generator, so the original
        # ``return history, None`` was swallowed by StopIteration and the
        # warning never reached the UI — it must be yielded.
        history.append([user_text, "āš ļø HF_TOKEN not configured. Please set it in Space settings → Repository secrets."])
        yield history, None
        return

    if not user_text.strip() and not files:
        # Nothing to do; still yield so the outputs are refreshed.
        yield history, None
        return

    # Append the user's turn (empty assistant slot) BEFORE the try block,
    # so the except handler below can safely overwrite history[-1] —
    # the original could clobber the previous turn or IndexError when the
    # client raised before the append.
    history.append([user_text, ""])

    try:
        client = InferenceClient(token=token, model=model_id)

        # Build the message list from every *previous* turn (the current,
        # still-unanswered turn is history[-1] and is added separately
        # with the extracted file contents attached).
        messages = [{"role": "system", "content": system_message}]
        messages.extend(convert_history_to_messages(history[:-1]))

        file_content = process_files(files) if files else ""
        messages.append({"role": "user", "content": user_text + file_content})

        # Stream the completion, updating the last turn token by token.
        response = ""
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            if chunk.choices and chunk.choices[0].delta.content:
                response += chunk.choices[0].delta.content
                history[-1] = [user_text, response]
                yield history, None

        # BUG FIX: guarantee at least one yield even when the stream
        # produced no content chunks, so the UI shows the user's turn.
        yield history, None

    except Exception as e:
        error_msg = f"āŒ Error: {str(e)}\n\nTry a different model or check token permissions."
        history[-1] = [user_text, error_msg]
        yield history, None


def save_conversation(history):
    """Save the chat history to a timestamped JSON file.

    Returns a status message string (never raises).
    """
    if not history:
        return "āš ļø No conversation to save"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"chat_{timestamp}.json"
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(history, f, indent=2, ensure_ascii=False)
        # BUG FIX: the original success message did not interpolate the
        # file name (it was the literal text "(unknown)").
        return f"āœ… Saved to {filename}"
    except Exception as e:
        return f"āŒ Error: {str(e)}"


def update_system_prompt(preset):
    """Return the system prompt for *preset*, defaulting to "Default"."""
    return SYSTEM_PROMPTS.get(preset, SYSTEM_PROMPTS["Default"])


def clear_chat():
    """Reset chatbot history, input box, and status field."""
    return [], None, ""


# Custom theme
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
)

# Build interface
# BUG FIX: custom_theme was constructed but never passed to gr.Blocks.
with gr.Blocks(title="AI Document Assistant", theme=custom_theme) as demo:
    gr.Markdown(
        """
        # šŸ¤– AI Document Assistant
        Upload documents (PDF, DOCX, TXT, CSV) and chat with AI
        """
    )

    with gr.Row():
        with gr.Column(scale=3):
            # Uses the default tuple history format (no 'type' parameter).
            chatbot = gr.Chatbot(
                height=550,
                show_label=False,
                avatar_images=(None, "https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png"),
            )
            msg_input = gr.MultimodalTextbox(
                file_count="multiple",
                file_types=[".pdf", ".docx", ".txt", ".csv", ".md", ".py", ".json"],
                placeholder="šŸ’¬ Ask a question or upload documents (PDF, DOCX, TXT, CSV)...",
                show_label=False,
            )
            with gr.Row():
                clear_btn = gr.Button("šŸ—‘ļø Clear Chat", variant="stop")

        with gr.Column(scale=1):
            gr.Markdown("### āš™ļø Settings")
            model_dropdown = gr.Dropdown(
                choices=MODELS,
                value=MODELS[0],
                label="šŸ¤– Model",
            )
            preset_dropdown = gr.Dropdown(
                choices=list(SYSTEM_PROMPTS.keys()),
                value="Document Analyzer",
                label="šŸ“‹ Preset",
            )
            system_prompt = gr.Textbox(
                value=SYSTEM_PROMPTS["Document Analyzer"],
                label="šŸ’¬ System Prompt",
                lines=3,
            )
            gr.Markdown("### šŸŽ›ļø Parameters")
            max_tokens = gr.Slider(128, 4096, 2048, step=128, label="Max Tokens")
            temperature = gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(0.1, 1.0, 0.95, step=0.05, label="Top-p")
            gr.Markdown("### šŸ’¾ Save")
            save_btn = gr.Button("šŸ’¾ Save Chat")
            status = gr.Textbox(show_label=False, interactive=False, placeholder="Status...")

    gr.Markdown(
        """
        **šŸ’” How to use:**
        1. Upload PDF, DOCX, TXT, or CSV files
        2. Ask questions about the document content
        3. Adjust temperature for creativity (lower = focused, higher = creative)

        **šŸ“¦ Supported libraries:** pypdf, python-docx, pandas
        """
    )

    # Events
    preset_dropdown.change(update_system_prompt, [preset_dropdown], [system_prompt])
    msg_input.submit(
        respond,
        [msg_input, chatbot, system_prompt, max_tokens, temperature, top_p, model_dropdown],
        [chatbot, msg_input],
    )
    clear_btn.click(clear_chat, None, [chatbot, msg_input, status])
    save_btn.click(save_conversation, [chatbot], [status])


if __name__ == "__main__":
    demo.queue()
    demo.launch()