|
|
|
|
|
""" |
|
|
GAIA Agent Production Interface |
|
|
Production-ready Gradio app for the GAIA benchmark agent system with Unit 4 API integration |
|
|
""" |
|
|
|
|
|
import os |
|
|
import gradio as gr |
|
|
import logging |
|
|
import time |
|
|
import requests |
|
|
import pandas as pd |
|
|
from typing import Optional, Tuple, Dict, Any |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
import json |
|
|
from datetime import datetime |
|
|
import csv |
|
|
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
from workflow.gaia_workflow import SimpleGAIAWorkflow, create_gaia_workflow |
|
|
from models.qwen_client import QwenClient |
|
|
from agents.state import GAIAAgentState |
|
|
|
|
|
|
|
|
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" |
|
|
|
|
|
class GAIAResultLogger: |
|
|
""" |
|
|
Logger for GAIA evaluation results with export functionality |
|
|
""" |
|
|
|
|
|
def __init__(self): |
|
|
self.results_dir = Path("results") |
|
|
self.results_dir.mkdir(exist_ok=True) |
|
|
|
|
|
def log_evaluation_results(self, username: str, questions_data: list, results_log: list, |
|
|
final_result: dict, execution_time: float) -> dict: |
|
|
""" |
|
|
Log complete evaluation results to multiple formats |
|
|
Returns paths to generated files |
|
|
""" |
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
base_filename = f"gaia_evaluation_{username}_{timestamp}" |
|
|
|
|
|
files_created = {} |
|
|
|
|
|
try: |
|
|
|
|
|
csv_path = self.results_dir / f"{base_filename}.csv" |
|
|
self._save_csv_results(csv_path, results_log, final_result) |
|
|
files_created["csv"] = str(csv_path) |
|
|
|
|
|
|
|
|
json_path = self.results_dir / f"{base_filename}.json" |
|
|
detailed_results = self._create_detailed_results( |
|
|
username, questions_data, results_log, final_result, execution_time, timestamp |
|
|
) |
|
|
self._save_json_results(json_path, detailed_results) |
|
|
files_created["json"] = str(json_path) |
|
|
|
|
|
|
|
|
summary_path = self.results_dir / f"{base_filename}_summary.md" |
|
|
self._save_summary_report(summary_path, detailed_results) |
|
|
files_created["summary"] = str(summary_path) |
|
|
|
|
|
logger.info(f"✅ Results logged to {len(files_created)} files: {list(files_created.keys())}") |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"❌ Error logging results: {e}") |
|
|
files_created["error"] = str(e) |
|
|
|
|
|
return files_created |
|
|
|
|
|
def _save_csv_results(self, path: Path, results_log: list, final_result: dict): |
|
|
"""Save results in CSV format for easy sharing""" |
|
|
with open(path, 'w', newline='', encoding='utf-8') as csvfile: |
|
|
if not results_log: |
|
|
return |
|
|
|
|
|
fieldnames = list(results_log[0].keys()) + ['Correct', 'Score'] |
|
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames) |
|
|
|
|
|
|
|
|
writer.writeheader() |
|
|
|
|
|
|
|
|
score = final_result.get('score', 'N/A') |
|
|
correct_count = final_result.get('correct_count', 'N/A') |
|
|
total_attempted = final_result.get('total_attempted', len(results_log)) |
|
|
|
|
|
|
|
|
for i, row in enumerate(results_log): |
|
|
row_data = row.copy() |
|
|
row_data['Correct'] = 'Unknown' |
|
|
row_data['Score'] = f"{score}% ({correct_count}/{total_attempted})" if i == 0 else "" |
|
|
writer.writerow(row_data) |
|
|
|
|
|
def _create_detailed_results(self, username: str, questions_data: list, results_log: list, |
|
|
final_result: dict, execution_time: float, timestamp: str) -> dict: |
|
|
"""Create comprehensive results dictionary""" |
|
|
return { |
|
|
"metadata": { |
|
|
"username": username, |
|
|
"timestamp": timestamp, |
|
|
"execution_time_seconds": execution_time, |
|
|
"total_questions": len(questions_data), |
|
|
"total_processed": len(results_log), |
|
|
"system_info": { |
|
|
"gradio_version": "4.44.0", |
|
|
"python_version": "3.x", |
|
|
"space_id": os.getenv("SPACE_ID", "local"), |
|
|
"space_host": os.getenv("SPACE_HOST", "local") |
|
|
} |
|
|
}, |
|
|
"evaluation_results": { |
|
|
"overall_score": final_result.get('score', 'N/A'), |
|
|
"correct_count": final_result.get('correct_count', 'N/A'), |
|
|
"total_attempted": final_result.get('total_attempted', len(results_log)), |
|
|
"success_rate": f"{final_result.get('score', 0)}%", |
|
|
"api_message": final_result.get('message', 'No message'), |
|
|
"submission_successful": 'score' in final_result |
|
|
}, |
|
|
"question_details": [ |
|
|
{ |
|
|
"index": i + 1, |
|
|
"task_id": item.get("task_id"), |
|
|
"question": item.get("question"), |
|
|
"level": item.get("Level", "Unknown"), |
|
|
"file_name": item.get("file_name", ""), |
|
|
"submitted_answer": next( |
|
|
(r["Submitted Answer"] for r in results_log if r.get("Task ID") == item.get("task_id")), |
|
|
"No answer" |
|
|
), |
|
|
"question_length": len(item.get("question", "")), |
|
|
"answer_length": len(next( |
|
|
(r["Submitted Answer"] for r in results_log if r.get("Task ID") == item.get("task_id")), |
|
|
"" |
|
|
)) |
|
|
} |
|
|
for i, item in enumerate(questions_data) |
|
|
], |
|
|
"processing_summary": { |
|
|
"questions_by_level": self._analyze_questions_by_level(questions_data), |
|
|
"questions_with_files": len([q for q in questions_data if q.get("file_name")]), |
|
|
"average_question_length": sum(len(q.get("question", "")) for q in questions_data) / len(questions_data) if questions_data else 0, |
|
|
"average_answer_length": sum(len(r.get("Submitted Answer", "")) for r in results_log) / len(results_log) if results_log else 0, |
|
|
"processing_time_per_question": execution_time / len(results_log) if results_log else 0 |
|
|
}, |
|
|
"raw_results_log": results_log, |
|
|
"api_response": final_result |
|
|
} |
|
|
|
|
|
def _analyze_questions_by_level(self, questions_data: list) -> dict: |
|
|
"""Analyze question distribution by level""" |
|
|
level_counts = {} |
|
|
for q in questions_data: |
|
|
level = q.get("Level", "Unknown") |
|
|
level_counts[level] = level_counts.get(level, 0) + 1 |
|
|
return level_counts |
|
|
|
|
|
def _save_json_results(self, path: Path, detailed_results: dict): |
|
|
"""Save detailed results in JSON format""" |
|
|
with open(path, 'w', encoding='utf-8') as jsonfile: |
|
|
json.dump(detailed_results, jsonfile, indent=2, ensure_ascii=False) |
|
|
|
|
|
def _save_summary_report(self, path: Path, detailed_results: dict): |
|
|
"""Save human-readable summary report""" |
|
|
metadata = detailed_results["metadata"] |
|
|
results = detailed_results["evaluation_results"] |
|
|
summary = detailed_results["processing_summary"] |
|
|
|
|
|
report = f"""# GAIA Agent Evaluation Report |
|
|
|
|
|
## Summary |
|
|
- **User**: {metadata['username']} |
|
|
- **Date**: {metadata['timestamp']} |
|
|
- **Overall Score**: {results['overall_score']}% ({results['correct_count']}/{results['total_attempted']} correct) |
|
|
- **Execution Time**: {metadata['execution_time_seconds']:.2f} seconds |
|
|
- **Submission Status**: {'✅ Success' if results['submission_successful'] else '❌ Failed'} |
|
|
|
|
|
## Question Analysis |
|
|
- **Total Questions**: {metadata['total_questions']} |
|
|
- **Successfully Processed**: {metadata['total_processed']} |
|
|
- **Questions with Files**: {summary['questions_with_files']} |
|
|
- **Average Question Length**: {summary['average_question_length']:.0f} characters |
|
|
- **Average Answer Length**: {summary['average_answer_length']:.0f} characters |
|
|
- **Processing Time per Question**: {summary['processing_time_per_question']:.2f} seconds |
|
|
|
|
|
## Questions by Level |
|
|
""" |
|
|
|
|
|
for level, count in summary['questions_by_level'].items(): |
|
|
report += f"- **Level {level}**: {count} questions\n" |
|
|
|
|
|
report += f""" |
|
|
## API Response |
|
|
{results['api_message']} |
|
|
|
|
|
## System Information |
|
|
- **Space ID**: {metadata['system_info']['space_id']} |
|
|
- **Space Host**: {metadata['system_info']['space_host']} |
|
|
- **Gradio Version**: {metadata['system_info']['gradio_version']} |
|
|
|
|
|
--- |
|
|
*Report generated automatically by GAIA Agent System* |
|
|
""" |
|
|
|
|
|
with open(path, 'w', encoding='utf-8') as f: |
|
|
f.write(report) |
|
|
|
|
|
def get_latest_results(self, username: str = None) -> list: |
|
|
"""Get list of latest result files""" |
|
|
pattern = f"gaia_evaluation_{username}_*" if username else "gaia_evaluation_*" |
|
|
files = list(self.results_dir.glob(pattern)) |
|
|
files.sort(key=lambda x: x.stat().st_mtime, reverse=True) |
|
|
return files[:10] |
|
|
|
|
|
class GAIAAgentApp: |
|
|
"""Production GAIA Agent Application with LangGraph workflow and Qwen models""" |
|
|
|
|
|
def __init__(self, hf_token: Optional[str] = None): |
|
|
"""Initialize the application with LangGraph workflow and Qwen models only""" |
|
|
|
|
|
|
|
|
if not hf_token: |
|
|
hf_token = os.getenv("HF_TOKEN") |
|
|
|
|
|
if not hf_token: |
|
|
raise ValueError("HuggingFace token with inference permissions is required. Please set HF_TOKEN environment variable or login with full access.") |
|
|
|
|
|
try: |
|
|
|
|
|
from models.qwen_client import QwenClient |
|
|
self.llm_client = QwenClient(hf_token=hf_token) |
|
|
|
|
|
|
|
|
self.workflow = SimpleGAIAWorkflow(self.llm_client) |
|
|
|
|
|
self.initialized = True |
|
|
logger.info("✅ GAIA Agent system initialized with LangGraph workflow and Qwen models") |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"❌ Failed to initialize GAIA Agent system: {e}") |
|
|
raise RuntimeError(f"System initialization failed: {e}. Please ensure HF_TOKEN has inference permissions.") |
|
|
|
|
|
@classmethod |
|
|
def create_with_oauth_token(cls, oauth_token: str) -> "GAIAAgentApp": |
|
|
"""Create a new instance with OAuth token""" |
|
|
if not oauth_token: |
|
|
raise ValueError("Valid OAuth token is required for GAIA Agent initialization") |
|
|
return cls(hf_token=oauth_token) |
|
|
|
|
|
def __call__(self, question: str) -> str: |
|
|
""" |
|
|
Main agent call for Unit 4 API compatibility |
|
|
""" |
|
|
if not self.initialized: |
|
|
return "System not initialized" |
|
|
|
|
|
try: |
|
|
result_state = self.workflow.process_question( |
|
|
question=question, |
|
|
task_id=f"unit4_{hash(question) % 10000}" |
|
|
) |
|
|
|
|
|
|
|
|
return result_state.final_answer if result_state.final_answer else "Unable to process question" |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Error processing question: {e}") |
|
|
return f"Processing error: {str(e)}" |
|
|
|
|
|
def process_question_detailed(self, question: str, file_input=None, show_reasoning: bool = False) -> Tuple[str, str, str]: |
|
|
""" |
|
|
Process a question through the GAIA agent system with detailed output |
|
|
|
|
|
Returns: |
|
|
Tuple of (answer, details, reasoning) |
|
|
""" |
|
|
|
|
|
if not self.initialized: |
|
|
return "❌ System not initialized", "", "" |
|
|
|
|
|
if not question.strip(): |
|
|
return "❌ Please provide a question", "", "" |
|
|
|
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
file_path = None |
|
|
file_name = None |
|
|
if file_input is not None: |
|
|
file_path = file_input.name |
|
|
file_name = os.path.basename(file_path) |
|
|
|
|
|
try: |
|
|
|
|
|
result_state = self.workflow.process_question( |
|
|
question=question, |
|
|
file_path=file_path, |
|
|
file_name=file_name, |
|
|
task_id=f"manual_{hash(question) % 10000}" |
|
|
) |
|
|
|
|
|
processing_time = time.time() - start_time |
|
|
|
|
|
|
|
|
answer = result_state.final_answer |
|
|
if not answer: |
|
|
answer = "Unable to process question - no answer generated" |
|
|
|
|
|
|
|
|
details = self._format_details(result_state, processing_time) |
|
|
|
|
|
|
|
|
reasoning = "" |
|
|
if show_reasoning: |
|
|
reasoning = self._format_reasoning(result_state) |
|
|
|
|
|
return answer, details, reasoning |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f"Processing failed: {str(e)}" |
|
|
logger.error(error_msg) |
|
|
return f"❌ {error_msg}", "Please try again or contact support", "" |
|
|
|
|
|
def _format_details(self, state, processing_time: float) -> str: |
|
|
"""Format processing details""" |
|
|
|
|
|
details = [] |
|
|
|
|
|
|
|
|
details.append(f"🎯 **Question Type**: {state.question_type.value}") |
|
|
details.append(f"⚡ **Processing Time**: {processing_time:.2f}s") |
|
|
details.append(f"📊 **Confidence**: {state.final_confidence:.2f}") |
|
|
details.append(f"💰 **Cost**: ${state.total_cost:.4f}") |
|
|
|
|
|
|
|
|
agents_used = [result.agent_role.value for result in state.agent_results] |
|
|
details.append(f"🤖 **Agents Used**: {', '.join(agents_used) if agents_used else 'None'}") |
|
|
|
|
|
|
|
|
tools_used = [] |
|
|
for result in state.agent_results: |
|
|
tools_used.extend(result.tools_used) |
|
|
unique_tools = list(set(tools_used)) |
|
|
details.append(f"🔧 **Tools Used**: {', '.join(unique_tools) if unique_tools else 'None'}") |
|
|
|
|
|
|
|
|
if state.file_name: |
|
|
details.append(f"📁 **File Processed**: {state.file_name}") |
|
|
|
|
|
|
|
|
if state.confidence_threshold_met: |
|
|
details.append("✅ **Quality**: High confidence") |
|
|
elif state.final_confidence > 0.5: |
|
|
details.append("⚠️ **Quality**: Medium confidence") |
|
|
else: |
|
|
details.append("❌ **Quality**: Low confidence") |
|
|
|
|
|
|
|
|
if state.requires_human_review: |
|
|
details.append("👁️ **Review**: Human review recommended") |
|
|
|
|
|
|
|
|
if state.error_messages: |
|
|
details.append(f"⚠️ **Errors**: {len(state.error_messages)} encountered") |
|
|
|
|
|
return "\n".join(details) |
|
|
|
|
|
def _format_reasoning(self, state) -> str: |
|
|
"""Format detailed reasoning and workflow steps""" |
|
|
|
|
|
reasoning = [] |
|
|
|
|
|
|
|
|
reasoning.append("## 🧭 Routing Decision") |
|
|
reasoning.append(f"**Classification**: {state.question_type.value}") |
|
|
reasoning.append(f"**Selected Agents**: {[a.value for a in state.selected_agents]}") |
|
|
reasoning.append(f"**Reasoning**: {state.routing_decision}") |
|
|
reasoning.append("") |
|
|
|
|
|
|
|
|
reasoning.append("## 🤖 Agent Processing") |
|
|
for i, result in enumerate(state.agent_results, 1): |
|
|
reasoning.append(f"### Agent {i}: {result.agent_role.value}") |
|
|
reasoning.append(f"**Success**: {'✅' if result.success else '❌'}") |
|
|
reasoning.append(f"**Confidence**: {result.confidence:.2f}") |
|
|
reasoning.append(f"**Tools Used**: {', '.join(result.tools_used) if result.tools_used else 'None'}") |
|
|
reasoning.append(f"**Reasoning**: {result.reasoning}") |
|
|
reasoning.append(f"**Result**: {result.result[:200]}...") |
|
|
reasoning.append("") |
|
|
|
|
|
|
|
|
reasoning.append("## 🔗 Synthesis Process") |
|
|
reasoning.append(f"**Strategy**: {state.answer_source}") |
|
|
reasoning.append(f"**Final Reasoning**: {state.final_reasoning}") |
|
|
reasoning.append("") |
|
|
|
|
|
|
|
|
reasoning.append("## ⏱️ Processing Timeline") |
|
|
for i, step in enumerate(state.processing_steps, 1): |
|
|
reasoning.append(f"{i}. {step}") |
|
|
|
|
|
return "\n".join(reasoning) |
|
|
|
|
|
def get_examples(self) -> list: |
|
|
"""Get example questions for the interface that showcase multi-agent capabilities""" |
|
|
return [ |
|
|
"How many studio albums were published by Mercedes Sosa between 2000 and 2009?", |
|
|
"What is the capital of the country that has the most time zones?", |
|
|
"Calculate the compound interest on $1000 at 5% annual rate compounded quarterly for 3 years", |
|
|
"What is the square root of the sum of the first 10 prime numbers?", |
|
|
"Who was the first person to walk on the moon and what year did it happen?", |
|
|
"Compare the GDP of Japan and Germany in 2023 and tell me the difference", |
|
|
] |
|
|
|
|
|
def process_with_langgraph(self, question: str, question_id: str = None) -> Dict[str, Any]: |
|
|
""" |
|
|
Process question using enhanced LangGraph workflow with multi-phase planning |
|
|
""" |
|
|
try: |
|
|
logger.info(f"📝 Processing question with enhanced LangGraph workflow: {question[:100]}...") |
|
|
|
|
|
|
|
|
state = GAIAAgentState( |
|
|
question=question, |
|
|
question_id=question_id, |
|
|
file_name=None, |
|
|
file_content=None |
|
|
) |
|
|
|
|
|
|
|
|
workflow = create_gaia_workflow(self.llm_client, self.tools) |
|
|
|
|
|
logger.info("🚀 Starting enhanced multi-phase workflow execution") |
|
|
|
|
|
|
|
|
result_state = workflow.invoke(state) |
|
|
|
|
|
|
|
|
processing_details = { |
|
|
"steps": result_state.processing_steps, |
|
|
"agents_used": [r.agent_role.value for r in result_state.agent_results], |
|
|
"router_analysis": getattr(result_state, 'router_analysis', {}), |
|
|
"agent_sequence": getattr(result_state, 'agent_sequence', []), |
|
|
"total_steps": len(result_state.processing_steps), |
|
|
"refinement_attempted": getattr(result_state, 'refinement_attempted', False) |
|
|
} |
|
|
|
|
|
|
|
|
if result_state.agent_results: |
|
|
confidences = [r.confidence for r in result_state.agent_results] |
|
|
avg_confidence = sum(confidences) / len(confidences) |
|
|
max_confidence = max(confidences) |
|
|
|
|
|
enhanced_confidence = min(0.95, (avg_confidence + max_confidence) / 2) |
|
|
else: |
|
|
enhanced_confidence = 0.1 |
|
|
|
|
|
return { |
|
|
"answer": result_state.final_answer or "Unable to determine answer", |
|
|
"confidence": enhanced_confidence, |
|
|
"reasoning": result_state.synthesis_reasoning or "Multi-phase processing completed", |
|
|
"cost": result_state.total_cost, |
|
|
"processing_time": time.time() - result_state.start_time, |
|
|
"processing_details": processing_details, |
|
|
"agent_results": [ |
|
|
{ |
|
|
"agent": r.agent_role.value, |
|
|
"success": r.success, |
|
|
"confidence": r.confidence, |
|
|
"reasoning": r.reasoning[:200] + "..." if len(r.reasoning) > 200 else r.reasoning, |
|
|
"processing_time": r.processing_time, |
|
|
"cost": r.cost_estimate |
|
|
} |
|
|
for r in result_state.agent_results |
|
|
], |
|
|
"errors": result_state.errors |
|
|
} |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f"Enhanced LangGraph processing failed: {str(e)}" |
|
|
logger.error(error_msg) |
|
|
return { |
|
|
"answer": "Processing failed with enhanced workflow", |
|
|
"confidence": 0.0, |
|
|
"reasoning": error_msg, |
|
|
"cost": 0.0, |
|
|
"processing_time": 0.0, |
|
|
"processing_details": {"error": error_msg}, |
|
|
"agent_results": [], |
|
|
"errors": [error_msg] |
|
|
} |
|
|
|
|
|
def check_oauth_scopes(oauth_token: str) -> Dict[str, any]: |
|
|
""" |
|
|
Check what scopes are available with the OAuth token |
|
|
Returns a dictionary with scope information and capabilities |
|
|
""" |
|
|
if not oauth_token: |
|
|
return { |
|
|
"logged_in": False, |
|
|
"scopes": [], |
|
|
"can_inference": False, |
|
|
"can_read": False, |
|
|
"user_info": {}, |
|
|
"message": "Not logged in" |
|
|
} |
|
|
|
|
|
try: |
|
|
headers = {"Authorization": f"Bearer {oauth_token}"} |
|
|
|
|
|
|
|
|
logger.info("🔍 Testing OAuth token with whoami endpoint...") |
|
|
try: |
|
|
whoami_response = requests.get("https://huggingface.co/api/whoami", headers=headers, timeout=10) |
|
|
can_read = whoami_response.status_code == 200 |
|
|
logger.info(f"✅ Whoami response: {whoami_response.status_code}") |
|
|
|
|
|
if whoami_response.status_code == 401: |
|
|
logger.warning("⚠️ OAuth token unauthorized for whoami endpoint") |
|
|
elif whoami_response.status_code != 200: |
|
|
logger.warning(f"⚠️ Unexpected whoami response: {whoami_response.status_code}") |
|
|
|
|
|
except Exception as whoami_error: |
|
|
logger.error(f"❌ Whoami test failed: {whoami_error}") |
|
|
can_read = False |
|
|
|
|
|
|
|
|
logger.info("🔍 Testing OAuth token with inference endpoint...") |
|
|
can_inference = False |
|
|
try: |
|
|
|
|
|
inference_url = "https://api-inference.huggingface.co/models/microsoft/DialoGPT-medium" |
|
|
test_payload = {"inputs": "test", "options": {"wait_for_model": False, "use_cache": True}} |
|
|
inference_response = requests.post(inference_url, headers=headers, json=test_payload, timeout=15) |
|
|
|
|
|
|
|
|
can_inference = inference_response.status_code in [200, 503] |
|
|
logger.info(f"✅ Inference response: {inference_response.status_code}") |
|
|
|
|
|
if inference_response.status_code == 401: |
|
|
logger.warning("⚠️ OAuth token unauthorized for inference endpoint - likely missing 'inference' scope") |
|
|
elif inference_response.status_code == 403: |
|
|
logger.warning("⚠️ OAuth token forbidden for inference endpoint - insufficient permissions") |
|
|
elif inference_response.status_code not in [200, 503]: |
|
|
logger.warning(f"⚠️ Unexpected inference response: {inference_response.status_code}") |
|
|
|
|
|
except Exception as inference_error: |
|
|
logger.error(f"❌ Inference test failed: {inference_error}") |
|
|
can_inference = False |
|
|
|
|
|
|
|
|
if not can_inference: |
|
|
logger.info("🔍 Testing OAuth token with Qwen model directly...") |
|
|
try: |
|
|
qwen_url = "https://api-inference.huggingface.co/models/Qwen/Qwen2.5-7B-Instruct" |
|
|
qwen_payload = {"inputs": "Hello", "options": {"wait_for_model": False}} |
|
|
qwen_response = requests.post(qwen_url, headers=headers, json=qwen_payload, timeout=15) |
|
|
|
|
|
qwen_inference = qwen_response.status_code in [200, 503] |
|
|
if qwen_inference: |
|
|
can_inference = True |
|
|
logger.info(f"✅ Qwen model response: {qwen_response.status_code}") |
|
|
else: |
|
|
logger.warning(f"⚠️ Qwen model response: {qwen_response.status_code}") |
|
|
|
|
|
except Exception as qwen_error: |
|
|
logger.error(f"❌ Qwen model test failed: {qwen_error}") |
|
|
|
|
|
|
|
|
probable_scopes = [] |
|
|
if can_read: |
|
|
probable_scopes.append("read") |
|
|
if can_inference: |
|
|
probable_scopes.append("inference") |
|
|
|
|
|
logger.info(f"📊 Final scope assessment: {probable_scopes}") |
|
|
|
|
|
|
|
|
user_info = {} |
|
|
if can_read and whoami_response.status_code == 200: |
|
|
try: |
|
|
user_data = whoami_response.json() |
|
|
user_info = { |
|
|
"name": user_data.get("name", "Unknown"), |
|
|
"fullname": user_data.get("fullName", ""), |
|
|
"avatar": user_data.get("avatarUrl", "") |
|
|
} |
|
|
logger.info(f"✅ User info retrieved: {user_info.get('name', 'unknown')}") |
|
|
except Exception as user_error: |
|
|
logger.warning(f"⚠️ Could not parse user info: {user_error}") |
|
|
user_info = {} |
|
|
|
|
|
return { |
|
|
"logged_in": True, |
|
|
"scopes": probable_scopes, |
|
|
"can_inference": can_inference, |
|
|
"can_read": can_read, |
|
|
"user_info": user_info, |
|
|
"message": f"Logged in with scopes: {', '.join(probable_scopes) if probable_scopes else 'limited'}" |
|
|
} |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"❌ OAuth scope check failed: {e}") |
|
|
return { |
|
|
"logged_in": True, |
|
|
"scopes": ["unknown"], |
|
|
"can_inference": False, |
|
|
"can_read": False, |
|
|
"user_info": {}, |
|
|
"message": f"Could not determine scopes: {str(e)}" |
|
|
} |
|
|
|
|
|
def format_auth_status(profile: gr.OAuthProfile | None) -> str: |
|
|
"""Format authentication status for display in UI""" |
|
|
|
|
|
|
|
|
hf_token = os.getenv("HF_TOKEN") |
|
|
|
|
|
if hf_token: |
|
|
|
|
|
return """ |
|
|
### 🎯 Authentication Status: HF_TOKEN Environment Variable |
|
|
|
|
|
**🚀 FULL SYSTEM CAPABILITIES ENABLED** |
|
|
|
|
|
**Authentication Source**: HF_TOKEN environment variable |
|
|
**Model Access**: Qwen 2.5 models (7B/32B/72B) via HuggingFace Inference API |
|
|
**Workflow**: LangGraph multi-agent system with specialized tools |
|
|
|
|
|
**Available Features:** |
|
|
- ✅ **Advanced Model Access**: Full Qwen model capabilities (7B/32B/72B) |
|
|
- ✅ **High Performance**: 30%+ expected GAIA score |
|
|
- ✅ **LangGraph Workflow**: Multi-agent orchestration with synthesis |
|
|
- ✅ **Specialized Agents**: Web research, file processing, mathematical reasoning |
|
|
- ✅ **Professional Tools**: Wikipedia, web search, calculator, file processor |
|
|
- ✅ **Manual Testing**: Individual question processing with detailed analysis |
|
|
- ✅ **Official Evaluation**: GAIA benchmark submission |
|
|
|
|
|
💡 **Status**: Optimal configuration for GAIA benchmark performance with real AI agents. |
|
|
""" |
|
|
|
|
|
|
|
|
oauth_scopes = os.getenv("OAUTH_SCOPES") |
|
|
oauth_client_id = os.getenv("OAUTH_CLIENT_ID") |
|
|
|
|
|
has_inference_scope = oauth_scopes and ("inference-api" in oauth_scopes or "inference" in oauth_scopes) |
|
|
|
|
|
if not profile: |
|
|
oauth_status = "" |
|
|
if oauth_client_id: |
|
|
if has_inference_scope: |
|
|
oauth_status = "**🔑 OAuth Configuration**: ✅ Space configured with inference scope" |
|
|
else: |
|
|
oauth_status = "**⚠️ OAuth Configuration**: Space OAuth enabled but missing inference scope" |
|
|
else: |
|
|
oauth_status = "**❌ OAuth Configuration**: Space not configured for OAuth (missing `hf_oauth: true` in README.md)" |
|
|
|
|
|
return f""" |
|
|
### 🔐 Authentication Status: Not Logged In |
|
|
|
|
|
Please log in to access GAIA evaluation with Qwen models and LangGraph workflow. |
|
|
|
|
|
{oauth_status} |
|
|
|
|
|
**What you need:** |
|
|
- 🔑 HuggingFace login with `read` and `inference` permissions |
|
|
- 🤖 Access to Qwen 2.5 models via HF Inference API |
|
|
- 🧠 LangGraph multi-agent system capabilities |
|
|
|
|
|
**🔑 OAuth Scopes**: Login requests inference scope for Qwen model access. |
|
|
**📈 Expected Performance**: 30%+ GAIA score with full LangGraph workflow and Qwen models. |
|
|
**⚠️ No Fallbacks**: System requires proper authentication - no simplified responses. |
|
|
""" |
|
|
|
|
|
username = profile.username |
|
|
oauth_token = getattr(profile, 'oauth_token', None) or getattr(profile, 'token', None) |
|
|
|
|
|
|
|
|
if not oauth_token: |
|
|
for attr in ['access_token', 'id_token', 'bearer_token']: |
|
|
token = getattr(profile, attr, None) |
|
|
if token: |
|
|
oauth_token = token |
|
|
logger.info(f"🔑 Found OAuth token via {attr}") |
|
|
break |
|
|
|
|
|
|
|
|
if not oauth_token and hasattr(profile, '__dict__'): |
|
|
token_attrs = [attr for attr in profile.__dict__.keys() if 'token' in attr.lower()] |
|
|
if token_attrs: |
|
|
logger.info(f"🔍 Available token attributes: {token_attrs}") |
|
|
|
|
|
oauth_token = getattr(profile, token_attrs[0], None) |
|
|
if oauth_token: |
|
|
logger.info(f"🔑 Using token from {token_attrs[0]}") |
|
|
|
|
|
scope_info = check_oauth_scopes(oauth_token) if oauth_token else { |
|
|
"logged_in": True, |
|
|
"scopes": [], |
|
|
"can_inference": False, |
|
|
"can_read": False, |
|
|
"user_info": {}, |
|
|
"message": "Logged in but no OAuth token found" |
|
|
} |
|
|
|
|
|
status_parts = [f"### 🔐 Authentication Status: Logged In as {username}"] |
|
|
|
|
|
|
|
|
user_info = scope_info.get("user_info", {}) |
|
|
if user_info and user_info.get("fullname"): |
|
|
status_parts.append(f"**Full Name**: {user_info['fullname']}") |
|
|
|
|
|
|
|
|
if oauth_client_id: |
|
|
if has_inference_scope: |
|
|
status_parts.append("**🏠 Space OAuth**: ✅ Configured with inference scope") |
|
|
else: |
|
|
status_parts.append("**🏠 Space OAuth**: ⚠️ Missing inference scope in README.md") |
|
|
status_parts.append(f"**Available Scopes**: {oauth_scopes}") |
|
|
else: |
|
|
status_parts.append("**🏠 Space OAuth**: ❌ Not configured (`hf_oauth: true` missing)") |
|
|
|
|
|
|
|
|
scopes = scope_info.get("scopes", []) |
|
|
status_parts.append(f"**Detected Token Scopes**: {', '.join(scopes) if scopes else 'None detected'}") |
|
|
status_parts.append("") |
|
|
status_parts.append("**System Capabilities:**") |
|
|
|
|
|
|
|
|
can_inference = scope_info.get("can_inference", False) |
|
|
can_read = scope_info.get("can_read", False) |
|
|
|
|
|
if can_inference: |
|
|
status_parts.extend([ |
|
|
"- ✅ **Qwen Model Access**: Full Qwen 2.5 model capabilities (7B/32B/72B)", |
|
|
"- ✅ **High Performance**: 30%+ expected GAIA score", |
|
|
"- ✅ **LangGraph Workflow**: Multi-agent orchestration with synthesis", |
|
|
"- ✅ **Specialized Agents**: Web research, file processing, reasoning", |
|
|
"- ✅ **Professional Tools**: Wikipedia, web search, calculator, file processor", |
|
|
"- ✅ **Inference Access**: Full model generation capabilities" |
|
|
]) |
|
|
else: |
|
|
status_parts.extend([ |
|
|
"- ❌ **No Qwen Model Access**: Insufficient OAuth permissions", |
|
|
"- ❌ **No LangGraph Workflow**: Requires inference permissions", |
|
|
"- ❌ **Limited Functionality**: Cannot process GAIA questions", |
|
|
"- ❌ **No Inference Access**: Read-only permissions detected" |
|
|
]) |
|
|
|
|
|
if can_read: |
|
|
status_parts.append("- ✅ **Profile Access**: Can read user information") |
|
|
|
|
|
status_parts.extend([ |
|
|
"- ✅ **Manual Testing**: Individual question processing (if authenticated)", |
|
|
"- ✅ **Official Evaluation**: GAIA benchmark submission (if authenticated)" |
|
|
]) |
|
|
|
|
|
if not can_inference: |
|
|
if not has_inference_scope: |
|
|
status_parts.extend([ |
|
|
"", |
|
|
"🔧 **Space Configuration Issue**: Add inference scope to README.md:", |
|
|
"```yaml", |
|
|
"hf_oauth_scopes:", |
|
|
" - inference-api", |
|
|
"```", |
|
|
"**After updating**: Space will restart and request proper scopes on next login." |
|
|
]) |
|
|
|
|
|
status_parts.extend([ |
|
|
"", |
|
|
"🔑 **Authentication Required**: Your OAuth session lacks inference permissions.", |
|
|
"**Solution**: Logout and login again to request full inference access.", |
|
|
"**Alternative**: Set HF_TOKEN as a Space secret for guaranteed Qwen model access.", |
|
|
"**Note**: System requires Qwen model access - no simplified fallbacks available." |
|
|
]) |
|
|
|
|
|
|
|
|
if not oauth_token: |
|
|
status_parts.extend([ |
|
|
"", |
|
|
"🔍 **OAuth Token Issue**: Could not extract OAuth token from your session.", |
|
|
"**Troubleshooting**: Click '🔍 Debug OAuth' button above to investigate.", |
|
|
"**Common Fix**: Logout and login again to refresh your OAuth session." |
|
|
]) |
|
|
else: |
|
|
status_parts.extend([ |
|
|
"", |
|
|
"🎉 **Excellent**: You have full inference access for optimal GAIA performance!", |
|
|
"🤖 **Ready**: LangGraph workflow with Qwen models fully operational." |
|
|
]) |
|
|
|
|
|
return "\n".join(status_parts) |
|
|
|
|
|
def run_and_submit_all(profile: gr.OAuthProfile | None): |
|
|
""" |
|
|
Fetches all questions from Unit 4 API, runs the GAIA Agent with LangGraph workflow, |
|
|
and displays the results. Handles OAuth authentication at runtime. |
|
|
""" |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
result_logger = GAIAResultLogger() |
|
|
|
|
|
|
|
|
oauth_client_id = os.getenv("OAUTH_CLIENT_ID") |
|
|
oauth_scopes = os.getenv("OAUTH_SCOPES") |
|
|
|
|
|
if not oauth_client_id: |
|
|
return "❌ OAuth not configured. Please add 'hf_oauth: true' to README.md", None, format_auth_status(None), None, None, None |
|
|
|
|
|
|
|
|
if not oauth_scopes or not ("inference-api" in oauth_scopes or "inference" in oauth_scopes): |
|
|
return f"❌ Missing inference scope. Current scopes: {oauth_scopes}. Please add inference scope to README.md", None, format_auth_status(None), None, None, None |
|
|
|
|
|
|
|
|
space_id = os.getenv("SPACE_ID") |
|
|
|
|
|
|
|
|
hf_token = os.getenv("HF_TOKEN") |
|
|
oauth_token = None |
|
|
username = "unknown_user" |
|
|
|
|
|
if hf_token: |
|
|
logger.info("🎯 Using HF_TOKEN environment variable for Qwen model access") |
|
|
oauth_token = hf_token |
|
|
username = "hf_token_user" |
|
|
elif profile: |
|
|
username = f"{profile.username}" |
|
|
|
|
|
|
|
|
oauth_token = getattr(profile, 'oauth_token', None) or getattr(profile, 'token', None) |
|
|
|
|
|
if not oauth_token: |
|
|
for attr in ['access_token', 'id_token', 'bearer_token']: |
|
|
token = getattr(profile, attr, None) |
|
|
if token: |
|
|
oauth_token = token |
|
|
logger.info(f"🔑 Found OAuth token via {attr}") |
|
|
break |
|
|
|
|
|
if oauth_token: |
|
|
logger.info(f"✅ User logged in: {username}, OAuth token extracted successfully") |
|
|
|
|
|
|
|
|
try: |
|
|
headers = {"Authorization": f"Bearer {oauth_token}"} |
|
|
test_response = requests.get("https://huggingface.co/api/whoami", headers=headers, timeout=5) |
|
|
|
|
|
if test_response.status_code == 401: |
|
|
logger.error("❌ OAuth token has insufficient scopes for Qwen model inference") |
|
|
return "Authentication Error: Your OAuth token lacks inference permissions. Please logout and login again to refresh your OAuth session.", None, format_auth_status(profile), None, None, None |
|
|
elif test_response.status_code == 200: |
|
|
logger.info("✅ OAuth token validated successfully") |
|
|
else: |
|
|
logger.warning(f"⚠️ OAuth token validation returned {test_response.status_code}") |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"⚠️ Could not validate OAuth token: {e}") |
|
|
else: |
|
|
logger.warning(f"⚠️ User {username} logged in but no OAuth token found") |
|
|
return f"OAuth Token Missing: Could not extract authentication token for user {username}. Please logout and login again.", None, format_auth_status(profile), None, None, None |
|
|
else: |
|
|
logger.error("❌ No authentication provided") |
|
|
return "Authentication Required: Please login with HuggingFace. Your Space has OAuth configured but you need to login first.", None, format_auth_status(None), None, None, None |
|
|
|
|
|
if not oauth_token: |
|
|
return "Authentication Required: Valid token with inference permissions needed for Qwen model access.", None, format_auth_status(profile), None, None, None |
|
|
|
|
|
|
|
|
auth_status = format_auth_status(profile) |
|
|
api_url = DEFAULT_API_URL |
|
|
questions_url = f"{api_url}/questions" |
|
|
submit_url = f"{api_url}/submit" |
|
|
|
|
|
|
|
|
try: |
|
|
logger.info("🚀 Creating GAIA Agent with LangGraph workflow and Qwen models") |
|
|
agent = GAIAAgentApp.create_with_oauth_token(oauth_token) |
|
|
|
|
|
if not agent.initialized: |
|
|
return "System Error: GAIA Agent failed to initialize with LangGraph workflow", None, auth_status, None, None, None |
|
|
|
|
|
logger.info("✅ GAIA Agent initialized successfully") |
|
|
|
|
|
except ValueError as ve: |
|
|
logger.error(f"Authentication error: {ve}") |
|
|
return f"Authentication Error: {ve}", None, auth_status, None, None, None |
|
|
except RuntimeError as re: |
|
|
logger.error(f"System initialization error: {re}") |
|
|
return f"System Error: {re}", None, auth_status, None, None, None |
|
|
except Exception as e: |
|
|
logger.error(f"Unexpected error initializing agent: {e}") |
|
|
return f"Unexpected Error: {e}. Please check your authentication and try again.", None, auth_status, None, None, None |
|
|
|
|
|
|
|
|
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Local Development" |
|
|
logger.info(f"Agent code URL: {agent_code}") |
|
|
|
|
|
|
|
|
logger.info(f"Fetching questions from: {questions_url}") |
|
|
try: |
|
|
response = requests.get(questions_url, timeout=15) |
|
|
response.raise_for_status() |
|
|
questions_data = response.json() |
|
|
if not questions_data: |
|
|
logger.error("Fetched questions list is empty.") |
|
|
return "Fetched questions list is empty or invalid format.", None, auth_status, None, None, None |
|
|
logger.info(f"Fetched {len(questions_data)} questions.") |
|
|
except requests.exceptions.RequestException as e: |
|
|
logger.error(f"Error fetching questions: {e}") |
|
|
return f"Error fetching questions: {e}", None, auth_status, None, None, None |
|
|
except requests.exceptions.JSONDecodeError as e: |
|
|
logger.error(f"Error decoding JSON response from questions endpoint: {e}") |
|
|
return f"Error decoding server response for questions: {e}", None, auth_status, None, None, None |
|
|
except Exception as e: |
|
|
logger.error(f"An unexpected error occurred fetching questions: {e}") |
|
|
return f"An unexpected error occurred fetching questions: {e}", None, auth_status, None, None, None |
|
|
|
|
|
|
|
|
results_log = [] |
|
|
answers_payload = [] |
|
|
logger.info(f"🤖 Running GAIA Agent on {len(questions_data)} questions with LangGraph workflow...") |
|
|
|
|
|
for i, item in enumerate(questions_data, 1): |
|
|
task_id = item.get("task_id") |
|
|
question_text = item.get("question") |
|
|
if not task_id or question_text is None: |
|
|
logger.warning(f"Skipping item with missing task_id or question: {item}") |
|
|
continue |
|
|
|
|
|
logger.info(f"Processing question {i}/{len(questions_data)}: {task_id}") |
|
|
try: |
|
|
submitted_answer = agent(question_text) |
|
|
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer}) |
|
|
results_log.append({ |
|
|
"Task ID": task_id, |
|
|
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text, |
|
|
"Submitted Answer": submitted_answer[:200] + "..." if len(submitted_answer) > 200 else submitted_answer |
|
|
}) |
|
|
logger.info(f"✅ Question {i} processed successfully") |
|
|
except Exception as e: |
|
|
logger.error(f"Error running GAIA agent on task {task_id}: {e}") |
|
|
error_answer = f"AGENT ERROR: {str(e)}" |
|
|
answers_payload.append({"task_id": task_id, "submitted_answer": error_answer}) |
|
|
results_log.append({ |
|
|
"Task ID": task_id, |
|
|
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text, |
|
|
"Submitted Answer": error_answer |
|
|
}) |
|
|
|
|
|
if not answers_payload: |
|
|
logger.error("GAIA Agent did not produce any answers to submit.") |
|
|
return "GAIA Agent did not produce any answers to submit.", pd.DataFrame(results_log), auth_status, None, None, None |
|
|
|
|
|
|
|
|
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload} |
|
|
status_update = f"🚀 GAIA Agent finished processing {len(answers_payload)} questions. Submitting results for user '{username}'..." |
|
|
logger.info(status_update) |
|
|
|
|
|
|
|
|
logger.info(f"📤 Submitting {len(answers_payload)} answers to: {submit_url}") |
|
|
try: |
|
|
response = requests.post(submit_url, json=submission_data, timeout=120) |
|
|
logger.info(f"📨 Unit 4 API response status: {response.status_code}") |
|
|
|
|
|
response.raise_for_status() |
|
|
result_data = response.json() |
|
|
|
|
|
|
|
|
logger.info(f"📊 Unit 4 API response data: {result_data}") |
|
|
|
|
|
|
|
|
execution_time = time.time() - start_time |
|
|
|
|
|
|
|
|
logger.info("📝 Logging evaluation results...") |
|
|
logged_files = result_logger.log_evaluation_results( |
|
|
username=username, |
|
|
questions_data=questions_data, |
|
|
results_log=results_log, |
|
|
final_result=result_data, |
|
|
execution_time=execution_time |
|
|
) |
|
|
|
|
|
|
|
|
csv_file = logged_files.get("csv") |
|
|
json_file = logged_files.get("json") |
|
|
summary_file = logged_files.get("summary") |
|
|
|
|
|
final_status = ( |
|
|
f"🎉 GAIA Agent Evaluation Complete!\n" |
|
|
f"👤 User: {result_data.get('username')}\n" |
|
|
f"🏆 Overall Score: {result_data.get('score', 'N/A')}% " |
|
|
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n" |
|
|
f"⏱️ Execution Time: {execution_time:.2f} seconds\n" |
|
|
f"💬 API Response: {result_data.get('message', 'No message received.')}\n\n" |
|
|
f"📁 Results saved to {len([f for f in [csv_file, json_file, summary_file] if f])} files for download." |
|
|
) |
|
|
logger.info("✅ GAIA evaluation completed successfully") |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return final_status, results_df, auth_status, csv_file, json_file, summary_file |
|
|
|
|
|
except requests.exceptions.HTTPError as e: |
|
|
error_detail = f"Server responded with status {e.response.status_code}." |
|
|
try: |
|
|
error_json = e.response.json() |
|
|
error_detail += f" Detail: {error_json.get('detail', e.response.text)}" |
|
|
except requests.exceptions.JSONDecodeError: |
|
|
error_detail += f" Response: {e.response.text[:500]}" |
|
|
status_message = f"❌ Submission Failed: {error_detail}" |
|
|
logger.error(status_message) |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return status_message, results_df, auth_status, None, None, None |
|
|
except requests.exceptions.Timeout: |
|
|
status_message = "❌ Submission Failed: The request timed out." |
|
|
logger.error(status_message) |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return status_message, results_df, auth_status, None, None, None |
|
|
except requests.exceptions.RequestException as e: |
|
|
status_message = f"❌ Submission Failed: Network error - {e}" |
|
|
logger.error(status_message) |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return status_message, results_df, auth_status, None, None, None |
|
|
except Exception as e: |
|
|
status_message = f"❌ An unexpected error occurred during submission: {e}" |
|
|
logger.error(status_message) |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return status_message, results_df, auth_status, None, None, None |
|
|
|
|
|
def create_interface(): |
|
|
"""Create the Gradio interface with both Unit 4 API and manual testing""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
css = """ |
|
|
/* Base styling for proper contrast */ |
|
|
.gradio-container { |
|
|
color: #3c3c3c !important; |
|
|
background-color: #faf9f7 !important; |
|
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important; |
|
|
} |
|
|
|
|
|
/* Fix all text elements EXCEPT buttons */ |
|
|
.gradio-container *:not(button):not(.gr-button):not(.gr-button-primary):not(.gr-button-secondary), |
|
|
.gradio-container *:not(button):not(.gr-button):not(.gr-button-primary):not(.gr-button-secondary)::before, |
|
|
.gradio-container *:not(button):not(.gr-button):not(.gr-button-primary):not(.gr-button-secondary)::after { |
|
|
color: #3c3c3c !important; |
|
|
} |
|
|
|
|
|
/* Headers */ |
|
|
.gradio-container h1, |
|
|
.gradio-container h2, |
|
|
.gradio-container h3, |
|
|
.gradio-container h4, |
|
|
.gradio-container h5, |
|
|
.gradio-container h6 { |
|
|
color: #2c2c2c !important; |
|
|
font-weight: 600 !important; |
|
|
} |
|
|
|
|
|
/* Paragraphs and text content */ |
|
|
.gradio-container p, |
|
|
.gradio-container div:not(.gr-button):not(.gr-button-primary):not(.gr-button-secondary), |
|
|
.gradio-container span:not(.gr-button):not(.gr-button-primary):not(.gr-button-secondary), |
|
|
.gradio-container label { |
|
|
color: #3c3c3c !important; |
|
|
} |
|
|
|
|
|
/* Input fields */ |
|
|
.gradio-container input, |
|
|
.gradio-container textarea { |
|
|
color: #3c3c3c !important; |
|
|
background-color: #ffffff !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
border-radius: 6px !important; |
|
|
} |
|
|
|
|
|
/* Buttons - Subtle professional styling */ |
|
|
.gradio-container button, |
|
|
.gradio-container .gr-button, |
|
|
.gradio-container .gr-button-primary, |
|
|
.gradio-container .gr-button-secondary, |
|
|
.gradio-container button *, |
|
|
.gradio-container .gr-button *, |
|
|
.gradio-container .gr-button-primary *, |
|
|
.gradio-container .gr-button-secondary * { |
|
|
color: #3c3c3c !important; |
|
|
font-weight: 500 !important; |
|
|
text-shadow: none !important; |
|
|
border-radius: 6px !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
transition: all 0.2s ease !important; |
|
|
} |
|
|
|
|
|
.gradio-container .gr-button-primary, |
|
|
.gradio-container button[variant="primary"] { |
|
|
background: #f5f3f0 !important; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
padding: 8px 16px !important; |
|
|
border-radius: 6px !important; |
|
|
} |
|
|
|
|
|
.gradio-container .gr-button-secondary, |
|
|
.gradio-container button[variant="secondary"] { |
|
|
background: #ffffff !important; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
padding: 8px 16px !important; |
|
|
border-radius: 6px !important; |
|
|
} |
|
|
|
|
|
.gradio-container button:not([variant]) { |
|
|
background: #f8f6f3 !important; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
padding: 8px 16px !important; |
|
|
border-radius: 6px !important; |
|
|
} |
|
|
|
|
|
/* Button hover states - subtle changes */ |
|
|
.gradio-container button:hover, |
|
|
.gradio-container .gr-button:hover, |
|
|
.gradio-container .gr-button-primary:hover { |
|
|
background: #ede9e4 !important; |
|
|
color: #2c2c2c !important; |
|
|
border: 1px solid #c4b49f !important; |
|
|
transform: translateY(-1px) !important; |
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.08) !important; |
|
|
} |
|
|
|
|
|
.gradio-container .gr-button-secondary:hover { |
|
|
background: #f5f3f0 !important; |
|
|
color: #2c2c2c !important; |
|
|
border: 1px solid #c4b49f !important; |
|
|
transform: translateY(-1px) !important; |
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.08) !important; |
|
|
} |
|
|
|
|
|
/* Login button styling */ |
|
|
.gradio-container .gr-button:contains("Login"), |
|
|
.gradio-container button:contains("Login") { |
|
|
background: #e8e3dc !important; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
} |
|
|
|
|
|
/* Markdown content */ |
|
|
.gradio-container .gr-markdown, |
|
|
.gradio-container .markdown, |
|
|
.gradio-container .prose { |
|
|
color: #3c3c3c !important; |
|
|
background-color: transparent !important; |
|
|
} |
|
|
|
|
|
/* Special content boxes */ |
|
|
.container { |
|
|
max-width: 1200px; |
|
|
margin: auto; |
|
|
padding: 20px; |
|
|
background-color: #faf9f7 !important; |
|
|
color: #3c3c3c !important; |
|
|
} |
|
|
|
|
|
.output-markdown { |
|
|
font-size: 16px; |
|
|
line-height: 1.6; |
|
|
color: #3c3c3c !important; |
|
|
background-color: #faf9f7 !important; |
|
|
} |
|
|
|
|
|
.details-box { |
|
|
background-color: #f5f3f0 !important; |
|
|
padding: 15px; |
|
|
border-radius: 8px; |
|
|
margin: 10px 0; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #e0d5c7 !important; |
|
|
} |
|
|
|
|
|
.reasoning-box { |
|
|
background-color: #ffffff !important; |
|
|
padding: 20px; |
|
|
border: 1px solid #e0d5c7 !important; |
|
|
border-radius: 8px; |
|
|
color: #3c3c3c !important; |
|
|
} |
|
|
|
|
|
.unit4-section { |
|
|
background-color: #f0ede8 !important; |
|
|
padding: 20px; |
|
|
border-radius: 8px; |
|
|
margin: 20px 0; |
|
|
color: #4a4035 !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
} |
|
|
|
|
|
.unit4-section h1, |
|
|
.unit4-section h2, |
|
|
.unit4-section h3, |
|
|
.unit4-section p, |
|
|
.unit4-section div:not(button):not(.gr-button) { |
|
|
color: #4a4035 !important; |
|
|
} |
|
|
|
|
|
/* Login section */ |
|
|
.oauth-login { |
|
|
background: #f5f3f0 !important; |
|
|
padding: 10px; |
|
|
border-radius: 5px; |
|
|
margin: 10px 0; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #e0d5c7 !important; |
|
|
} |
|
|
|
|
|
/* Tables */ |
|
|
.gradio-container table, |
|
|
.gradio-container th, |
|
|
.gradio-container td { |
|
|
color: #3c3c3c !important; |
|
|
background-color: #ffffff !important; |
|
|
border: 1px solid #e0d5c7 !important; |
|
|
} |
|
|
|
|
|
.gradio-container th { |
|
|
background-color: #f5f3f0 !important; |
|
|
font-weight: 600 !important; |
|
|
} |
|
|
|
|
|
/* Examples and other interactive elements */ |
|
|
.gradio-container .gr-examples, |
|
|
.gradio-container .gr-file, |
|
|
.gradio-container .gr-textbox, |
|
|
.gradio-container .gr-checkbox { |
|
|
color: #3c3c3c !important; |
|
|
background-color: #ffffff !important; |
|
|
} |
|
|
|
|
|
/* Fix any remaining text contrast issues */ |
|
|
.gradio-container .gr-form, |
|
|
.gradio-container .gr-panel, |
|
|
.gradio-container .gr-block { |
|
|
color: #3c3c3c !important; |
|
|
background-color: transparent !important; |
|
|
} |
|
|
|
|
|
/* Ensure proper text on light backgrounds */ |
|
|
.gradio-container .light, |
|
|
.gradio-container [data-theme="light"] { |
|
|
color: #3c3c3c !important; |
|
|
background-color: #faf9f7 !important; |
|
|
} |
|
|
|
|
|
/* Override any problematic inline styles but preserve button colors */ |
|
|
.gradio-container [style*="color: white"]:not(button):not(.gr-button) { |
|
|
color: #3c3c3c !important; |
|
|
} |
|
|
|
|
|
/* Professional spacing and shadows */ |
|
|
.gradio-container .gr-box { |
|
|
box-shadow: 0 1px 3px rgba(0,0,0,0.1) !important; |
|
|
border-radius: 8px !important; |
|
|
} |
|
|
|
|
|
/* Override any remaining purple/blue elements */ |
|
|
.gradio-container .gr-textbox, |
|
|
.gradio-container .gr-dropdown, |
|
|
.gradio-container .gr-number, |
|
|
.gradio-container .gr-slider { |
|
|
background-color: #ffffff !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
color: #3c3c3c !important; |
|
|
} |
|
|
|
|
|
/* Force override any Gradio default styling */ |
|
|
.gradio-container * { |
|
|
background-color: inherit !important; |
|
|
} |
|
|
|
|
|
.gradio-container *[style*="background-color: rgb(239, 68, 68)"], |
|
|
.gradio-container *[style*="background-color: rgb(59, 130, 246)"], |
|
|
.gradio-container *[style*="background-color: rgb(147, 51, 234)"], |
|
|
.gradio-container *[style*="background-color: rgb(16, 185, 129)"] { |
|
|
background-color: #f5f3f0 !important; |
|
|
color: #3c3c3c !important; |
|
|
border: 1px solid #d4c4b0 !important; |
|
|
} |
|
|
|
|
|
/* Loading states */ |
|
|
.gradio-container .loading { |
|
|
background-color: #f5f3f0 !important; |
|
|
color: #6b5d4f !important; |
|
|
} |
|
|
|
|
|
/* Progress bars */ |
|
|
.gradio-container .gr-progress { |
|
|
background-color: #f5f3f0 !important; |
|
|
} |
|
|
|
|
|
.gradio-container .gr-progress-bar { |
|
|
background-color: #a08b73 !important; |
|
|
} |
|
|
""" |
|
|
|
|
|
|
|
|
oauth_config = { |
|
|
"scopes": ["read", "inference"], |
|
|
} |
|
|
|
|
|
with gr.Blocks(css=css, title="GAIA Agent System", theme=gr.themes.Soft()) as interface: |
|
|
|
|
|
|
|
|
gr.Markdown(""" |
|
|
# 🤖 GAIA Agent System |
|
|
|
|
|
**Advanced Multi-Agent AI System for GAIA Benchmark Questions** |
|
|
|
|
|
This system uses **Qwen 2.5 models (7B/32B/72B)** with specialized agents orchestrated through |
|
|
**LangGraph** to provide accurate, well-reasoned answers to complex questions. |
|
|
|
|
|
**Architecture**: Router → Specialized Agents → Tools → Synthesizer → Final Answer |
|
|
""") |
|
|
|
|
|
|
|
|
with gr.Row(elem_classes=["unit4-section"]): |
|
|
with gr.Column(): |
|
|
gr.Markdown(""" |
|
|
## 🏆 GAIA Benchmark Evaluation |
|
|
|
|
|
**Official Unit 4 API Integration with LangGraph Workflow** |
|
|
|
|
|
Run the complete GAIA Agent system using Qwen 2.5 models and LangGraph multi-agent |
|
|
orchestration on all benchmark questions and submit results to the official API. |
|
|
|
|
|
**System Requirements:** |
|
|
1. 🔑 **Authentication**: HuggingFace login with `read` and `inference` permissions |
|
|
2. 🤖 **Models**: Access to Qwen 2.5 models (7B/32B/72B) via HF Inference API |
|
|
3. 🧠 **Workflow**: LangGraph multi-agent system with specialized tools |
|
|
|
|
|
**Instructions:** |
|
|
1. Log in to your Hugging Face account using the button below (**Full inference access required**) |
|
|
2. Click 'Run GAIA Evaluation & Submit All Answers' to process all questions |
|
|
3. View your official score and detailed results |
|
|
|
|
|
⚠️ **Note**: This may take several minutes to process all questions with the multi-agent system. |
|
|
|
|
|
💡 **OAuth Scopes**: Login requests both `read` and `inference` permissions |
|
|
for Qwen model access and optimal performance (30%+ GAIA score expected). |
|
|
|
|
|
🚫 **No Fallbacks**: System requires proper authentication - simplified responses not available. |
|
|
""") |
|
|
|
|
|
|
|
|
auth_status_display = gr.Markdown( |
|
|
""" |
|
|
### 🔐 Authentication Status: Not Logged In |
|
|
|
|
|
Please log in to access GAIA evaluation with Qwen models and LangGraph workflow. |
|
|
|
|
|
**What you need:** |
|
|
- 🔑 HuggingFace login with `read` and `inference` permissions |
|
|
- 🤖 Access to Qwen 2.5 models via HF Inference API |
|
|
- 🧠 LangGraph multi-agent system capabilities |
|
|
|
|
|
**Expected Performance**: 30%+ GAIA score with full LangGraph workflow and Qwen models. |
|
|
""", |
|
|
elem_classes=["oauth-login"] |
|
|
) |
|
|
|
|
|
|
|
|
gr.LoginButton() |
|
|
|
|
|
with gr.Row(): |
|
|
refresh_auth_button = gr.Button("🔄 Refresh Auth Status", variant="secondary", scale=1) |
|
|
|
|
|
unit4_run_button = gr.Button( |
|
|
"🚀 Run GAIA Evaluation & Submit All Answers", |
|
|
variant="primary", |
|
|
scale=2 |
|
|
) |
|
|
|
|
|
unit4_status_output = gr.Textbox( |
|
|
label="Evaluation Status / Submission Result", |
|
|
lines=5, |
|
|
interactive=False |
|
|
) |
|
|
|
|
|
unit4_results_table = gr.DataFrame( |
|
|
label="Questions and GAIA Agent Answers", |
|
|
wrap=True |
|
|
) |
|
|
|
|
|
|
|
|
gr.Markdown("### 📁 Download Results") |
|
|
gr.Markdown("After evaluation completes, download your results in different formats:") |
|
|
|
|
|
with gr.Row(): |
|
|
csv_download = gr.File( |
|
|
label="📊 CSV Results", |
|
|
visible=False, |
|
|
interactive=False |
|
|
) |
|
|
|
|
|
json_download = gr.File( |
|
|
label="🔍 Detailed JSON", |
|
|
visible=False, |
|
|
interactive=False |
|
|
) |
|
|
|
|
|
summary_download = gr.File( |
|
|
label="📋 Summary Report", |
|
|
visible=False, |
|
|
interactive=False |
|
|
) |
|
|
|
|
|
gr.Markdown("---") |
|
|
|
|
|
|
|
|
gr.Markdown(""" |
|
|
## 🧪 Manual Question Testing |
|
|
|
|
|
Test individual questions with detailed analysis using **Qwen models** and **LangGraph workflow**. |
|
|
|
|
|
**Features:** |
|
|
- 🤖 **Qwen 2.5 Models**: Intelligent tier selection (7B → 32B → 72B) based on complexity |
|
|
- 🧠 **LangGraph Orchestration**: Multi-agent workflow with synthesis |
|
|
- 🔧 **Specialized Agents**: Router, web research, file processing, mathematical reasoning |
|
|
- 📊 **Detailed Analysis**: Processing details, confidence scores, cost tracking |
|
|
""") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=2): |
|
|
|
|
|
gr.Markdown("### 📝 Input") |
|
|
|
|
|
question_input = gr.Textbox( |
|
|
label="Question", |
|
|
placeholder="Enter your question here...", |
|
|
lines=3, |
|
|
max_lines=10 |
|
|
) |
|
|
|
|
|
file_input = gr.File( |
|
|
label="Optional File Upload", |
|
|
file_types=[".txt", ".csv", ".xlsx", ".py", ".json", ".png", ".jpg", ".mp3", ".wav"], |
|
|
type="filepath" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
show_reasoning = gr.Checkbox( |
|
|
label="Show detailed reasoning", |
|
|
value=False |
|
|
) |
|
|
|
|
|
submit_btn = gr.Button( |
|
|
"🔍 Process Question", |
|
|
variant="secondary" |
|
|
) |
|
|
|
|
|
|
|
|
gr.Markdown("#### 💡 Example Questions") |
|
|
|
|
|
example_questions = [ |
|
|
"How many studio albums were published by Mercedes Sosa between 2000 and 2009?", |
|
|
"What is the capital of the country that has the most time zones?", |
|
|
"Calculate the compound interest on $1000 at 5% annual rate compounded quarterly for 3 years", |
|
|
"What is the square root of the sum of the first 10 prime numbers?", |
|
|
"Who was the first person to walk on the moon and what year did it happen?", |
|
|
"Compare the GDP of Japan and Germany in 2023 and tell me the difference", |
|
|
] |
|
|
|
|
|
examples = gr.Examples( |
|
|
examples=example_questions, |
|
|
inputs=[question_input], |
|
|
cache_examples=False |
|
|
) |
|
|
|
|
|
with gr.Column(scale=3): |
|
|
|
|
|
gr.Markdown("### 📊 Results") |
|
|
|
|
|
answer_output = gr.Markdown( |
|
|
label="Answer", |
|
|
elem_classes=["output-markdown"] |
|
|
) |
|
|
|
|
|
details_output = gr.Markdown( |
|
|
label="Processing Details", |
|
|
elem_classes=["details-box"] |
|
|
) |
|
|
|
|
|
reasoning_output = gr.Markdown( |
|
|
label="Detailed Reasoning", |
|
|
visible=False, |
|
|
elem_classes=["reasoning-box"] |
|
|
) |
|
|
|
|
|
|
|
|
def run_gaia_evaluation(oauth_token: gr.OAuthToken | None, profile: gr.OAuthProfile | None): |
|
|
"""Run GAIA evaluation using Gradio's built-in OAuth""" |
|
|
start_time = time.time() |
|
|
|
|
|
|
|
|
result_logger = GAIAResultLogger() |
|
|
|
|
|
|
|
|
if oauth_token is None or profile is None: |
|
|
return "❌ Authentication Required: Please login with HuggingFace to access GAIA evaluation.", None, None, None, None, None |
|
|
|
|
|
username = profile.username if profile else "unknown_user" |
|
|
hf_token = oauth_token.token if oauth_token else None |
|
|
|
|
|
if not hf_token: |
|
|
return "❌ OAuth Token Missing: Could not extract authentication token. Please logout and login again.", None, None, None, None, None |
|
|
|
|
|
logger.info(f"✅ Starting GAIA evaluation for user: {username}") |
|
|
|
|
|
|
|
|
api_url = DEFAULT_API_URL |
|
|
questions_url = f"{api_url}/questions" |
|
|
submit_url = f"{api_url}/submit" |
|
|
|
|
|
|
|
|
space_id = os.getenv("SPACE_ID") |
|
|
|
|
|
|
|
|
try: |
|
|
logger.info("🚀 Creating GAIA Agent with LangGraph workflow and Qwen models") |
|
|
agent = GAIAAgentApp.create_with_oauth_token(hf_token) |
|
|
|
|
|
if not agent.initialized: |
|
|
return "❌ System Error: GAIA Agent failed to initialize with LangGraph workflow", None, None, None, None, None |
|
|
|
|
|
logger.info("✅ GAIA Agent initialized successfully") |
|
|
|
|
|
except ValueError as ve: |
|
|
logger.error(f"Authentication error: {ve}") |
|
|
return f"❌ Authentication Error: {ve}", None, None, None, None, None |
|
|
except RuntimeError as re: |
|
|
logger.error(f"System initialization error: {re}") |
|
|
return f"❌ System Error: {re}", None, None, None, None, None |
|
|
except Exception as e: |
|
|
logger.error(f"Unexpected error initializing agent: {e}") |
|
|
return f"❌ Unexpected Error: {e}. Please check your authentication and try again.", None, None, None, None, None |
|
|
|
|
|
|
|
|
agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Local Development" |
|
|
logger.info(f"Agent code URL: {agent_code}") |
|
|
|
|
|
|
|
|
logger.info(f"Fetching questions from: {questions_url}") |
|
|
try: |
|
|
response = requests.get(questions_url, timeout=15) |
|
|
response.raise_for_status() |
|
|
questions_data = response.json() |
|
|
if not questions_data: |
|
|
logger.error("Fetched questions list is empty.") |
|
|
return "❌ Fetched questions list is empty or invalid format.", None, None, None, None, None |
|
|
logger.info(f"✅ Fetched {len(questions_data)} questions.") |
|
|
except requests.exceptions.RequestException as e: |
|
|
logger.error(f"Error fetching questions: {e}") |
|
|
return f"❌ Error fetching questions: {e}", None, None, None, None, None |
|
|
except Exception as e: |
|
|
logger.error(f"An unexpected error occurred fetching questions: {e}") |
|
|
return f"❌ An unexpected error occurred fetching questions: {e}", None, None, None, None, None |
|
|
|
|
|
|
|
|
results_log = [] |
|
|
answers_payload = [] |
|
|
logger.info(f"🤖 Running GAIA Agent on {len(questions_data)} questions with LangGraph workflow...") |
|
|
|
|
|
for i, item in enumerate(questions_data, 1): |
|
|
task_id = item.get("task_id") |
|
|
question_text = item.get("question") |
|
|
if not task_id or question_text is None: |
|
|
logger.warning(f"Skipping item with missing task_id or question: {item}") |
|
|
continue |
|
|
|
|
|
logger.info(f"Processing question {i}/{len(questions_data)}: {task_id}") |
|
|
try: |
|
|
submitted_answer = agent(question_text) |
|
|
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer}) |
|
|
results_log.append({ |
|
|
"Task ID": task_id, |
|
|
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text, |
|
|
"Submitted Answer": submitted_answer[:200] + "..." if len(submitted_answer) > 200 else submitted_answer |
|
|
}) |
|
|
logger.info(f"✅ Question {i} processed successfully") |
|
|
except Exception as e: |
|
|
logger.error(f"Error running GAIA agent on task {task_id}: {e}") |
|
|
error_answer = f"AGENT ERROR: {str(e)}" |
|
|
answers_payload.append({"task_id": task_id, "submitted_answer": error_answer}) |
|
|
results_log.append({ |
|
|
"Task ID": task_id, |
|
|
"Question": question_text[:100] + "..." if len(question_text) > 100 else question_text, |
|
|
"Submitted Answer": error_answer |
|
|
}) |
|
|
|
|
|
if not answers_payload: |
|
|
logger.error("GAIA Agent did not produce any answers to submit.") |
|
|
return "❌ GAIA Agent did not produce any answers to submit.", pd.DataFrame(results_log), None, None, None, None |
|
|
|
|
|
|
|
|
submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload} |
|
|
status_update = f"🚀 GAIA Agent finished processing {len(answers_payload)} questions. Submitting results for user '{username}'..." |
|
|
logger.info(status_update) |
|
|
|
|
|
|
|
|
logger.info(f"📤 Submitting {len(answers_payload)} answers to: {submit_url}") |
|
|
try: |
|
|
response = requests.post(submit_url, json=submission_data, timeout=120) |
|
|
logger.info(f"📨 Unit 4 API response status: {response.status_code}") |
|
|
|
|
|
response.raise_for_status() |
|
|
result_data = response.json() |
|
|
|
|
|
|
|
|
logger.info(f"📊 Unit 4 API response data: {result_data}") |
|
|
|
|
|
|
|
|
execution_time = time.time() - start_time |
|
|
|
|
|
|
|
|
logger.info("📝 Logging evaluation results...") |
|
|
logged_files = result_logger.log_evaluation_results( |
|
|
username=username, |
|
|
questions_data=questions_data, |
|
|
results_log=results_log, |
|
|
final_result=result_data, |
|
|
execution_time=execution_time |
|
|
) |
|
|
|
|
|
|
|
|
csv_file = logged_files.get("csv") |
|
|
json_file = logged_files.get("json") |
|
|
summary_file = logged_files.get("summary") |
|
|
|
|
|
final_status = ( |
|
|
f"🎉 GAIA Agent Evaluation Complete!\n" |
|
|
f"👤 User: {result_data.get('username')}\n" |
|
|
f"🏆 Overall Score: {result_data.get('score', 'N/A')}% " |
|
|
f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n" |
|
|
f"⏱️ Execution Time: {execution_time:.2f} seconds\n" |
|
|
f"💬 API Response: {result_data.get('message', 'No message received.')}\n\n" |
|
|
f"📁 Results saved to {len([f for f in [csv_file, json_file, summary_file] if f])} files for download." |
|
|
) |
|
|
logger.info("✅ GAIA evaluation completed successfully") |
|
|
results_df = pd.DataFrame(results_log) |
|
|
|
|
|
|
|
|
csv_update = gr.update(value=csv_file, visible=csv_file is not None) |
|
|
json_update = gr.update(value=json_file, visible=json_file is not None) |
|
|
summary_update = gr.update(value=summary_file, visible=summary_file is not None) |
|
|
|
|
|
return final_status, results_df, csv_update, json_update, summary_update |
|
|
|
|
|
except requests.exceptions.HTTPError as e: |
|
|
error_detail = f"Server responded with status {e.response.status_code}." |
|
|
try: |
|
|
error_json = e.response.json() |
|
|
error_detail += f" Detail: {error_json.get('detail', e.response.text)}" |
|
|
except requests.exceptions.JSONDecodeError: |
|
|
error_detail += f" Response: {e.response.text[:500]}" |
|
|
status_message = f"❌ Submission Failed: {error_detail}" |
|
|
logger.error(status_message) |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return status_message, results_df, None, None, None |
|
|
except Exception as e: |
|
|
status_message = f"❌ An unexpected error occurred during submission: {e}" |
|
|
logger.error(status_message) |
|
|
results_df = pd.DataFrame(results_log) |
|
|
return status_message, results_df, None, None, None |
|
|
|
|
|
def update_auth_status(profile: gr.OAuthProfile | None): |
|
|
"""Update authentication status display using Gradio's OAuth""" |
|
|
if profile is None: |
|
|
return """ |
|
|
### 🔐 Authentication Status: Not Logged In |
|
|
|
|
|
Please click the "Sign in with Hugging Face" button above to access GAIA evaluation. |
|
|
|
|
|
**What you need:** |
|
|
- 🔑 HuggingFace login with `read` and `inference` permissions |
|
|
- 🤖 Access to Qwen 2.5 models via HF Inference API |
|
|
- 🧠 LangGraph multi-agent system capabilities |
|
|
|
|
|
**Expected Performance**: 30%+ GAIA score with full LangGraph workflow and Qwen models. |
|
|
""" |
|
|
else: |
|
|
return f""" |
|
|
### 🔐 Authentication Status: ✅ Logged In as {profile.username} |
|
|
|
|
|
**✅ Ready for GAIA Evaluation!** |
|
|
|
|
|
- ✅ **OAuth Profile**: {profile.name or profile.username} |
|
|
- ✅ **Qwen Model Access**: Available via HF Inference API |
|
|
- ✅ **LangGraph Workflow**: Multi-agent orchestration ready |
|
|
- ✅ **Official Evaluation**: Click "Run GAIA Evaluation" to start |
|
|
|
|
|
🎯 **Expected Results**: 30%+ GAIA score with full LangGraph workflow and Qwen models. |
|
|
""" |
|
|
|
|
|
|
|
|
interface.load( |
|
|
fn=update_auth_status, |
|
|
outputs=[auth_status_display] |
|
|
) |
|
|
|
|
|
unit4_run_button.click( |
|
|
fn=run_gaia_evaluation, |
|
|
inputs=[], |
|
|
outputs=[unit4_status_output, unit4_results_table, csv_download, json_download, summary_download] |
|
|
) |
|
|
|
|
|
|
|
|
refresh_auth_button.click( |
|
|
fn=update_auth_status, |
|
|
outputs=[auth_status_display] |
|
|
) |
|
|
|
|
|
|
|
|
def process_and_update(question, file_input, show_reasoning, oauth_token: gr.OAuthToken | None, profile: gr.OAuthProfile | None): |
|
|
"""Process question with authentication check""" |
|
|
|
|
|
if not question.strip(): |
|
|
return "❌ Please provide a question", "", "", gr.update(visible=False) |
|
|
|
|
|
|
|
|
hf_token = os.getenv("HF_TOKEN") |
|
|
|
|
|
if not hf_token and (oauth_token is None or profile is None): |
|
|
error_msg = """ |
|
|
## ❌ Authentication Required |
|
|
|
|
|
**This system requires authentication to access Qwen models and LangGraph workflow.** |
|
|
|
|
|
**How to authenticate:** |
|
|
1. 🔑 **Login with HuggingFace**: Use the "Sign in with Hugging Face" button above |
|
|
2. 🌐 **Use Official Evaluation**: Login via the GAIA Benchmark section above |
|
|
3. 📝 **Get Token**: Visit https://huggingface.co/settings/tokens to create one with `inference` permissions |
|
|
|
|
|
**Note**: Manual testing requires the same authentication as the official evaluation. |
|
|
""" |
|
|
return error_msg, "", "", gr.update(visible=False) |
|
|
|
|
|
|
|
|
auth_token = hf_token if hf_token else (oauth_token.token if oauth_token else None) |
|
|
|
|
|
if not auth_token: |
|
|
return "❌ No valid authentication token found", "", "", gr.update(visible=False) |
|
|
|
|
|
try: |
|
|
|
|
|
app = GAIAAgentApp(hf_token=auth_token) |
|
|
|
|
|
|
|
|
answer, details, reasoning = app.process_question_detailed(question, file_input, show_reasoning) |
|
|
|
|
|
|
|
|
formatted_answer = f""" |
|
|
## 🎯 Answer |
|
|
|
|
|
{answer} |
|
|
""" |
|
|
|
|
|
|
|
|
formatted_details = f""" |
|
|
## 📋 Processing Details |
|
|
|
|
|
{details} |
|
|
""" |
|
|
|
|
|
|
|
|
reasoning_visible = show_reasoning and reasoning.strip() |
|
|
|
|
|
return ( |
|
|
formatted_answer, |
|
|
formatted_details, |
|
|
reasoning if reasoning_visible else "", |
|
|
gr.update(visible=reasoning_visible) |
|
|
) |
|
|
|
|
|
except ValueError as ve: |
|
|
error_msg = f""" |
|
|
## ❌ Authentication Error |
|
|
|
|
|
{str(ve)} |
|
|
|
|
|
**Solution**: Please ensure your authentication has `inference` permissions. |
|
|
""" |
|
|
return error_msg, "", "", gr.update(visible=False) |
|
|
|
|
|
except RuntimeError as re: |
|
|
error_msg = f""" |
|
|
## ❌ System Error |
|
|
|
|
|
{str(re)} |
|
|
|
|
|
**This may be due to:** |
|
|
- Qwen model access issues |
|
|
- HuggingFace Inference API unavailability |
|
|
- Network connectivity problems |
|
|
""" |
|
|
return error_msg, "", "", gr.update(visible=False) |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f""" |
|
|
## ❌ Unexpected Error |
|
|
|
|
|
{str(e)} |
|
|
|
|
|
**Please try again or contact support if the issue persists.** |
|
|
""" |
|
|
return error_msg, "", "", gr.update(visible=False) |
|
|
|
|
|
submit_btn.click( |
|
|
fn=process_and_update, |
|
|
inputs=[question_input, file_input, show_reasoning], |
|
|
outputs=[answer_output, details_output, reasoning_output, reasoning_output] |
|
|
) |
|
|
|
|
|
|
|
|
show_reasoning.change( |
|
|
fn=lambda show: gr.update(visible=show), |
|
|
inputs=[show_reasoning], |
|
|
outputs=[reasoning_output] |
|
|
) |
|
|
|
|
|
|
|
|
gr.Markdown(""" |
|
|
--- |
|
|
|
|
|
### 🔧 System Architecture |
|
|
|
|
|
**LangGraph Multi-Agent Workflow:** |
|
|
- **Router Agent**: Classifies questions and selects appropriate specialized agents (using 32B model for better accuracy) |
|
|
- **Web Research Agent**: Multi-engine search with DuckDuckGo (primary), Tavily API (secondary), Wikipedia (fallback) |
|
|
- **File Processing Agent**: Processes uploaded files (CSV, images, code, audio) |
|
|
- **Reasoning Agent**: Handles mathematical calculations and logical reasoning |
|
|
- **Synthesizer Agent**: Combines results from multiple agents into final answers |
|
|
|
|
|
**Models Used**: Qwen 2.5 (7B/32B/72B) with intelligent tier selection for optimal cost/performance |
|
|
|
|
|
**Tools Available**: Multi-engine web search (DuckDuckGo + Tavily + Wikipedia), mathematical calculator, multi-format file processor |
|
|
|
|
|
### 📈 Performance Metrics |
|
|
- **Success Rate**: 30%+ expected on GAIA benchmark with full authentication |
|
|
- **Average Response Time**: ~3-5 seconds per question depending on complexity |
|
|
- **Cost Efficiency**: $0.01-0.40 per question depending on model tier selection |
|
|
- **Architecture**: Multi-agent LangGraph orchestration with intelligent synthesis |
|
|
- **Reliability**: Robust error handling and graceful degradation within workflow |
|
|
- **Web Search**: 3-tier search system (DuckDuckGo → Tavily → Wikipedia) with smart query optimization |
|
|
|
|
|
### 🎯 Authentication Requirements |
|
|
- **HF_TOKEN Environment Variable**: Best performance with full access to Qwen models |
|
|
- **OAuth with Inference Scope**: Full access to Qwen 2.5 models via HuggingFace Inference API |
|
|
- **Optional**: TAVILY_API_KEY for enhanced web search capabilities (1,000 free searches/month) |
|
|
- **No Fallback Options**: System requires proper authentication for multi-agent functionality |
|
|
""") |
|
|
|
|
|
return interface |
|
|
|
|
|
def main(): |
|
|
"""Main application entry point""" |
|
|
|
|
|
|
|
|
is_production = ( |
|
|
os.getenv("GRADIO_ENV") == "production" or |
|
|
os.getenv("SPACE_ID") is not None or |
|
|
os.getenv("SPACE_HOST") is not None |
|
|
) |
|
|
|
|
|
|
|
|
space_host = os.getenv("SPACE_HOST") |
|
|
space_id = os.getenv("SPACE_ID") |
|
|
|
|
|
if space_host: |
|
|
logger.info(f"✅ SPACE_HOST found: {space_host}") |
|
|
logger.info(f" Runtime URL: https://{space_host}") |
|
|
else: |
|
|
logger.info("ℹ️ SPACE_HOST environment variable not found (running locally?).") |
|
|
|
|
|
if space_id: |
|
|
logger.info(f"✅ SPACE_ID found: {space_id}") |
|
|
logger.info(f" Repo URL: https://huggingface.co/spaces/{space_id}") |
|
|
else: |
|
|
logger.info("ℹ️ SPACE_ID environment variable not found (running locally?).") |
|
|
|
|
|
logger.info(f"🔧 Production mode: {is_production}") |
|
|
|
|
|
|
|
|
interface = create_interface() |
|
|
|
|
|
|
|
|
if is_production: |
|
|
|
|
|
launch_kwargs = { |
|
|
"server_name": "0.0.0.0", |
|
|
"server_port": int(os.getenv("PORT", 7860)), |
|
|
"share": False, |
|
|
"debug": False, |
|
|
"show_error": True, |
|
|
"quiet": False, |
|
|
"favicon_path": None, |
|
|
"auth": None, |
|
|
|
|
|
"auth_message": "Login with HuggingFace for full inference access to models", |
|
|
} |
|
|
logger.info(f"🚀 Launching in PRODUCTION mode on 0.0.0.0:{launch_kwargs['server_port']}") |
|
|
logger.info("🔑 OAuth configured to request 'read' and 'inference' scopes") |
|
|
else: |
|
|
|
|
|
launch_kwargs = { |
|
|
"server_name": "127.0.0.1", |
|
|
"server_port": 7860, |
|
|
"share": False, |
|
|
"debug": True, |
|
|
"show_error": True, |
|
|
"quiet": False, |
|
|
"favicon_path": None, |
|
|
"inbrowser": True, |
|
|
"auth_message": "Login with HuggingFace for full inference access to models", |
|
|
} |
|
|
logger.info("🔧 Launching in DEVELOPMENT mode on 127.0.0.1:7860") |
|
|
|
|
|
|
|
|
if is_production: |
|
|
|
|
|
os.environ["OAUTH_SCOPES"] = "read,inference" |
|
|
os.environ["OAUTH_CLIENT_ID"] = os.getenv("OAUTH_CLIENT_ID", "") |
|
|
logger.info("🔐 OAuth environment configured for inference access") |
|
|
|
|
|
interface.launch(**launch_kwargs) |
|
|
|
|
|
def process_question_with_gaia_agent(question_text: str, question_id: str = None, |
|
|
file_name: str = None, file_content: bytes = None) -> Dict[str, Any]: |
|
|
""" |
|
|
Process a GAIA question using enhanced multi-phase planning workflow |
|
|
""" |
|
|
try: |
|
|
logger.info(f"📝 Processing GAIA question with enhanced workflow: {question_text[:100]}...") |
|
|
|
|
|
|
|
|
llm_client = QwenClient() |
|
|
gaia_agent = GAIAAgentApp.create_with_qwen_client(llm_client) |
|
|
|
|
|
|
|
|
result = gaia_agent.process_with_langgraph(question_text, question_id) |
|
|
|
|
|
|
|
|
enhanced_result = { |
|
|
"question_id": question_id or "unknown", |
|
|
"question": question_text, |
|
|
"answer": result["answer"], |
|
|
"confidence": result["confidence"], |
|
|
"reasoning": result["reasoning"], |
|
|
"cost_estimate": result["cost"], |
|
|
"processing_time": result["processing_time"], |
|
|
"workflow_type": "enhanced_multi_phase", |
|
|
"processing_details": result["processing_details"], |
|
|
"agent_results": result["agent_results"], |
|
|
"success": len(result["errors"]) == 0, |
|
|
"error_messages": result["errors"] |
|
|
} |
|
|
|
|
|
return enhanced_result |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f"Enhanced GAIA processing failed: {str(e)}" |
|
|
logger.error(error_msg) |
|
|
return { |
|
|
"question_id": question_id or "unknown", |
|
|
"question": question_text, |
|
|
"answer": "Enhanced processing failed", |
|
|
"confidence": 0.0, |
|
|
"reasoning": error_msg, |
|
|
"cost_estimate": 0.0, |
|
|
"processing_time": 0.0, |
|
|
"workflow_type": "enhanced_multi_phase_failed", |
|
|
"processing_details": {"error": error_msg}, |
|
|
"agent_results": [], |
|
|
"success": False, |
|
|
"error_messages": [error_msg] |
|
|
} |
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |