#!/usr/bin/env python3
"""
Final Answer Tool for GAIA Agent System
Extracts precise, EXACT MATCH compliant answers from agent results
"""

import re
import logging
from typing import Dict, Any

from models.qwen_client import QwenClient, ModelTier

logger = logging.getLogger(__name__)


class FinalAnswerTool:
    """
    Tool for extracting precise, GAIA-compliant final answers.
    Ensures EXACT MATCH compatibility for Unit 4 API submission.
    """

    def __init__(self, llm_client: QwenClient):
        self.llm_client = llm_client

    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "general") -> Dict[str, Any]:
        """
        Extract a GAIA-compliant final answer from the combined agent results.
        """
        logger.info("🎯 Extracting GAIA-compliant final answer")
        try:
            # Build an extraction prompt specialized for the question type
            extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)

            # Use the 72B model tier for precise extraction
            llm_result = self.llm_client.generate(
                extraction_prompt,
                tier=ModelTier.COMPLEX,  # Always use the most capable model
                max_tokens=100  # Keep the answer concise
            )

            if llm_result.success:
                # Clean and validate the extracted answer
                raw_answer = llm_result.response.strip()
                final_answer = self._clean_and_validate_answer(raw_answer, question, question_type)

                # Assess answer quality
                confidence = self._assess_answer_quality(final_answer, question, agent_results, question_type)

                return {
                    "answer": final_answer,
                    "confidence": confidence,
                    "reasoning": f"Extracted from {question_type} analysis using 72B model",
                    "raw_response": raw_answer,
                    "validation_passed": 0 < len(final_answer) <= 100,
                }
            else:
                # LLM call failed: fall back to simple heuristic extraction
                return self._fallback_extraction(question, agent_results)
        except Exception as e:
            logger.error(f"Final answer extraction failed: {e}")
            return self._fallback_extraction(question, agent_results)

    def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
        """Create a specialized extraction prompt based on the question type."""
        base_instructions = """
CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
Question: {question}
Analysis from agents:
{agent_results}
"""

        # Specialized instructions based on question type
        if question_type == "mathematical" or "how many" in question.lower():
            type_instructions = """
This is a counting/mathematical question. Respond with ONLY the number.
Examples of correct responses: "5", "42", "0"
Do NOT include words like "albums", "songs", "items", etc.
"""
        elif question_type == "yes_no":
            type_instructions = """
This is a yes/no question. Respond with ONLY "yes" or "no".
"""
        elif question_type == "name" or any(word in question.lower() for word in ["who", "name"]):
            type_instructions = """
This is asking for a name. Respond with ONLY the name requested.
Examples: "John Smith", "Mike102", "Einstein"
"""
        elif question_type == "location":
            type_instructions = """
This is asking for a location. Respond with ONLY the location name.
Examples: "Paris", "New York", "LIE", "Hanoi"
"""
        elif question_type == "text_manipulation":
            type_instructions = """
This involves text manipulation. Respond with ONLY the processed text result.
Examples: "right", "hello", "12345"
"""
        else:
            type_instructions = """
Respond with ONLY the direct answer requested.
Keep it concise and specific.
"""

        ending_instructions = """
EXTRACT ONLY THE ANSWER:"""

        return base_instructions.format(
            question=question,
            agent_results=agent_results[:2000]  # Limit input length
        ) + type_instructions + ending_instructions
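
    # Illustrative example: for a "How many ..." question, the assembled prompt
    # produced by _create_extraction_prompt above is roughly:
    #
    #   CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
    #   Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
    #   Question: <the question text>
    #   Analysis from agents:
    #   <agent_results, truncated to 2000 characters>
    #   This is a counting/mathematical question. Respond with ONLY the number.
    #   Examples of correct responses: "5", "42", "0"
    #   Do NOT include words like "albums", "songs", "items", etc.
    #   EXTRACT ONLY THE ANSWER: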

    def _clean_and_validate_answer(self, raw_answer: str, question: str, question_type: str) -> str:
        """Clean and validate the extracted answer."""
        answer = raw_answer.strip()

        # Strip common answer prefixes, checked in list order
        prefixes_to_remove = [
            "final answer:", "answer:", "the answer is:", "result:", "conclusion:",
            "based on", "according to", "therefore", "thus", "so", "hence",
            "final answer is", "the result is", "it is", "this is"
        ]
        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()

        # Remove quotes if they wrap the entire answer
        if answer.startswith('"') and answer.endswith('"'):
            answer = answer[1:-1]
        elif answer.startswith("'") and answer.endswith("'"):
            answer = answer[1:-1]

        # Remove trailing punctuation that is not part of the answer
        while answer and answer[-1] in '.!?:;':
            answer = answer[:-1]

        # Type-specific normalization
        if question_type == "mathematical" or "how many" in question.lower():
            # Keep only the first integer found
            numbers = re.findall(r'\b\d+\b', answer)
            if numbers:
                answer = numbers[0]
        elif question_type == "yes_no":
            # Normalize yes/no answers using whole-word matches so that,
            # for example, "unknown" or "cannot" is not mistaken for "no"
            if re.search(r'\b(yes|true|correct|right)\b', answer, re.IGNORECASE):
                answer = "yes"
            elif re.search(r'\b(no|false|incorrect|wrong)\b', answer, re.IGNORECASE):
                answer = "no"

        # Final cleanup
        answer = answer.strip()

        # Ensure the answer is not empty; fall back to the last word of the raw response
        if not answer:
            words = raw_answer.split()
            if words:
                answer = words[-1]

        return answer
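
    # Cleaning example (illustrative): a raw LLM response of
    #   "The answer is: Paris."
    # has its prefix stripped and trailing period removed, yielding the
    # exact-match string "Paris".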

    def _assess_answer_quality(self, answer: str, question: str, agent_results: str, question_type: str) -> float:
        """Assess the quality/confidence of the extracted answer, returning a score in [0.1, 0.95]."""
        confidence = 0.7  # Base confidence

        # Factor 1: answer length appropriateness
        if len(answer) == 0:
            return 0.1  # Very low confidence for empty answers
        elif len(answer) > 100:
            confidence -= 0.2  # Too long for GAIA exact-match scoring
        elif 1 <= len(answer) <= 50:
            confidence += 0.1  # Good length

        # Factor 2: answer form matches the question type
        question_lower = question.lower()
        if ("how many" in question_lower or question_type == "mathematical") and re.fullmatch(r'\d+', answer):
            confidence += 0.15  # Numeric answer to a counting question
        elif ("who" in question_lower or "name" in question_lower) and len(answer.split()) <= 3:
            confidence += 0.1  # Name-like answer to a "who" question
        elif "where" in question_lower and len(answer.split()) <= 2:
            confidence += 0.1  # Location-like answer
        elif "yes or no" in question_lower and answer.lower() in ["yes", "no"]:
            confidence += 0.15  # Clean yes/no answer

        # Factor 3: answer appears in the agent results (indicates it was found, not invented)
        if answer.lower() in agent_results.lower():
            confidence += 0.1

        # Factor 4: answer specificity
        if re.search(r'\b\d{4}\b', answer):  # Contains a year-like number
            confidence += 0.05
        if re.search(r'\b[A-Z][a-z]+\b', answer):  # Contains a proper noun
            confidence += 0.05

        # Factor 5: common failure patterns
        failure_indicators = ['unknown', 'unclear', 'not found', 'unable to determine', 'no information']
        if any(indicator in answer.lower() for indicator in failure_indicators):
            confidence -= 0.3

        return max(0.1, min(0.95, confidence))
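
    # Worked example (illustrative): for the question "Where was the award
    # ceremony held?" with answer "Paris" that also appears in the agent
    # results, the score is 0.7 (base) + 0.1 (good length) + 0.1 (location-like
    # answer) + 0.1 (found in agent results) + 0.05 (proper noun) = 1.05,
    # which is clipped to the 0.95 ceiling.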

    def _fallback_extraction(self, question: str, agent_results: str) -> Dict[str, Any]:
        """Simple heuristic fallback used when LLM extraction fails."""
        lines = agent_results.split('\n')

        # Collect short lines that might contain a direct answer
        potential_answers = []
        for line in lines:
            line = line.strip()
            if 0 < len(line) < 100:
                # Skip lines that are clearly explanatory
                if not any(word in line.lower() for word in ['according', 'based on', 'however', 'therefore', 'because']):
                    potential_answers.append(line)

        # Use the first plausible candidate, or a placeholder if nothing qualifies
        answer = potential_answers[0] if potential_answers else "Unable to determine"

        return {
            "answer": answer,
            "confidence": 0.3,
            "reasoning": "Fallback extraction due to LLM failure",
            "raw_response": agent_results[:100],
            "validation_passed": False,
        }
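

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). FinalAnswerTool only relies on an object
# exposing a QwenClient-compatible generate(prompt, tier=..., max_tokens=...)
# method whose result has .success and .response attributes, as used above.
# _StubClient below is a hypothetical stand-in for quick local testing, not
# part of the real models.qwen_client API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    class _StubClient:
        """Minimal stand-in that always returns a canned LLM response."""

        def generate(self, prompt, tier=None, max_tokens=None):
            # Pretend the LLM echoed a prefixed answer that needs cleaning
            return SimpleNamespace(success=True, response="Final answer: 42.")

    tool = FinalAnswerTool(_StubClient())
    result = tool.extract_final_answer(
        question="How many studio albums were released?",
        agent_results="The discography page lists 42 studio albums in total.",
        question_type="mathematical",
    )
    print(result)  # "answer" should be the bare string "42"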