#!/usr/bin/env python3
"""
Final Answer Tool for GAIA Agent System
Extracts precise, EXACT MATCH compliant answers from agent results
"""

import re
import logging
from typing import Dict, Any

from models.qwen_client import QwenClient, ModelTier

logger = logging.getLogger(__name__)


class FinalAnswerTool:
    """
    Tool for extracting precise, GAIA-compliant final answers.
    Ensures EXACT MATCH compatibility for Unit 4 API submission.
    """

    def __init__(self, llm_client: QwenClient):
        self.llm_client = llm_client

    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "general") -> Dict[str, Any]:
        """
        Extract a GAIA-compliant final answer from the combined agent results.
        """
        logger.info("🎯 Extracting GAIA-compliant final answer")
        try:
            # Build an extraction prompt specialized for the question type
            extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)

            # Use the 72B model tier for precise extraction
            llm_result = self.llm_client.generate(
                extraction_prompt,
                tier=ModelTier.COMPLEX,  # Always use the most capable model
                max_tokens=100  # Keep the answer concise
            )

            if llm_result.success:
                # Clean and validate the extracted answer
                raw_answer = llm_result.response.strip()
                final_answer = self._clean_and_validate_answer(raw_answer, question, question_type)

                # Assess answer quality
                confidence = self._assess_answer_quality(final_answer, question, agent_results, question_type)

                return {
                    "answer": final_answer,
                    "confidence": confidence,
                    "reasoning": f"Extracted from {question_type} analysis using 72B model",
                    "raw_response": raw_answer,
                    "validation_passed": 0 < len(final_answer) <= 100,
                }
            else:
                # LLM call failed: fall back to simple heuristic extraction
                return self._fallback_extraction(question, agent_results)
        except Exception as e:
            logger.error(f"Final answer extraction failed: {e}")
            return self._fallback_extraction(question, agent_results)

    def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
        """Create a specialized extraction prompt based on the question type."""
        base_instructions = """
CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
Question: {question}
Analysis from agents:
{agent_results}
"""

        # Specialized instructions based on question type
        if question_type == "mathematical" or "how many" in question.lower():
            type_instructions = """
This is a counting/mathematical question. Respond with ONLY the number.
Examples of correct responses: "5", "42", "0"
Do NOT include words like "albums", "songs", "items", etc.
"""
        elif question_type == "yes_no":
            type_instructions = """
This is a yes/no question. Respond with ONLY "yes" or "no".
"""
        elif question_type == "name" or any(word in question.lower() for word in ["who", "name"]):
            type_instructions = """
This is asking for a name. Respond with ONLY the name requested.
Examples: "John Smith", "Mike102", "Einstein"
"""
        elif question_type == "location":
            type_instructions = """
This is asking for a location. Respond with ONLY the location name.
Examples: "Paris", "New York", "LIE", "Hanoi"
"""
        elif question_type == "text_manipulation":
            type_instructions = """
This involves text manipulation. Respond with ONLY the processed text result.
Examples: "right", "hello", "12345"
"""
        else:
            type_instructions = """
Respond with ONLY the direct answer requested.
Keep it concise and specific.
"""

        ending_instructions = """
EXTRACT ONLY THE ANSWER:"""

        return base_instructions.format(
            question=question,
            agent_results=agent_results[:2000]  # Limit input length
        ) + type_instructions + ending_instructions
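
    # Illustrative example: for a "How many ..." question, the assembled prompt
    # produced by _create_extraction_prompt above is roughly:
    #
    #   CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
    #   Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
    #   Question: <the question text>
    #   Analysis from agents:
    #   <agent_results, truncated to 2000 characters>
    #   This is a counting/mathematical question. Respond with ONLY the number.
    #   Examples of correct responses: "5", "42", "0"
    #   Do NOT include words like "albums", "songs", "items", etc.
    #   EXTRACT ONLY THE ANSWER: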

    def _clean_and_validate_answer(self, raw_answer: str, question: str, question_type: str) -> str:
        """Clean and validate the extracted answer."""
        answer = raw_answer.strip()

        # Strip common answer prefixes, checked in list order
        prefixes_to_remove = [
            "final answer:", "answer:", "the answer is:", "result:", "conclusion:",
            "based on", "according to", "therefore", "thus", "so", "hence",
            "final answer is", "the result is", "it is", "this is"
        ]
        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()

        # Remove quotes if they wrap the entire answer
        if answer.startswith('"') and answer.endswith('"'):
            answer = answer[1:-1]
        elif answer.startswith("'") and answer.endswith("'"):
            answer = answer[1:-1]

        # Remove trailing punctuation that is not part of the answer
        while answer and answer[-1] in '.!?:;':
            answer = answer[:-1]

        # Type-specific normalization
        if question_type == "mathematical" or "how many" in question.lower():
            # Keep only the first integer found
            numbers = re.findall(r'\b\d+\b', answer)
            if numbers:
                answer = numbers[0]
        elif question_type == "yes_no":
            # Normalize yes/no answers using whole-word matches so that,
            # for example, "unknown" or "cannot" is not mistaken for "no"
            if re.search(r'\b(yes|true|correct|right)\b', answer, re.IGNORECASE):
                answer = "yes"
            elif re.search(r'\b(no|false|incorrect|wrong)\b', answer, re.IGNORECASE):
                answer = "no"

        # Final cleanup
        answer = answer.strip()

        # Ensure the answer is not empty; fall back to the last word of the raw response
        if not answer:
            words = raw_answer.split()
            if words:
                answer = words[-1]

        return answer
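
    # Cleaning example (illustrative): a raw LLM response of
    #   "The answer is: Paris."
    # has its prefix stripped and trailing period removed, yielding the
    # exact-match string "Paris".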

    def _assess_answer_quality(self, answer: str, question: str, agent_results: str, question_type: str) -> float:
        """Assess the quality/confidence of the extracted answer, returning a score in [0.1, 0.95]."""
        confidence = 0.7  # Base confidence

        # Factor 1: answer length appropriateness
        if len(answer) == 0:
            return 0.1  # Very low confidence for empty answers
        elif len(answer) > 100:
            confidence -= 0.2  # Too long for GAIA exact-match scoring
        elif 1 <= len(answer) <= 50:
            confidence += 0.1  # Good length

        # Factor 2: answer form matches the question type
        question_lower = question.lower()
        if ("how many" in question_lower or question_type == "mathematical") and re.fullmatch(r'\d+', answer):
            confidence += 0.15  # Numeric answer to a counting question
        elif ("who" in question_lower or "name" in question_lower) and len(answer.split()) <= 3:
            confidence += 0.1  # Name-like answer to a "who" question
        elif "where" in question_lower and len(answer.split()) <= 2:
            confidence += 0.1  # Location-like answer
        elif "yes or no" in question_lower and answer.lower() in ["yes", "no"]:
            confidence += 0.15  # Clean yes/no answer

        # Factor 3: answer appears in the agent results (indicates it was found, not invented)
        if answer.lower() in agent_results.lower():
            confidence += 0.1

        # Factor 4: answer specificity
        if re.search(r'\b\d{4}\b', answer):  # Contains a year-like number
            confidence += 0.05
        if re.search(r'\b[A-Z][a-z]+\b', answer):  # Contains a proper noun
            confidence += 0.05

        # Factor 5: common failure patterns
        failure_indicators = ['unknown', 'unclear', 'not found', 'unable to determine', 'no information']
        if any(indicator in answer.lower() for indicator in failure_indicators):
            confidence -= 0.3

        return max(0.1, min(0.95, confidence))
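
    # Worked example (illustrative): for the question "Where was the award
    # ceremony held?" with answer "Paris" that also appears in the agent
    # results, the score is 0.7 (base) + 0.1 (good length) + 0.1 (location-like
    # answer) + 0.1 (found in agent results) + 0.05 (proper noun) = 1.05,
    # which is clipped to the 0.95 ceiling.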

    def _fallback_extraction(self, question: str, agent_results: str) -> Dict[str, Any]:
        """Simple heuristic fallback used when LLM extraction fails."""
        lines = agent_results.split('\n')

        # Collect short lines that might contain a direct answer
        potential_answers = []
        for line in lines:
            line = line.strip()
            if 0 < len(line) < 100:
                # Skip lines that are clearly explanatory
                if not any(word in line.lower() for word in ['according', 'based on', 'however', 'therefore', 'because']):
                    potential_answers.append(line)

        # Use the first plausible candidate, or a placeholder if nothing qualifies
        answer = potential_answers[0] if potential_answers else "Unable to determine"

        return {
            "answer": answer,
            "confidence": 0.3,
            "reasoning": "Fallback extraction due to LLM failure",
            "raw_response": agent_results[:100],
            "validation_passed": False,
        }
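

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). FinalAnswerTool only relies on an object
# exposing a QwenClient-compatible generate(prompt, tier=..., max_tokens=...)
# method whose result has .success and .response attributes, as used above.
# _StubClient below is a hypothetical stand-in for quick local testing, not
# part of the real models.qwen_client API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    class _StubClient:
        """Minimal stand-in that always returns a canned LLM response."""

        def generate(self, prompt, tier=None, max_tokens=None):
            # Pretend the LLM echoed a prefixed answer that needs cleaning
            return SimpleNamespace(success=True, response="Final answer: 42.")

    tool = FinalAnswerTool(_StubClient())
    result = tool.extract_final_answer(
        question="How many studio albums were released?",
        agent_results="The discography page lists 42 studio albums in total.",
        question_type="mathematical",
    )
    print(result)  # "answer" should be the bare string "42"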