#!/usr/bin/env python3
"""
Final Answer Tool for GAIA Agent System
Extracts precise, EXACT MATCH compliant answers from agent results
"""

import re
import logging
from typing import Dict, Any

from models.qwen_client import QwenClient, ModelTier

logger = logging.getLogger(__name__)

class FinalAnswerTool:
    """
    Tool for extracting precise, GAIA-compliant final answers
    Ensures EXACT MATCH compatibility for Unit 4 API submission
    """
    
    def __init__(self, llm_client: QwenClient):
        self.llm_client = llm_client
        
    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "general") -> Dict[str, Any]:
        """
        Extract GAIA-compliant final answer with enhanced accuracy
        """
        logger.info("🎯 Extracting GAIA-compliant final answer")
        
        try:
            # Create specialized extraction prompt based on question type
            extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)
            
            # Use 72B model for precise extraction
            llm_result = self.llm_client.generate(
                extraction_prompt,
                tier=ModelTier.COMPLEX,  # Always use most capable model
                max_tokens=100  # Keep answer concise
            )
            
            if llm_result.success:
                # Clean and validate the extracted answer
                raw_answer = llm_result.response.strip()
                final_answer = self._clean_and_validate_answer(raw_answer, question, question_type)
                
                # Assess answer quality
                confidence = self._assess_answer_quality(final_answer, question, agent_results, question_type)
                
                return {
                    "answer": final_answer,
                    "confidence": confidence,
                    "reasoning": f"Extracted from {question_type} analysis using 72B model",
                    "raw_response": raw_answer,
                    "validation_passed": len(final_answer) <= 100 and len(final_answer) > 0
                }
            else:
                # Fallback to simple extraction
                return self._fallback_extraction(question, agent_results)
                
        except Exception as e:
            logger.error(f"Final answer extraction failed: {e}")
            return self._fallback_extraction(question, agent_results)
    
    def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
        """Create specialized extraction prompt based on question type"""
        
        base_instructions = """
        CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
        Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
        
        Question: {question}
        
        Analysis from agents:
        {agent_results}
        
        """
        
        # Specialized instructions based on question type
        if question_type == "mathematical" or "how many" in question.lower():
            type_instructions = """
        This is a counting/mathematical question. Respond with ONLY the number.
        Examples of correct responses: "5", "42", "0"
        Do NOT include words like "albums", "songs", "items", etc.
        """
        
        elif question_type == "yes_no":
            type_instructions = """
        This is a yes/no question. Respond with ONLY "yes" or "no".
        """
        
        elif question_type == "name" or any(word in question.lower() for word in ["who", "name"]):
            type_instructions = """
        This is asking for a name. Respond with ONLY the name requested.
        Examples: "John Smith", "Mike102", "Einstein"
        """
        
        elif question_type == "location":
            type_instructions = """
        This is asking for a location. Respond with ONLY the location name.
        Examples: "Paris", "New York", "LIE", "Hanoi"
        """
        
        elif question_type == "text_manipulation":
            type_instructions = """
        This involves text manipulation. Respond with ONLY the processed text result.
        Examples: "right", "hello", "12345"
        """
        
        else:
            type_instructions = """
        Respond with ONLY the direct answer requested.
        Keep it concise and specific.
        """
        
        ending_instructions = """
        
        EXTRACT ONLY THE ANSWER:"""
        
        return base_instructions.format(
            question=question,
            agent_results=agent_results[:2000]  # Limit input length
        ) + type_instructions + ending_instructions
    
    def _clean_and_validate_answer(self, raw_answer: str, question: str, question_type: str) -> str:
        """Clean and validate the extracted answer"""
        
        # Remove common prefixes and suffixes
        answer = raw_answer.strip()
        
        # Remove common answer prefixes
        prefixes_to_remove = [
            "final answer:", "answer:", "the answer is:", "result:", "conclusion:",
            "based on", "according to", "therefore", "thus", "so", "hence",
            "final answer is", "the result is", "it is", "this is"
        ]
        
        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            # Strip a prefix only at a word boundary, so that e.g. "so" does
            # not eat the start of an answer like "south africa"
            if answer_lower.startswith(prefix) and (
                len(answer) == len(prefix) or not answer[len(prefix)].isalnum()
            ):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()
        
        # Remove quotes if they wrap the entire answer
        if len(answer) >= 2 and answer[0] == answer[-1] and answer[0] in '"\'':
            answer = answer[1:-1]
        
        # Remove trailing punctuation that's not part of the answer
        while answer and answer[-1] in '.!?:;':
            answer = answer[:-1]
        
        # Special handling for different question types
        if question_type == "mathematical" or "how many" in question.lower():
            # Extract just the number
            numbers = re.findall(r'\b\d+\b', answer)
            if numbers:
                answer = numbers[0]
        
        elif question_type == "yes_no":
            # Normalize yes/no answers
            if any(word in answer.lower() for word in ['yes', 'true', 'correct', 'right']):
                answer = "yes"
            elif any(word in answer.lower() for word in ['no', 'false', 'incorrect', 'wrong']):
                answer = "no"
        
        # Final cleanup
        answer = answer.strip()
        
        # Ensure answer is not empty
        if not answer:
            # Try to extract from the original raw answer
            words = raw_answer.split()
            if words:
                answer = words[-1]  # Take the last word as fallback
        
        return answer
    
    def _assess_answer_quality(self, answer: str, question: str, agent_results: str, question_type: str) -> float:
        """Assess the quality/confidence of the extracted answer"""
        
        confidence = 0.7  # Base confidence
        
        # Factor 1: Answer length appropriateness
        if len(answer) == 0:
            return 0.1  # Very low confidence for empty answers
        elif len(answer) > 100:
            confidence -= 0.2  # Too long for GAIA
        elif 1 <= len(answer) <= 50:
            confidence += 0.1  # Good length
        
        # Factor 2: Question type matching
        question_lower = question.lower()
        
        if ("how many" in question_lower or question_type == "mathematical") and re.match(r'^\d+$', answer):
            confidence += 0.15  # Numeric answer to counting question
        elif ("who" in question_lower or "name" in question_lower) and len(answer.split()) <= 3:
            confidence += 0.1   # Name-like answer to who question
        elif ("where" in question_lower) and len(answer.split()) <= 2:
            confidence += 0.1   # Location-like answer
        elif ("yes or no" in question_lower) and answer.lower() in ["yes", "no"]:
            confidence += 0.15  # Perfect yes/no answer
        
        # Factor 3: Answer appears in agent results (indicates it was found)
        if answer.lower() in agent_results.lower():
            confidence += 0.1
        
        # Factor 4: Answer specificity
        if re.search(r'\b\d{4}\b', answer):  # Contains a 4-digit year
            confidence += 0.05
        if re.search(r'\b[A-Z][a-z]+\b', answer):  # Capitalized word (proper-noun heuristic)
            confidence += 0.05
        
        # Factor 5: Common failure patterns
        failure_indicators = ['unknown', 'unclear', 'not found', 'unable to determine', 'no information']
        if any(indicator in answer.lower() for indicator in failure_indicators):
            confidence -= 0.3
        
        return max(0.1, min(0.95, confidence))
    
    def _fallback_extraction(self, question: str, agent_results: str) -> Dict[str, Any]:
        """Simple fallback when LLM extraction fails"""
        
        # Try to extract a reasonable answer from agent results
        lines = agent_results.split('\n')
        
        # Look for lines that might contain answers
        potential_answers = []
        for line in lines:
            line = line.strip()
            if 0 < len(line) < 100:
                # Skip lines that are clearly explanatory
                if not any(word in line.lower() for word in ['according', 'based on', 'however', 'therefore', 'because']):
                    potential_answers.append(line)
        
        # Use the first reasonable answer or a fallback
        answer = potential_answers[0] if potential_answers else "Unable to determine"
        
        return {
            "answer": answer,
            "confidence": 0.3,
            "reasoning": "Fallback extraction due to LLM failure",
            "raw_response": agent_results[:100],
            "validation_passed": False
        }
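

if __name__ == "__main__":
    # Minimal usage sketch. The real models.qwen_client API is not shown in
    # this file, so a hypothetical stub stands in for QwenClient; it only
    # mimics the interface this tool relies on: generate() returning an
    # object with .success and .response attributes.
    from types import SimpleNamespace

    class _StubClient:
        def generate(self, prompt, tier=None, max_tokens=None):
            # Pretend the model returned a noisy answer that needs cleaning
            return SimpleNamespace(success=True, response="The answer is: 42.")

    tool = FinalAnswerTool(_StubClient())  # stub in place of a real QwenClient
    result = tool.extract_final_answer(
        question="How many studio albums did the band release?",
        agent_results="Research notes: the band released 42 studio albums.",
        question_type="mathematical",
    )
    print(result["answer"])      # -> "42" after prefix/punctuation cleanup
    print(result["confidence"])  # -> heuristic score clamped to [0.1, 0.95]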