#!/usr/bin/env python3
"""
Final Answer Tool for GAIA Agent System
Extracts precise, EXACT MATCH compliant answers from agent results
"""

import re
import logging
from typing import Dict, Any

from models.qwen_client import QwenClient, ModelTier

logger = logging.getLogger(__name__)

class FinalAnswerTool:
    """
    Tool for extracting precise, GAIA-compliant final answers
    Ensures EXACT MATCH compatibility for Unit 4 API submission
    """
    
    def __init__(self, llm_client: QwenClient):
        self.llm_client = llm_client
        
    def extract_final_answer(self, question: str, agent_results: str, question_type: str = "general") -> Dict[str, Any]:
        """
        Extract GAIA-compliant final answer with enhanced accuracy
        """
        logger.info("🎯 Extracting GAIA-compliant final answer")
        
        try:
            # Create specialized extraction prompt based on question type
            extraction_prompt = self._create_extraction_prompt(question, agent_results, question_type)
            
            # Use 72B model for precise extraction
            llm_result = self.llm_client.generate(
                extraction_prompt,
                tier=ModelTier.COMPLEX,  # Always use most capable model
                max_tokens=100  # Keep answer concise
            )
            
            if llm_result.success:
                # Clean and validate the extracted answer
                raw_answer = llm_result.response.strip()
                final_answer = self._clean_and_validate_answer(raw_answer, question, question_type)
                
                # Assess answer quality
                confidence = self._assess_answer_quality(final_answer, question, agent_results, question_type)
                
                return {
                    "answer": final_answer,
                    "confidence": confidence,
                    "reasoning": f"Extracted from {question_type} analysis using 72B model",
                    "raw_response": raw_answer,
                    "validation_passed": len(final_answer) <= 100 and len(final_answer) > 0
                }
            else:
                # Fallback to simple extraction
                return self._fallback_extraction(question, agent_results)
                
        except Exception as e:
            logger.error(f"Final answer extraction failed: {e}")
            return self._fallback_extraction(question, agent_results)
    
    def _create_extraction_prompt(self, question: str, agent_results: str, question_type: str) -> str:
        """Create specialized extraction prompt based on question type"""
        
        base_instructions = """
        CRITICAL: Extract the exact answer for GAIA benchmark evaluation.
        Your response must be ONLY the answer - no explanations, no prefixes, no extra text.
        
        Question: {question}
        
        Analysis from agents:
        {agent_results}
        
        """
        
        # Specialized instructions based on question type
        if question_type == "mathematical" or "how many" in question.lower():
            type_instructions = """
        This is a counting/mathematical question. Respond with ONLY the number.
        Examples of correct responses: "5", "42", "0"
        Do NOT include words like "albums", "songs", "items", etc.
        """
        
        elif question_type == "yes_no":
            type_instructions = """
        This is a yes/no question. Respond with ONLY "yes" or "no".
        """
        
        elif question_type == "name" or any(word in question.lower() for word in ["who", "name"]):
            type_instructions = """
        This is asking for a name. Respond with ONLY the name requested.
        Examples: "John Smith", "Mike102", "Einstein"
        """
        
        elif question_type == "location":
            type_instructions = """
        This is asking for a location. Respond with ONLY the location name.
        Examples: "Paris", "New York", "LIE", "Hanoi"
        """
        
        elif question_type == "text_manipulation":
            type_instructions = """
        This involves text manipulation. Respond with ONLY the processed text result.
        Examples: "right", "hello", "12345"
        """
        
        else:
            type_instructions = """
        Respond with ONLY the direct answer requested.
        Keep it concise and specific.
        """
        
        ending_instructions = """
        
        EXTRACT ONLY THE ANSWER:"""
        
        return base_instructions.format(
            question=question,
            agent_results=agent_results[:2000]  # Limit input length
        ) + type_instructions + ending_instructions
    
    def _clean_and_validate_answer(self, raw_answer: str, question: str, question_type: str) -> str:
        """Clean and validate the extracted answer"""
        
        # Remove common prefixes and suffixes
        answer = raw_answer.strip()
        
        # Remove common answer prefixes
        prefixes_to_remove = [
            "final answer:", "answer:", "the answer is:", "result:", "conclusion:",
            "based on", "according to", "therefore", "thus", "so", "hence",
            "final answer is", "the result is", "it is", "this is"
        ]
        
        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            # Strip a prefix only at a word boundary, so that e.g. "so" does
            # not eat the start of an answer like "south africa"
            if answer_lower.startswith(prefix) and (
                len(answer) == len(prefix) or not answer[len(prefix)].isalnum()
            ):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()
        
        # Remove quotes if they wrap the entire answer
        if len(answer) >= 2 and answer[0] == answer[-1] and answer[0] in '"\'':
            answer = answer[1:-1]
        
        # Remove trailing punctuation that's not part of the answer
        while answer and answer[-1] in '.!?:;':
            answer = answer[:-1]
        
        # Special handling for different question types
        if question_type == "mathematical" or "how many" in question.lower():
            # Extract just the number
            numbers = re.findall(r'\b\d+\b', answer)
            if numbers:
                answer = numbers[0]
        
        elif question_type == "yes_no":
            # Normalize yes/no answers
            if any(word in answer.lower() for word in ['yes', 'true', 'correct', 'right']):
                answer = "yes"
            elif any(word in answer.lower() for word in ['no', 'false', 'incorrect', 'wrong']):
                answer = "no"
        
        # Final cleanup
        answer = answer.strip()
        
        # Ensure answer is not empty
        if not answer:
            # Try to extract from the original raw answer
            words = raw_answer.split()
            if words:
                answer = words[-1]  # Take the last word as fallback
        
        return answer
    
    def _assess_answer_quality(self, answer: str, question: str, agent_results: str, question_type: str) -> float:
        """Assess the quality/confidence of the extracted answer"""
        
        confidence = 0.7  # Base confidence
        
        # Factor 1: Answer length appropriateness
        if len(answer) == 0:
            return 0.1  # Very low confidence for empty answers
        elif len(answer) > 100:
            confidence -= 0.2  # Too long for GAIA
        elif 1 <= len(answer) <= 50:
            confidence += 0.1  # Good length
        
        # Factor 2: Question type matching
        question_lower = question.lower()
        
        if ("how many" in question_lower or question_type == "mathematical") and re.match(r'^\d+$', answer):
            confidence += 0.15  # Numeric answer to counting question
        elif ("who" in question_lower or "name" in question_lower) and len(answer.split()) <= 3:
            confidence += 0.1   # Name-like answer to who question
        elif ("where" in question_lower) and len(answer.split()) <= 2:
            confidence += 0.1   # Location-like answer
        elif ("yes or no" in question_lower) and answer.lower() in ["yes", "no"]:
            confidence += 0.15  # Perfect yes/no answer
        
        # Factor 3: Answer appears in agent results (indicates it was found)
        if answer.lower() in agent_results.lower():
            confidence += 0.1
        
        # Factor 4: Answer specificity
        if re.search(r'\b\d{4}\b', answer):  # Contains a 4-digit year
            confidence += 0.05
        if re.search(r'\b[A-Z][a-z]+\b', answer):  # Capitalized word (proper-noun heuristic)
            confidence += 0.05
        
        # Factor 5: Common failure patterns
        failure_indicators = ['unknown', 'unclear', 'not found', 'unable to determine', 'no information']
        if any(indicator in answer.lower() for indicator in failure_indicators):
            confidence -= 0.3
        
        return max(0.1, min(0.95, confidence))
    
    def _fallback_extraction(self, question: str, agent_results: str) -> Dict[str, Any]:
        """Simple fallback when LLM extraction fails"""
        
        # Try to extract a reasonable answer from agent results
        lines = agent_results.split('\n')
        
        # Look for lines that might contain answers
        potential_answers = []
        for line in lines:
            line = line.strip()
            if 0 < len(line) < 100:
                # Skip lines that are clearly explanatory
                if not any(word in line.lower() for word in ['according', 'based on', 'however', 'therefore', 'because']):
                    potential_answers.append(line)
        
        # Use the first reasonable answer or a fallback
        answer = potential_answers[0] if potential_answers else "Unable to determine"
        
        return {
            "answer": answer,
            "confidence": 0.3,
            "reasoning": "Fallback extraction due to LLM failure",
            "raw_response": agent_results[:100],
            "validation_passed": False
        }
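

if __name__ == "__main__":
    # Minimal usage sketch. The real models.qwen_client API is not shown in
    # this file, so a hypothetical stub stands in for QwenClient; it only
    # mimics the interface this tool relies on: generate() returning an
    # object with .success and .response attributes.
    from types import SimpleNamespace

    class _StubClient:
        def generate(self, prompt, tier=None, max_tokens=None):
            # Pretend the model returned a noisy answer that needs cleaning
            return SimpleNamespace(success=True, response="The answer is: 42.")

    tool = FinalAnswerTool(_StubClient())  # stub in place of a real QwenClient
    result = tool.extract_final_answer(
        question="How many studio albums did the band release?",
        agent_results="Research notes: the band released 42 studio albums.",
        question_type="mathematical",
    )
    print(result["answer"])      # -> "42" after prefix/punctuation cleanup
    print(result["confidence"])  # -> heuristic score clamped to [0.1, 0.95]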