"""Evaluation Module for RAG System using DeepEval - Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval - Integrates with local Ollama instance for fast, offline evaluation - Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed) """ from typing import List, Dict from deepeval.models import OllamaModel from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric from deepeval.test_case import LLMTestCase from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME from logger import get_logger log = get_logger(name="core_evaluation_deepeval") class RAGEvaluator: """Evaluates RAG responses using reference-free DeepEval metrics with Ollama backend. Uses LLM-as-a-Judge approach for accurate evaluation WITHOUT requiring ground truth. All metrics are reference-free (do NOT require ground truth). Metrics (Reference-Free - No Ground Truth Needed): - answer_relevancy: How relevant the answer is to the question (0-1) - faithfulness: How well the answer is grounded in retrieved documents (0-1) """ def __init__( self, llm_model: str = LLM_CHAT_MODEL_NAME, ollama_base_url: str = OLLAMA_BASE_URL, temperature: float = 0.0, ): """Initialize RAGEvaluator with Ollama backend. Args: llm_model: Name of the Ollama model to use (e.g., "gemma3:latest") ollama_base_url: Base URL of Ollama server temperature: Model temperature for evaluation (0 = deterministic) """ self.llm_model = llm_model self.ollama_base_url = ollama_base_url self.temperature = temperature log.info(f"Initializing RAGEvaluator with DeepEval + Ollama") log.info(f" Model: {llm_model}") log.info(f" Ollama URL: {ollama_base_url}") try: # Initialize Ollama model self.model = OllamaModel( model=llm_model, base_url=ollama_base_url, temperature=temperature ) # Initialize metrics (all reference-free, no ground truth needed) self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model) self.faithfulness_metric = FaithfulnessMetric(model=self.model) log.info("✅ RAGEvaluator initialized successfully with reference-free DeepEval metrics") except Exception as e: log.error(f"❌ Failed to initialize RAGEvaluator: {e}") raise def evaluate_response( self, question: str, answer: str, contexts: List[str], ) -> Dict[str, float]: """Evaluate a single RAG response using reference-free DeepEval metrics. NOTE: No ground truth needed - all metrics are reference-free. 
        Args:
            question: The user's question
            answer: The generated answer from RAG
            contexts: List of retrieved context chunks

        Returns:
            Dictionary with metric names and scores (0-1 range)
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")

            # Create test case for DeepEval
            # All metrics are reference-free (no ground truth required)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # For context-based metrics
            )

            scores = {}

            # Evaluate Answer Relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0

            # Evaluate Faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0

            log.info(f"✅ Evaluation complete: {scores}")
            return scores

        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }

    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.

        NOTE: No ground truth needed - all metrics are reference-free.

        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)

        Returns:
            Dictionary with metric names and lists of scores
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")

            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }

            for i, (question, answer, contexts) in enumerate(
                zip(questions, answers, contexts_list)
            ):
                log.info(f"Evaluating batch item {i + 1}/{len(questions)}")
                scores = self.evaluate_response(question, answer, contexts)
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))

            log.info("✅ Batch evaluation complete")
            return all_scores

        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }


def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with the DeepEval backend."""
    return RAGEvaluator()
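

# Illustrative usage sketch (not part of the original module): shows how
# evaluate_response might be called on a single question/answer pair.
# Assumes a local Ollama server is reachable at OLLAMA_BASE_URL with the
# LLM_CHAT_MODEL_NAME model pulled; the question, answer, and context strings
# below are made-up examples for demonstration only.
if __name__ == "__main__":
    evaluator = create_evaluator()
    sample_scores = evaluator.evaluate_response(
        question="What is retrieval-augmented generation?",
        answer=(
            "RAG combines a retriever that fetches relevant documents with an "
            "LLM that generates answers grounded in those documents."
        ),
        contexts=[
            "Retrieval-augmented generation (RAG) retrieves documents from a "
            "knowledge base and passes them to an LLM as context.",
            "Grounding answers in retrieved context reduces hallucinations.",
        ],
    )
    # Example output shape: {"answer_relevancy": 0.9, "faithfulness": 0.85}
    print(sample_scores)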