"""Evaluation Module for RAG System using DeepEval

- Provides evaluation metrics using an LLM-as-a-Judge approach via DeepEval
- Integrates with a local Ollama instance for fast, offline evaluation
- Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed)
"""

from typing import List, Dict

from deepeval.models import OllamaModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME
from logger import get_logger

log = get_logger(name="core_evaluation_deepeval")


class RAGEvaluator:
    """Evaluates RAG responses using reference-free DeepEval metrics with an Ollama backend.

    Uses an LLM-as-a-Judge approach for evaluation; all metrics are reference-free,
    so no ground truth answers are required.

    Metrics (reference-free, no ground truth needed):
        - answer_relevancy: How relevant the answer is to the question (0-1)
        - faithfulness: How well the answer is grounded in the retrieved documents (0-1)
    """

    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with an Ollama backend.

        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of the Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature

        log.info("Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f"  Model: {llm_model}")
        log.info(f"  Ollama URL: {ollama_base_url}")

        try:
            # LLM-as-a-Judge model served by the local Ollama instance
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature,
            )

            # Reference-free metrics: both score against the question and the
            # retrieved context only, so no ground truth answers are needed
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)

            log.info("✅ RAGEvaluator initialized successfully with reference-free DeepEval metrics")

        except Exception as e:
            log.error(f"❌ Failed to initialize RAGEvaluator: {e}")
            raise

    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.

        NOTE: No ground truth is needed; all metrics are reference-free.

        Args:
            question: The user's question
            answer: The generated answer from the RAG pipeline
            contexts: List of retrieved context chunks

        Returns:
            Dictionary mapping metric names to scores in the 0-1 range
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")

            # Build a DeepEval test case from the RAG inputs and output
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,
            )

            scores = {}

            # Answer relevancy: how relevant the answer is to the question
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0

            # Faithfulness: how well the answer is grounded in the retrieved context
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0

            log.info(f"✅ Evaluation complete: {scores}")
            return scores

        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }

    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.

        NOTE: No ground truth is needed; all metrics are reference-free.

        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)

        Returns:
            Dictionary mapping metric names to lists of scores (one per question)
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")

            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }

            # Evaluate each (question, answer, contexts) triple independently
            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i + 1}/{len(questions)}")

                scores = self.evaluate_response(question, answer, contexts)

                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))

            log.info("✅ Batch evaluation complete")
            return all_scores

        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }


def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with DeepEval backend."""
    return RAGEvaluator()
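

# Illustrative usage sketch, not part of the module's original API: it shows how the
# evaluator defined above might be driven end to end. It assumes an Ollama server is
# reachable at OLLAMA_BASE_URL and that LLM_CHAT_MODEL_NAME has already been pulled;
# the question, answer, and context strings below are made-up placeholder data.
if __name__ == "__main__":
    evaluator = create_evaluator()

    # Single-response evaluation: reference-free, so only the question, the
    # generated answer, and the retrieved chunks are supplied.
    example_scores = evaluator.evaluate_response(
        question="What does the faithfulness metric measure?",
        answer="It measures how well the answer is grounded in the retrieved documents.",
        contexts=[
            "Faithfulness checks whether the claims made in the answer are "
            "supported by the retrieved context.",
        ],
    )
    log.info(f"Example scores: {example_scores}")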