"""Evaluation Module for RAG System using DeepEval
- Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval
- Integrates with local Ollama instance for fast, offline evaluation
- Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed)
"""
from typing import List, Dict
from deepeval.models import OllamaModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME
from logger import get_logger
log = get_logger(name="core_evaluation_deepeval")


class RAGEvaluator:
    """Evaluates RAG responses using reference-free DeepEval metrics with an Ollama backend.

    Uses an LLM-as-a-Judge approach; all metrics are reference-free, so no ground
    truth is required.

    Metrics:
    - answer_relevancy: How relevant the answer is to the question (0-1)
    - faithfulness: How well the answer is grounded in the retrieved documents (0-1)
    """
    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with an Ollama backend.

        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of the Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature

        log.info("Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f" Model: {llm_model}")
        log.info(f" Ollama URL: {ollama_base_url}")

        try:
            # Initialize Ollama model
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature,
            )

            # Initialize metrics (all reference-free, no ground truth needed)
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)

            log.info("βœ… RAGEvaluator initialized successfully with reference-free DeepEval metrics")
        except Exception as e:
            log.error(f"❌ Failed to initialize RAGEvaluator: {e}")
            raise
    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.

        NOTE: No ground truth needed - all metrics are reference-free.

        Args:
            question: The user's question
            answer: The generated answer from RAG
            contexts: List of retrieved context chunks

        Returns:
            Dictionary with metric names and scores (0-1 range)
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")

            # Create test case for DeepEval
            # All metrics are reference-free (no ground truth required)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # For context-based metrics
            )

            scores = {}

            # Evaluate Answer Relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f" Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0

            # Evaluate Faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f" Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0

            log.info(f"βœ… Evaluation complete: {scores}")
            return scores
        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }
    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.

        NOTE: No ground truth needed - all metrics are reference-free.

        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)

        Returns:
            Dictionary with metric names and lists of scores
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")

            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }

            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i + 1}/{len(questions)}")
                scores = self.evaluate_response(question, answer, contexts)
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))

            log.info("βœ… Batch evaluation complete")
            return all_scores
        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }


def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with DeepEval backend."""
    return RAGEvaluator()
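

# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of how this module might be exercised, assuming a local
# Ollama server is reachable at OLLAMA_BASE_URL and the configured chat model
# has already been pulled. The question/answer/contexts below are made-up
# sample data, not part of the application.
if __name__ == "__main__":
    evaluator = create_evaluator()

    # Single-response evaluation (reference-free: no ground truth passed)
    sample_scores = evaluator.evaluate_response(
        question="What is retrieval-augmented generation?",
        answer="RAG augments an LLM prompt with documents retrieved from a knowledge base.",
        contexts=[
            "Retrieval-augmented generation (RAG) retrieves relevant documents "
            "and adds them to the prompt so the model can ground its answer."
        ],
    )
    log.info(f"Sample scores: {sample_scores}")

    # Batch evaluation: one context list per question
    batch_scores = evaluator.evaluate_batch(
        questions=["What is RAG?"],
        answers=["RAG combines retrieval with generation."],
        contexts_list=[["RAG pipelines retrieve documents and pass them to the LLM."]],
    )
    log.info(f"Sample batch scores: {batch_scores}")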