Snaseem2026

Upload folder using huggingface_hub

7762e8f verified 2 months ago

6.5 kB

	"""
	Evaluation script for trained model with comprehensive analysis
	"""
	import argparse
	import sys
	import os
	import numpy as np
	import pandas as pd
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

	# Add parent directory to path
	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

	from src import (
	load_config,
	compute_metrics_factory,
	plot_confusion_matrix,
	print_classification_report
	)
	from src.data_loader import prepare_datasets_for_training


	def analyze_errors(
	    test_dataset,
	    predictions: np.ndarray,
	    labels: np.ndarray,
	    id2label: dict,
	    tokenizer,
	    top_n: int = 10
	) -> pd.DataFrame:
	    """
	    Collect the misclassified examples and summarise them by error type.

	    Args:
	        test_dataset: Test dataset (accepted for interface compatibility; not read here)
	        predictions: Predicted label ids, one per example
	        labels: True label ids, aligned with ``predictions``
	        id2label: Mapping from label id to label name
	        tokenizer: Tokenizer (accepted for interface compatibility; not read here)
	        top_n: Accepted for interface compatibility; currently unused

	    Returns:
	        DataFrame with one row per misclassified example and the columns
	        ``index``, ``true_label``, ``predicted_label`` and ``error_type``.
	        When there are no errors the frame is empty but still carries
	        these columns, so callers can select them unconditionally.
	    """
	    columns = ['index', 'true_label', 'predicted_label', 'error_type']

	    # Only positions and label names are recorded: the original text is not
	    # available at this point, so nothing is decoded with the tokenizer.
	    errors = [
	        {
	            'index': i,
	            'true_label': id2label[true_label],
	            'predicted_label': id2label[pred],
	            'error_type': f"{id2label[true_label]} -> {id2label[pred]}",
	        }
	        for i, (pred, true_label) in enumerate(zip(predictions, labels))
	        if pred != true_label
	    ]

	    # Passing the column list keeps the schema stable for an empty result.
	    error_df = pd.DataFrame(errors, columns=columns)
	    if not error_df.empty:
	        print("\nError Analysis:")
	        print(f"Total errors: {len(error_df)}")
	        print("\nError type distribution:")
	        print(error_df['error_type'].value_counts())

	    return error_df


	def evaluate_model(
	    model_path: str,
	    config_path: str = "config.yaml",
	    save_plots: bool = True
	):
	    """
	    Run a trained model over the test split and report the results.

	    Prints overall and per-class metrics followed by a classification
	    report. When ``save_plots`` is true it also requests two confusion
	    matrix plots (raw and normalized) and writes the error analysis to a
	    CSV file, all under the configured output directory.

	    Args:
	        model_path: Location given to ``from_pretrained`` for the tokenizer and the model
	        config_path: Configuration file used for settings and dataset preparation
	        save_plots: When False, the plots and the error-analysis CSV are skipped
	    """
	    banner = "=" * 60
	    print(banner)
	    print("Model Evaluation")
	    print(banner)

	    # Settings, and the directory that receives every artifact
	    settings = load_config(config_path)
	    output_dir = settings['training'].get('output_dir', './results')
	    os.makedirs(output_dir, exist_ok=True)

	    # Step 1: data
	    print("\n[1/5] Loading datasets...")
	    dataset_splits, _, id2label, _ = prepare_datasets_for_training(config_path)
	    test_dataset = dataset_splits['test']
	    print(f"✓ Test samples: {len(test_dataset)}")

	    # Step 2: model and tokenizer
	    print("\n[2/5] Loading trained model...")
	    tokenizer = AutoTokenizer.from_pretrained(model_path)
	    model = AutoModelForSequenceClassification.from_pretrained(model_path)
	    print(f"✓ Model loaded from {model_path}")

	    # Step 3: inference through a Trainer used purely for prediction
	    print("\n[3/5] Running evaluation...")
	    evaluator = Trainer(
	        model=model,
	        tokenizer=tokenizer,
	        compute_metrics=compute_metrics_factory(id2label)
	    )
	    prediction_result = evaluator.predict(test_dataset)
	    predicted_ids = np.argmax(prediction_result.predictions, axis=1)
	    true_ids = prediction_result.label_ids

	    # Step 4: metric report
	    print("\n[4/5] Computing detailed metrics...")
	    print("\n" + banner)
	    print("Test Set Results")
	    print(banner)

	    metrics = prediction_result.metrics

	    # Aggregate scores; only the ones present in the metrics dict are shown
	    print("\nOverall Metrics:")
	    for metric in ('accuracy', 'f1_weighted', 'f1_macro', 'precision_weighted', 'recall_weighted'):
	        key = f'test_{metric}'
	        if key in metrics:
	            print(f" {metric.replace('_', ' ').title()}: {metrics[key]:.4f}")

	    # Per-class scores, in label-id order
	    print("\nPer-Class Metrics:")
	    class_names = [id2label[class_id] for class_id in range(len(id2label))]
	    for label_name in class_names:
	        precision_key = f'test_precision_{label_name}'
	        if precision_key not in metrics:
	            continue
	        recall_key = f'test_recall_{label_name}'
	        f1_key = f'test_f1_{label_name}'
	        print(f"\n {label_name.upper()}:")
	        print(f" Precision: {metrics[precision_key]:.4f}")
	        print(f" Recall: {metrics[recall_key]:.4f}")
	        print(f" F1-Score: {metrics[f1_key]:.4f}")
	        print(f" Support: {metrics.get(f'test_support_{label_name}', 'N/A')}")

	    # Full classification report
	    print("\n" + banner)
	    print_classification_report(true_ids, predicted_ids, class_names)

	    # Step 5: plots, raw counts first and then the normalized variant
	    print("\n[5/5] Generating visualizations...")
	    if save_plots:
	        plot_variants = (
	            ("confusion_matrix.png", False),
	            ("confusion_matrix_normalized.png", True),
	        )
	        for file_name, use_normalized in plot_variants:
	            plot_confusion_matrix(
	                true_ids,
	                predicted_ids,
	                class_names,
	                save_path=os.path.join(output_dir, file_name),
	                normalize=use_normalized
	            )

	    # Misclassification summary; written to disk only when saving is enabled
	    mistakes = analyze_errors(test_dataset, predicted_ids, true_ids, id2label, tokenizer)
	    if save_plots and len(mistakes) > 0:
	        error_path = os.path.join(output_dir, "error_analysis.csv")
	        mistakes.to_csv(error_path, index=False)
	        print(f"✓ Error analysis saved to {error_path}")

	    print("\n" + banner)
	    print("Evaluation Complete! 🎉")
	    print(banner)
	    print(f"\nResults saved to: {output_dir}")


	if __name__ == "__main__":
	    # Command-line entry point: parse the options and run the evaluation.
	    cli = argparse.ArgumentParser(description="Evaluate trained model")

	    # String-valued options share the same shape, so register them in one loop.
	    for option, fallback, description in (
	        ("--model-path", "./results/final_model", "Path to the trained model"),
	        ("--config", "config.yaml", "Path to configuration file"),
	    ):
	        cli.add_argument(option, type=str, default=fallback, help=description)

	    # Boolean switch: present means "do not save plots".
	    cli.add_argument(
	        "--no-plots",
	        help="Skip generating visualization plots",
	        action="store_true"
	    )

	    options = cli.parse_args()
	    evaluate_model(
	        options.model_path,
	        options.config,
	        save_plots=not options.no_plots
	    )