#!/usr/bin/env python3
"""
Script to extract data from JSON files in a repository folder and save it
as a CSV file for import into the benchmark.
"""

import pandas as pd
import json
import os
import sys
import argparse
from pathlib import Path


def is_valid_json_file(file_path):
    """
    Check if a file is a valid JSON file containing a dict.

    Args:
        file_path (str): Path to the JSON file

    Returns:
        bool: True if valid JSON dict, False otherwise
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return isinstance(data, dict)
    except (json.JSONDecodeError, FileNotFoundError, UnicodeDecodeError):
        return False


def find_json_files(repo_path):
    """
    Recursively find all JSON files in the repository folder.

    Args:
        repo_path (str): Path to the repository folder

    Returns:
        list: List of paths to valid JSON files
    """
    json_files = []
    repo_path = Path(repo_path)

    if not repo_path.exists():
        print(f"Error: Repository path '{repo_path}' does not exist.")
        return []

    if not repo_path.is_dir():
        print(f"Error: Repository path '{repo_path}' is not a directory.")
        return []

    print(f"Scanning repository: {repo_path}")

    for file_path in repo_path.rglob("*.json"):
        if is_valid_json_file(file_path):
            json_files.append(file_path)
            print(f"Found valid JSON file: {file_path}")

    print(f"Total valid JSON files found: {len(json_files)}")
    return json_files


def extract_data_from_json(json_file_path):
    """
    Extract data from a single JSON file.

    Args:
        json_file_path (Path): Path to the JSON file

    Returns:
        dict or None: Extracted data or None if extraction failed
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Check if required fields exist
        if 'config_general' not in data or 'results' not in data:
            return None

        config_general = data['config_general']
        results = data['results']

        # Extract model information
        model_name = config_general.get('model_name', '')
        model_private = config_general.get('model_private', False)
        model_num_parameters = config_general.get('model_num_parameters', 0)

        # Extract results
        all_grouped = results.get('all_grouped', {})

        # Extract metrics
        assin2_rte = all_grouped.get('assin2_rte', 0.0)
        assin2_sts = all_grouped.get('assin2_sts', 0.0)
        faquad_nli = all_grouped.get('faquad_nli', 0.0)
        hatebr_offensive = all_grouped.get('hatebr_offensive', 0.0)

        # Create row data
        row_data = {
            'json_file': str(json_file_path),
            'model_name': model_name,
            'model_private': model_private,
            'model_num_parameters': model_num_parameters,
            'assin2_rte': assin2_rte,
            'assin2_sts': assin2_sts,
            'faquad_nli': faquad_nli,
            'hatebr_offensive': hatebr_offensive
        }

        return row_data

    except Exception as e:
        print(f"Error processing {json_file_path}: {e}")
        return None


def extract_portuguese_leaderboard(repo_path):
    """
    Extract data from JSON files in the repository folder and save as CSV.
    Args:
        repo_path (str): Path to the repository folder
    """
    print("Scanning repository for JSON files...")

    # Find all JSON files
    json_files = find_json_files(repo_path)

    if not json_files:
        print("No valid JSON files found in the repository.")
        return

    # Prepare data for DataFrame
    data = []

    # Process each JSON file
    for i, json_file in enumerate(json_files):
        print(f"Processing file {i+1}/{len(json_files)}: {json_file.name}")

        row_data = extract_data_from_json(json_file)
        if row_data:
            data.append(row_data)

        # Print progress every 10 files
        if (i + 1) % 10 == 0:
            print(f"  Processed {i + 1} files...")

    if not data:
        print("No valid data extracted from JSON files.")
        return

    # Create DataFrame
    df = pd.DataFrame(data)

    # Write to CSV
    output_file = 'portuguese_leaderboard.csv'
    df.to_csv(output_file, index=False)

    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

    # Show first few entries as preview
    print("\nFirst 5 entries:")
    print(df.head().to_string(index=False))

    # Show some statistics
    if not df.empty:
        print("\nStatistics:")
        print(f"Total models: {len(df)}")
        print(f"Private models: {df['model_private'].sum()}")
        print(f"Public models: {(~df['model_private']).sum()}")

        # Average scores
        print("\nAverage scores:")
        print(df[['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']].mean().round(2))

        # Show data types and info
        print("\nDataFrame info:")
        # df.info() already writes to stdout and returns None, so don't wrap it in print()
        df.info()


def main():
    """Main function to run the extraction."""
    parser = argparse.ArgumentParser(
        description='Extract Portuguese LLM Leaderboard data from JSON files'
    )
    parser.add_argument('repo_path', help='Path to the repository folder containing JSON files')

    args = parser.parse_args()

    print("Portuguese LLM Leaderboard Data Extractor")
    print("=" * 50)

    try:
        extract_portuguese_leaderboard(args.repo_path)
        print("\nExtraction completed successfully!")
    except Exception as e:
        print(f"Error during extraction: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
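
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; the script filename and repository path below
# are placeholders, not names defined by this project):
#
#   python extract_leaderboard_data.py /path/to/results-repo
#
# Each result file is expected to be a JSON dict shaped roughly like the
# minimal example below, based on the keys read in extract_data_from_json().
# The metric values are made-up placeholders; only the key names come from
# the code above.
#
#   {
#     "config_general": {
#       "model_name": "org/model",
#       "model_private": false,
#       "model_num_parameters": 7000000000
#     },
#     "results": {
#       "all_grouped": {
#         "assin2_rte": 0.91,
#         "assin2_sts": 0.78,
#         "faquad_nli": 0.75,
#         "hatebr_offensive": 0.85
#       }
#     }
#   }
# ---------------------------------------------------------------------------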