Spaces:
Sleeping
Sleeping
Upload 14 files
Browse files- app.py +458 -41
- external_models.csv +31 -31
- extract_portuguese_leaderboard.py +2 -0
- portuguese_leaderboard.csv +0 -0
app.py
CHANGED
|
@@ -24,14 +24,14 @@ def load_portuguese_leaderboard_data() -> pd.DataFrame:
|
|
| 24 |
if os.path.exists(csv_path):
|
| 25 |
df = pd.read_csv(csv_path)
|
| 26 |
# Select only the relevant columns
|
| 27 |
-
relevant_columns = ['model_name', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
|
| 28 |
df = df[relevant_columns].copy()
|
| 29 |
|
| 30 |
# Rename columns to match the existing format
|
| 31 |
df = df.rename(columns={
|
| 32 |
'assin2_rte': 'ASSIN2 RTE',
|
| 33 |
'assin2_sts': 'ASSIN2 STS',
|
| 34 |
-
'faquad_nli': '
|
| 35 |
'hatebr_offensive': 'HateBR'
|
| 36 |
})
|
| 37 |
|
|
@@ -62,13 +62,16 @@ def load_external_models_data() -> pd.DataFrame:
|
|
| 62 |
'model': 'model_name',
|
| 63 |
'assin2_rte': 'ASSIN2 RTE',
|
| 64 |
'assin2_sts': 'ASSIN2 STS',
|
| 65 |
-
'faquad_nli': '
|
| 66 |
'hatebr_offensive': 'HateBR'
|
| 67 |
})
|
| 68 |
|
| 69 |
# Add source information
|
| 70 |
df['source'] = 'external_models'
|
| 71 |
|
|
|
|
|
|
|
|
|
|
| 72 |
print(f"Loaded {len(df)} external models")
|
| 73 |
return df
|
| 74 |
else:
|
|
@@ -84,7 +87,7 @@ PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()
|
|
| 84 |
# Load external models data
|
| 85 |
EXTERNAL_MODELS_DATA = load_external_models_data()
|
| 86 |
|
| 87 |
-
def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> pd.DataFrame:
|
| 88 |
"""Create a simplified benchmark table with one column per dataset."""
|
| 89 |
# Get all dataset names
|
| 90 |
dataset_names = sorted(NAPOLAB_DATASETS.keys())
|
|
@@ -120,14 +123,15 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
| 120 |
model_data[model_name] = {
|
| 121 |
'dataset_scores': {},
|
| 122 |
'url': None,
|
| 123 |
-
'source': 'portuguese_leaderboard'
|
|
|
|
| 124 |
}
|
| 125 |
|
| 126 |
# Map Portuguese leaderboard columns to dataset names
|
| 127 |
column_mapping = {
|
| 128 |
'ASSIN2 RTE': 'assin2_rte',
|
| 129 |
'ASSIN2 STS': 'assin2_sts',
|
| 130 |
-
'
|
| 131 |
'HateBR': 'hatebr'
|
| 132 |
}
|
| 133 |
|
|
@@ -146,14 +150,15 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
| 146 |
model_data[model_name] = {
|
| 147 |
'dataset_scores': {},
|
| 148 |
'url': row.get('link', ''),
|
| 149 |
-
'source': 'external_models'
|
|
|
|
| 150 |
}
|
| 151 |
|
| 152 |
# Map external models columns to dataset names
|
| 153 |
column_mapping = {
|
| 154 |
'ASSIN2 RTE': 'assin2_rte',
|
| 155 |
'ASSIN2 STS': 'assin2_sts',
|
| 156 |
-
'
|
| 157 |
'HateBR': 'hatebr'
|
| 158 |
}
|
| 159 |
|
|
@@ -177,6 +182,9 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
| 177 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
| 178 |
source = model_metadata.get('source', 'unknown')
|
| 179 |
model_data[model_name]['source'] = source
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
# Create table data
|
| 182 |
table_data = []
|
|
@@ -198,6 +206,12 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
| 198 |
if source == 'unknown':
|
| 199 |
continue
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
# Create clickable link for model name
|
| 202 |
if data['url']:
|
| 203 |
model_display = f"[{model_name}]({data['url']})"
|
|
@@ -394,7 +408,7 @@ def cleanup_current_csv():
|
|
| 394 |
print(f"Error deleting file {current_csv_file}: {e}")
|
| 395 |
|
| 396 |
|
| 397 |
-
def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> go.Figure:
|
| 398 |
"""Create a radar chart showing model performance across all datasets."""
|
| 399 |
# Use selected datasets if provided, otherwise use all datasets
|
| 400 |
if selected_datasets is None:
|
|
@@ -431,14 +445,15 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
| 431 |
model_data[model_name] = {
|
| 432 |
'performances': {},
|
| 433 |
'architecture': 'Unknown',
|
| 434 |
-
'source': 'portuguese_leaderboard'
|
|
|
|
| 435 |
}
|
| 436 |
|
| 437 |
# Map Portuguese leaderboard columns to dataset names
|
| 438 |
column_mapping = {
|
| 439 |
'ASSIN2 RTE': 'assin2_rte',
|
| 440 |
'ASSIN2 STS': 'assin2_sts',
|
| 441 |
-
'
|
| 442 |
'HateBR': 'hatebr'
|
| 443 |
}
|
| 444 |
|
|
@@ -457,14 +472,15 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
| 457 |
model_data[model_name] = {
|
| 458 |
'performances': {},
|
| 459 |
'architecture': 'Unknown',
|
| 460 |
-
'source': 'external_models'
|
|
|
|
| 461 |
}
|
| 462 |
|
| 463 |
# Map external models columns to dataset names
|
| 464 |
column_mapping = {
|
| 465 |
'ASSIN2 RTE': 'assin2_rte',
|
| 466 |
'ASSIN2 STS': 'assin2_sts',
|
| 467 |
-
'
|
| 468 |
'HateBR': 'hatebr'
|
| 469 |
}
|
| 470 |
|
|
@@ -488,6 +504,9 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
| 488 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
| 489 |
source = model_metadata.get('source', 'unknown')
|
| 490 |
model_data[model_name]['source'] = source
|
|
|
|
|
|
|
|
|
|
| 491 |
|
| 492 |
# Apply source filtering
|
| 493 |
filtered_model_data = {}
|
|
@@ -507,6 +526,12 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
| 507 |
if source == 'unknown':
|
| 508 |
continue
|
| 509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
filtered_model_data[model_name] = data
|
| 511 |
|
| 512 |
# Apply incomplete model filtering
|
|
@@ -731,8 +756,8 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 731 |
dataset_checkboxes = []
|
| 732 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
| 733 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
| 734 |
-
# Default to selected only for ASSIN 2 STS,
|
| 735 |
-
default_value =
|
| 736 |
checkbox = gr.Checkbox(
|
| 737 |
label=display_name,
|
| 738 |
value=default_value
|
|
@@ -774,6 +799,22 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 774 |
value=True
|
| 775 |
)
|
| 776 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
# Search bar for filtering models
|
| 778 |
search_query = gr.Textbox(
|
| 779 |
label="Search models by name (supports regex)",
|
|
@@ -807,8 +848,8 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 807 |
analysis_dataset_checkboxes = []
|
| 808 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
| 809 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
| 810 |
-
# Default to selected only for ASSIN 2 STS,
|
| 811 |
-
default_value =
|
| 812 |
checkbox = gr.Checkbox(
|
| 813 |
label=display_name,
|
| 814 |
value=default_value
|
|
@@ -853,6 +894,18 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 853 |
value=True
|
| 854 |
)
|
| 855 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 856 |
# Search bar for filtering models in radar chart
|
| 857 |
search_query_analysis = gr.Textbox(
|
| 858 |
label="Search models by name (supports regex)",
|
|
@@ -863,6 +916,9 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 863 |
|
| 864 |
model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
|
| 865 |
|
|
|
|
|
|
|
|
|
|
| 866 |
gr.Markdown("""
|
| 867 |
**How to interact with the chart:**
|
| 868 |
- **Click on legend items** to show/hide specific models.
|
|
@@ -918,6 +974,272 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 918 |
|
| 919 |
""")
|
| 920 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 921 |
# Event handlers
|
| 922 |
def update_radar_chart(*args):
|
| 923 |
# Extract arguments for radar chart
|
|
@@ -929,6 +1251,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 929 |
show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
|
| 930 |
show_external_models = args[len(analysis_dataset_checkboxes) + 5]
|
| 931 |
search_query = args[len(analysis_dataset_checkboxes) + 6]
|
|
|
|
| 932 |
|
| 933 |
# Convert dataset selections to list of selected dataset names
|
| 934 |
selected_datasets = []
|
|
@@ -936,7 +1259,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 936 |
if dataset_values[i]:
|
| 937 |
selected_datasets.append(dataset_name)
|
| 938 |
|
| 939 |
-
return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)
|
| 940 |
|
| 941 |
def update_benchmark_table(*args):
|
| 942 |
# Extract arguments
|
|
@@ -948,6 +1271,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 948 |
show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
|
| 949 |
show_external_models = args[len(dataset_checkboxes) + 5]
|
| 950 |
search_query = args[len(dataset_checkboxes) + 6]
|
|
|
|
| 951 |
|
| 952 |
# Convert dataset selections to list of selected dataset names
|
| 953 |
selected_datasets = []
|
|
@@ -955,65 +1279,85 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 955 |
if dataset_values[i]:
|
| 956 |
selected_datasets.append(dataset_name)
|
| 957 |
|
| 958 |
-
df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)
|
| 959 |
|
| 960 |
return df
|
| 961 |
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
|
| 967 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 968 |
|
| 969 |
# Connect dataset checkboxes to update table
|
| 970 |
for dataset_name, checkbox in dataset_checkboxes:
|
| 971 |
checkbox.change(
|
| 972 |
update_benchmark_table,
|
| 973 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 974 |
outputs=benchmark_table
|
| 975 |
)
|
| 976 |
|
| 977 |
hide_incomplete_models.change(
|
| 978 |
update_benchmark_table,
|
| 979 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 980 |
outputs=benchmark_table
|
| 981 |
)
|
| 982 |
|
| 983 |
min_average_performance.change(
|
| 984 |
update_benchmark_table,
|
| 985 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 986 |
outputs=benchmark_table
|
| 987 |
)
|
| 988 |
|
| 989 |
show_napolab_thesis.change(
|
| 990 |
update_benchmark_table,
|
| 991 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 992 |
outputs=benchmark_table
|
| 993 |
)
|
| 994 |
|
| 995 |
show_teenytinyllama.change(
|
| 996 |
update_benchmark_table,
|
| 997 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 998 |
outputs=benchmark_table
|
| 999 |
)
|
| 1000 |
|
| 1001 |
show_portuguese_leaderboard.change(
|
| 1002 |
update_benchmark_table,
|
| 1003 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 1004 |
outputs=benchmark_table
|
| 1005 |
)
|
| 1006 |
|
| 1007 |
show_external_models.change(
|
| 1008 |
update_benchmark_table,
|
| 1009 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
| 1010 |
outputs=benchmark_table
|
| 1011 |
)
|
| 1012 |
|
| 1013 |
# Connect search query to update table
|
| 1014 |
search_query.change(
|
| 1015 |
update_benchmark_table,
|
| 1016 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1017 |
outputs=benchmark_table
|
| 1018 |
)
|
| 1019 |
|
|
@@ -1036,52 +1380,125 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
| 1036 |
for dataset_name, checkbox in analysis_dataset_checkboxes:
|
| 1037 |
checkbox.change(
|
| 1038 |
update_radar_chart,
|
| 1039 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1040 |
outputs=model_analysis_chart
|
| 1041 |
)
|
| 1042 |
|
| 1043 |
hide_incomplete_models_analysis.change(
|
| 1044 |
update_radar_chart,
|
| 1045 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1046 |
outputs=model_analysis_chart
|
| 1047 |
)
|
| 1048 |
|
| 1049 |
min_average_performance_analysis.change(
|
| 1050 |
update_radar_chart,
|
| 1051 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1052 |
outputs=model_analysis_chart
|
| 1053 |
)
|
| 1054 |
|
| 1055 |
show_napolab_thesis_analysis.change(
|
| 1056 |
update_radar_chart,
|
| 1057 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1058 |
outputs=model_analysis_chart
|
| 1059 |
)
|
| 1060 |
|
| 1061 |
show_teenytinyllama_analysis.change(
|
| 1062 |
update_radar_chart,
|
| 1063 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1064 |
outputs=model_analysis_chart
|
| 1065 |
)
|
| 1066 |
|
| 1067 |
show_portuguese_leaderboard_analysis.change(
|
| 1068 |
update_radar_chart,
|
| 1069 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1070 |
outputs=model_analysis_chart
|
| 1071 |
)
|
| 1072 |
|
| 1073 |
show_external_models_analysis.change(
|
| 1074 |
update_radar_chart,
|
| 1075 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1076 |
outputs=model_analysis_chart
|
| 1077 |
)
|
| 1078 |
|
| 1079 |
# Connect search query to update radar chart
|
| 1080 |
search_query_analysis.change(
|
| 1081 |
update_radar_chart,
|
| 1082 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
| 1083 |
outputs=model_analysis_chart
|
| 1084 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1085 |
|
| 1086 |
if __name__ == "__main__":
|
| 1087 |
app.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 24 |
if os.path.exists(csv_path):
|
| 25 |
df = pd.read_csv(csv_path)
|
| 26 |
# Select only the relevant columns
|
| 27 |
+
relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
|
| 28 |
df = df[relevant_columns].copy()
|
| 29 |
|
| 30 |
# Rename columns to match the existing format
|
| 31 |
df = df.rename(columns={
|
| 32 |
'assin2_rte': 'ASSIN2 RTE',
|
| 33 |
'assin2_sts': 'ASSIN2 STS',
|
| 34 |
+
'faquad_nli': 'FaQUaD-NLI',
|
| 35 |
'hatebr_offensive': 'HateBR'
|
| 36 |
})
|
| 37 |
|
|
|
|
| 62 |
'model': 'model_name',
|
| 63 |
'assin2_rte': 'ASSIN2 RTE',
|
| 64 |
'assin2_sts': 'ASSIN2 STS',
|
| 65 |
+
'faquad_nli': 'FaQUaD-NLI',
|
| 66 |
'hatebr_offensive': 'HateBR'
|
| 67 |
})
|
| 68 |
|
| 69 |
# Add source information
|
| 70 |
df['source'] = 'external_models'
|
| 71 |
|
| 72 |
+
# Add model_num_parameters column with 0 for external models
|
| 73 |
+
df['model_num_parameters'] = 0
|
| 74 |
+
|
| 75 |
print(f"Loaded {len(df)} external models")
|
| 76 |
return df
|
| 77 |
else:
|
|
|
|
| 87 |
# Load external models data
|
| 88 |
EXTERNAL_MODELS_DATA = load_external_models_data()
|
| 89 |
|
| 90 |
+
def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> pd.DataFrame:
|
| 91 |
"""Create a simplified benchmark table with one column per dataset."""
|
| 92 |
# Get all dataset names
|
| 93 |
dataset_names = sorted(NAPOLAB_DATASETS.keys())
|
|
|
|
| 123 |
model_data[model_name] = {
|
| 124 |
'dataset_scores': {},
|
| 125 |
'url': None,
|
| 126 |
+
'source': 'portuguese_leaderboard',
|
| 127 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
| 128 |
}
|
| 129 |
|
| 130 |
# Map Portuguese leaderboard columns to dataset names
|
| 131 |
column_mapping = {
|
| 132 |
'ASSIN2 RTE': 'assin2_rte',
|
| 133 |
'ASSIN2 STS': 'assin2_sts',
|
| 134 |
+
'FaQUaD-NLI': 'faquad-nli',
|
| 135 |
'HateBR': 'hatebr'
|
| 136 |
}
|
| 137 |
|
|
|
|
| 150 |
model_data[model_name] = {
|
| 151 |
'dataset_scores': {},
|
| 152 |
'url': row.get('link', ''),
|
| 153 |
+
'source': 'external_models',
|
| 154 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
| 155 |
}
|
| 156 |
|
| 157 |
# Map external models columns to dataset names
|
| 158 |
column_mapping = {
|
| 159 |
'ASSIN2 RTE': 'assin2_rte',
|
| 160 |
'ASSIN2 STS': 'assin2_sts',
|
| 161 |
+
'FaQUaD-NLI': 'faquad-nli',
|
| 162 |
'HateBR': 'hatebr'
|
| 163 |
}
|
| 164 |
|
|
|
|
| 182 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
| 183 |
source = model_metadata.get('source', 'unknown')
|
| 184 |
model_data[model_name]['source'] = source
|
| 185 |
+
|
| 186 |
+
# Add num_parameters for existing models (set to 0 as they don't have this info)
|
| 187 |
+
model_data[model_name]['num_parameters'] = 0
|
| 188 |
|
| 189 |
# Create table data
|
| 190 |
table_data = []
|
|
|
|
| 206 |
if source == 'unknown':
|
| 207 |
continue
|
| 208 |
|
| 209 |
+
# Apply parameter filtering (only for Portuguese leaderboard models)
|
| 210 |
+
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
|
| 211 |
+
num_parameters = data.get('num_parameters', 0)
|
| 212 |
+
if num_parameters > max_num_parameters:
|
| 213 |
+
continue
|
| 214 |
+
|
| 215 |
# Create clickable link for model name
|
| 216 |
if data['url']:
|
| 217 |
model_display = f"[{model_name}]({data['url']})"
|
|
|
|
| 408 |
print(f"Error deleting file {current_csv_file}: {e}")
|
| 409 |
|
| 410 |
|
| 411 |
+
def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
|
| 412 |
"""Create a radar chart showing model performance across all datasets."""
|
| 413 |
# Use selected datasets if provided, otherwise use all datasets
|
| 414 |
if selected_datasets is None:
|
|
|
|
| 445 |
model_data[model_name] = {
|
| 446 |
'performances': {},
|
| 447 |
'architecture': 'Unknown',
|
| 448 |
+
'source': 'portuguese_leaderboard',
|
| 449 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
| 450 |
}
|
| 451 |
|
| 452 |
# Map Portuguese leaderboard columns to dataset names
|
| 453 |
column_mapping = {
|
| 454 |
'ASSIN2 RTE': 'assin2_rte',
|
| 455 |
'ASSIN2 STS': 'assin2_sts',
|
| 456 |
+
'FaQUaD-NLI': 'faquad-nli',
|
| 457 |
'HateBR': 'hatebr'
|
| 458 |
}
|
| 459 |
|
|
|
|
| 472 |
model_data[model_name] = {
|
| 473 |
'performances': {},
|
| 474 |
'architecture': 'Unknown',
|
| 475 |
+
'source': 'external_models',
|
| 476 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
| 477 |
}
|
| 478 |
|
| 479 |
# Map external models columns to dataset names
|
| 480 |
column_mapping = {
|
| 481 |
'ASSIN2 RTE': 'assin2_rte',
|
| 482 |
'ASSIN2 STS': 'assin2_sts',
|
| 483 |
+
'FaQUaD-NLI': 'faquad-nli',
|
| 484 |
'HateBR': 'hatebr'
|
| 485 |
}
|
| 486 |
|
|
|
|
| 504 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
| 505 |
source = model_metadata.get('source', 'unknown')
|
| 506 |
model_data[model_name]['source'] = source
|
| 507 |
+
|
| 508 |
+
# Add num_parameters for existing models (set to 0 as they don't have this info)
|
| 509 |
+
model_data[model_name]['num_parameters'] = 0
|
| 510 |
|
| 511 |
# Apply source filtering
|
| 512 |
filtered_model_data = {}
|
|
|
|
| 526 |
if source == 'unknown':
|
| 527 |
continue
|
| 528 |
|
| 529 |
+
# Apply parameter filtering (only for Portuguese leaderboard models)
|
| 530 |
+
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
|
| 531 |
+
num_parameters = data.get('num_parameters', 0)
|
| 532 |
+
if num_parameters > max_num_parameters:
|
| 533 |
+
continue
|
| 534 |
+
|
| 535 |
filtered_model_data[model_name] = data
|
| 536 |
|
| 537 |
# Apply incomplete model filtering
|
|
|
|
| 756 |
dataset_checkboxes = []
|
| 757 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
| 758 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
| 759 |
+
# Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
|
| 760 |
+
default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
|
| 761 |
checkbox = gr.Checkbox(
|
| 762 |
label=display_name,
|
| 763 |
value=default_value
|
|
|
|
| 799 |
value=True
|
| 800 |
)
|
| 801 |
|
| 802 |
+
# Calculate max parameters for slider
|
| 803 |
+
max_params = 0
|
| 804 |
+
if not PORTUGUESE_LEADERBOARD_DATA.empty:
|
| 805 |
+
max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max())
|
| 806 |
+
|
| 807 |
+
with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
|
| 808 |
+
with gr.Row():
|
| 809 |
+
max_num_parameters = gr.Slider(
|
| 810 |
+
minimum=0,
|
| 811 |
+
maximum=max_params,
|
| 812 |
+
value=0,
|
| 813 |
+
step=1,
|
| 814 |
+
label="Maximum Number of Parameters",
|
| 815 |
+
info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
|
| 816 |
+
)
|
| 817 |
+
|
| 818 |
# Search bar for filtering models
|
| 819 |
search_query = gr.Textbox(
|
| 820 |
label="Search models by name (supports regex)",
|
|
|
|
| 848 |
analysis_dataset_checkboxes = []
|
| 849 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
| 850 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
| 851 |
+
# Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
|
| 852 |
+
default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
|
| 853 |
checkbox = gr.Checkbox(
|
| 854 |
label=display_name,
|
| 855 |
value=default_value
|
|
|
|
| 894 |
value=True
|
| 895 |
)
|
| 896 |
|
| 897 |
+
# Parameter slider for Model Analysis tab
|
| 898 |
+
with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
|
| 899 |
+
with gr.Row():
|
| 900 |
+
max_num_parameters_analysis = gr.Slider(
|
| 901 |
+
minimum=0,
|
| 902 |
+
maximum=max_params,
|
| 903 |
+
value=0,
|
| 904 |
+
step=1,
|
| 905 |
+
label="Maximum Number of Parameters",
|
| 906 |
+
info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
|
| 907 |
+
)
|
| 908 |
+
|
| 909 |
# Search bar for filtering models in radar chart
|
| 910 |
search_query_analysis = gr.Textbox(
|
| 911 |
label="Search models by name (supports regex)",
|
|
|
|
| 916 |
|
| 917 |
model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
|
| 918 |
|
| 919 |
+
# Add scatter plot below radar chart
|
| 920 |
+
model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters")
|
| 921 |
+
|
| 922 |
gr.Markdown("""
|
| 923 |
**How to interact with the chart:**
|
| 924 |
- **Click on legend items** to show/hide specific models.
|
|
|
|
| 974 |
|
| 975 |
""")
|
| 976 |
|
| 977 |
+
def create_model_performance_scatter(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
|
| 978 |
+
"""Create a scatter plot showing model performance vs number of parameters."""
|
| 979 |
+
# Use selected datasets if provided, otherwise use all datasets
|
| 980 |
+
if selected_datasets is None:
|
| 981 |
+
selected_datasets = list(NAPOLAB_DATASETS.keys())
|
| 982 |
+
|
| 983 |
+
# Collect data for each model
|
| 984 |
+
model_data = {}
|
| 985 |
+
|
| 986 |
+
# Process existing benchmark results
|
| 987 |
+
for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
|
| 988 |
+
if dataset_name in selected_datasets:
|
| 989 |
+
for model_name, metrics in models.items():
|
| 990 |
+
if model_name not in model_data:
|
| 991 |
+
# Get actual source from MODEL_METADATA
|
| 992 |
+
model_metadata = MODEL_METADATA.get(model_name, {})
|
| 993 |
+
actual_source = model_metadata.get('source', 'unknown')
|
| 994 |
+
|
| 995 |
+
model_data[model_name] = {
|
| 996 |
+
'performances': {},
|
| 997 |
+
'architecture': model_metadata.get('architecture', 'Unknown'),
|
| 998 |
+
'source': actual_source,
|
| 999 |
+
'num_parameters': 0
|
| 1000 |
+
}
|
| 1001 |
+
|
| 1002 |
+
# Calculate average performance for this dataset
|
| 1003 |
+
avg_performance = np.mean(list(metrics.values()))
|
| 1004 |
+
model_data[model_name]['performances'][dataset_name] = avg_performance
|
| 1005 |
+
|
| 1006 |
+
# Process Portuguese leaderboard data
|
| 1007 |
+
if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
|
| 1008 |
+
for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
|
| 1009 |
+
model_name = row['model_name']
|
| 1010 |
+
|
| 1011 |
+
if model_name not in model_data:
|
| 1012 |
+
model_data[model_name] = {
|
| 1013 |
+
'performances': {},
|
| 1014 |
+
'architecture': 'Unknown',
|
| 1015 |
+
'source': 'portuguese_leaderboard',
|
| 1016 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
| 1017 |
+
}
|
| 1018 |
+
|
| 1019 |
+
# Map Portuguese leaderboard columns to dataset names
|
| 1020 |
+
column_mapping = {
|
| 1021 |
+
'ASSIN2 RTE': 'assin2_rte',
|
| 1022 |
+
'ASSIN2 STS': 'assin2_sts',
|
| 1023 |
+
'FaQUaD-NLI': 'faquad-nli',
|
| 1024 |
+
'HateBR': 'hatebr'
|
| 1025 |
+
}
|
| 1026 |
+
|
| 1027 |
+
for display_name, dataset_name in column_mapping.items():
|
| 1028 |
+
if dataset_name in selected_datasets:
|
| 1029 |
+
score = row[display_name]
|
| 1030 |
+
if pd.notna(score) and score > 0:
|
| 1031 |
+
model_data[model_name]['performances'][dataset_name] = score
|
| 1032 |
+
|
| 1033 |
+
# Process external models data
|
| 1034 |
+
if show_external_models and not EXTERNAL_MODELS_DATA.empty:
|
| 1035 |
+
for _, row in EXTERNAL_MODELS_DATA.iterrows():
|
| 1036 |
+
model_name = row['model_name']
|
| 1037 |
+
|
| 1038 |
+
if model_name not in model_data:
|
| 1039 |
+
model_data[model_name] = {
|
| 1040 |
+
'performances': {},
|
| 1041 |
+
'architecture': 'Unknown',
|
| 1042 |
+
'source': 'external_models',
|
| 1043 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
| 1044 |
+
}
|
| 1045 |
+
|
| 1046 |
+
# Map external models columns to dataset names
|
| 1047 |
+
column_mapping = {
|
| 1048 |
+
'ASSIN2 RTE': 'assin2_rte',
|
| 1049 |
+
'ASSIN2 STS': 'assin2_sts',
|
| 1050 |
+
'FaQUaD-NLI': 'faquad-nli',
|
| 1051 |
+
'HateBR': 'hatebr'
|
| 1052 |
+
}
|
| 1053 |
+
|
| 1054 |
+
for display_name, dataset_name in column_mapping.items():
|
| 1055 |
+
if dataset_name in selected_datasets:
|
| 1056 |
+
score = row[display_name]
|
| 1057 |
+
if pd.notna(score) and score > 0:
|
| 1058 |
+
model_data[model_name]['performances'][dataset_name] = score
|
| 1059 |
+
|
| 1060 |
+
# Apply source filtering
|
| 1061 |
+
filtered_model_data = {}
|
| 1062 |
+
for model_name, data in model_data.items():
|
| 1063 |
+
source = data.get('source', 'existing')
|
| 1064 |
+
|
| 1065 |
+
# Apply show filters - only show models from sources that are checked
|
| 1066 |
+
if source == 'napolab_thesis' and not show_napolab_thesis:
|
| 1067 |
+
continue
|
| 1068 |
+
if source == 'teenytinyllama_paper' and not show_teenytinyllama:
|
| 1069 |
+
continue
|
| 1070 |
+
if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
|
| 1071 |
+
continue
|
| 1072 |
+
if source == 'external_models' and not show_external_models:
|
| 1073 |
+
continue
|
| 1074 |
+
# Hide models with unknown source (should not happen with proper data)
|
| 1075 |
+
if source == 'unknown':
|
| 1076 |
+
continue
|
| 1077 |
+
|
| 1078 |
+
# Apply parameter filtering (only for Portuguese leaderboard models)
|
| 1079 |
+
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
|
| 1080 |
+
num_parameters = data.get('num_parameters', 0)
|
| 1081 |
+
if num_parameters > max_num_parameters:
|
| 1082 |
+
continue
|
| 1083 |
+
|
| 1084 |
+
filtered_model_data[model_name] = data
|
| 1085 |
+
|
| 1086 |
+
# Apply incomplete model filtering
|
| 1087 |
+
if hide_incomplete_models and selected_datasets:
|
| 1088 |
+
final_filtered_data = {}
|
| 1089 |
+
for model_name, data in filtered_model_data.items():
|
| 1090 |
+
has_all_scores = True
|
| 1091 |
+
for dataset_name in selected_datasets:
|
| 1092 |
+
if data['performances'].get(dataset_name, 0) == 0:
|
| 1093 |
+
has_all_scores = False
|
| 1094 |
+
break
|
| 1095 |
+
if has_all_scores:
|
| 1096 |
+
final_filtered_data[model_name] = data
|
| 1097 |
+
filtered_model_data = final_filtered_data
|
| 1098 |
+
|
| 1099 |
+
# Apply minimum average performance filtering
|
| 1100 |
+
if min_average_performance > 0 and selected_datasets:
|
| 1101 |
+
final_filtered_data = {}
|
| 1102 |
+
for model_name, data in filtered_model_data.items():
|
| 1103 |
+
# Calculate average performance for selected datasets
|
| 1104 |
+
scores = []
|
| 1105 |
+
for dataset_name in selected_datasets:
|
| 1106 |
+
score = data['performances'].get(dataset_name, 0)
|
| 1107 |
+
if score > 0: # Only include non-zero scores
|
| 1108 |
+
scores.append(score)
|
| 1109 |
+
|
| 1110 |
+
if scores:
|
| 1111 |
+
avg_performance = np.mean(scores)
|
| 1112 |
+
if avg_performance >= min_average_performance:
|
| 1113 |
+
final_filtered_data[model_name] = data
|
| 1114 |
+
filtered_model_data = final_filtered_data
|
| 1115 |
+
|
| 1116 |
+
# Apply search query filtering
|
| 1117 |
+
if search_query:
|
| 1118 |
+
final_filtered_data = {}
|
| 1119 |
+
try:
|
| 1120 |
+
# Use regex pattern matching
|
| 1121 |
+
import re
|
| 1122 |
+
pattern = re.compile(search_query, re.IGNORECASE)
|
| 1123 |
+
for model_name, data in filtered_model_data.items():
|
| 1124 |
+
if pattern.search(model_name):
|
| 1125 |
+
final_filtered_data[model_name] = data
|
| 1126 |
+
except re.error:
|
| 1127 |
+
# Fallback to simple string matching if regex is invalid
|
| 1128 |
+
for model_name, data in filtered_model_data.items():
|
| 1129 |
+
if search_query.lower() in model_name.lower():
|
| 1130 |
+
final_filtered_data[model_name] = data
|
| 1131 |
+
filtered_model_data = final_filtered_data
|
| 1132 |
+
|
| 1133 |
+
# Prepare data for scatter plot
|
| 1134 |
+
scatter_data = []
|
| 1135 |
+
for model_name, data in filtered_model_data.items():
|
| 1136 |
+
# Calculate average performance for selected datasets
|
| 1137 |
+
scores = []
|
| 1138 |
+
for dataset_name in selected_datasets:
|
| 1139 |
+
score = data['performances'].get(dataset_name, 0)
|
| 1140 |
+
if score > 0: # Only include non-zero scores
|
| 1141 |
+
scores.append(score)
|
| 1142 |
+
|
| 1143 |
+
if scores:
|
| 1144 |
+
avg_performance = np.mean(scores)
|
| 1145 |
+
num_parameters = data.get('num_parameters', 0)
|
| 1146 |
+
source = data.get('source', 'unknown')
|
| 1147 |
+
|
| 1148 |
+
scatter_data.append({
|
| 1149 |
+
'model_name': model_name,
|
| 1150 |
+
'avg_performance': avg_performance,
|
| 1151 |
+
'num_parameters': num_parameters,
|
| 1152 |
+
'source': source
|
| 1153 |
+
})
|
| 1154 |
+
|
| 1155 |
+
if not scatter_data:
|
| 1156 |
+
# Create empty figure if no data
|
| 1157 |
+
fig = go.Figure()
|
| 1158 |
+
fig.add_annotation(
|
| 1159 |
+
text="No data available for the selected filters",
|
| 1160 |
+
xref="paper", yref="paper",
|
| 1161 |
+
x=0.5, y=0.5, showarrow=False,
|
| 1162 |
+
font=dict(size=16)
|
| 1163 |
+
)
|
| 1164 |
+
fig.update_layout(
|
| 1165 |
+
title="Model Performance vs Number of Parameters",
|
| 1166 |
+
xaxis_title="Number of Parameters",
|
| 1167 |
+
yaxis_title="Average Performance Score",
|
| 1168 |
+
height=500
|
| 1169 |
+
)
|
| 1170 |
+
return fig
|
| 1171 |
+
|
| 1172 |
+
# Create scatter plot
|
| 1173 |
+
df_scatter = pd.DataFrame(scatter_data)
|
| 1174 |
+
|
| 1175 |
+
# Create color mapping for sources
|
| 1176 |
+
color_map = {
|
| 1177 |
+
'portuguese_leaderboard': '#1f77b4',
|
| 1178 |
+
'external_models': '#ff7f0e',
|
| 1179 |
+
'napolab_thesis': '#2ca02c',
|
| 1180 |
+
'teenytinyllama_paper': '#d62728',
|
| 1181 |
+
'unknown': '#9467bd'
|
| 1182 |
+
}
|
| 1183 |
+
|
| 1184 |
+
# Create display name mapping for sources
|
| 1185 |
+
display_name_map = {
|
| 1186 |
+
'portuguese_leaderboard': 'Open PT LLM Leaderboard',
|
| 1187 |
+
'external_models': 'Proprietary Models',
|
| 1188 |
+
'napolab_thesis': 'Napolab Thesis',
|
| 1189 |
+
'teenytinyllama_paper': 'TeenyTinyLlama Paper',
|
| 1190 |
+
'unknown': 'Unknown Source'
|
| 1191 |
+
}
|
| 1192 |
+
|
| 1193 |
+
fig = go.Figure()
|
| 1194 |
+
|
| 1195 |
+
for source in df_scatter['source'].unique():
|
| 1196 |
+
source_data = df_scatter[df_scatter['source'] == source]
|
| 1197 |
+
color = color_map.get(source, '#7f7f7f')
|
| 1198 |
+
display_name = display_name_map.get(source, source.replace('_', ' ').title())
|
| 1199 |
+
|
| 1200 |
+
fig.add_trace(go.Scatter(
|
| 1201 |
+
x=source_data['num_parameters'],
|
| 1202 |
+
y=source_data['avg_performance'],
|
| 1203 |
+
mode='markers',
|
| 1204 |
+
name=display_name,
|
| 1205 |
+
marker=dict(
|
| 1206 |
+
color=color,
|
| 1207 |
+
size=8,
|
| 1208 |
+
opacity=0.7
|
| 1209 |
+
),
|
| 1210 |
+
text=source_data['model_name'],
|
| 1211 |
+
hovertemplate=(
|
| 1212 |
+
"<b>%{text}</b><br>" +
|
| 1213 |
+
"Average Performance: %{y:.3f}<br>" +
|
| 1214 |
+
"Number of Parameters: %{x:,}<br>" +
|
| 1215 |
+
"Source: " + display_name + "<br>" +
|
| 1216 |
+
"<extra></extra>"
|
| 1217 |
+
)
|
| 1218 |
+
))
|
| 1219 |
+
|
| 1220 |
+
fig.update_layout(
|
| 1221 |
+
title="Model Performance vs Number of Parameters",
|
| 1222 |
+
xaxis_title="Number of Parameters",
|
| 1223 |
+
yaxis_title="Average Performance Score",
|
| 1224 |
+
height=500,
|
| 1225 |
+
showlegend=True,
|
| 1226 |
+
plot_bgcolor='rgba(255, 255, 255, 0)',
|
| 1227 |
+
paper_bgcolor='rgba(255, 255, 255, 0)',
|
| 1228 |
+
legend=dict(
|
| 1229 |
+
yanchor="top",
|
| 1230 |
+
y=-0.15,
|
| 1231 |
+
xanchor="center",
|
| 1232 |
+
x=0.5,
|
| 1233 |
+
bgcolor='rgba(255, 255, 255, 0.95)',
|
| 1234 |
+
bordercolor='rgba(0, 0, 0, 0.2)',
|
| 1235 |
+
borderwidth=1,
|
| 1236 |
+
orientation="h"
|
| 1237 |
+
),
|
| 1238 |
+
margin=dict(l=50, r=50, t=100, b=100)
|
| 1239 |
+
)
|
| 1240 |
+
|
| 1241 |
+
return fig
|
| 1242 |
+
|
| 1243 |
# Event handlers
|
| 1244 |
def update_radar_chart(*args):
|
| 1245 |
# Extract arguments for radar chart
|
|
|
|
| 1251 |
show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
|
| 1252 |
show_external_models = args[len(analysis_dataset_checkboxes) + 5]
|
| 1253 |
search_query = args[len(analysis_dataset_checkboxes) + 6]
|
| 1254 |
+
max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
|
| 1255 |
|
| 1256 |
# Convert dataset selections to list of selected dataset names
|
| 1257 |
selected_datasets = []
|
|
|
|
| 1259 |
if dataset_values[i]:
|
| 1260 |
selected_datasets.append(dataset_name)
|
| 1261 |
|
| 1262 |
+
return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
|
| 1263 |
|
| 1264 |
def update_benchmark_table(*args):
|
| 1265 |
# Extract arguments
|
|
|
|
| 1271 |
show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
|
| 1272 |
show_external_models = args[len(dataset_checkboxes) + 5]
|
| 1273 |
search_query = args[len(dataset_checkboxes) + 6]
|
| 1274 |
+
max_num_parameters = args[len(dataset_checkboxes) + 7]
|
| 1275 |
|
| 1276 |
# Convert dataset selections to list of selected dataset names
|
| 1277 |
selected_datasets = []
|
|
|
|
| 1279 |
if dataset_values[i]:
|
| 1280 |
selected_datasets.append(dataset_name)
|
| 1281 |
|
| 1282 |
+
df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
|
| 1283 |
|
| 1284 |
return df
|
| 1285 |
|
| 1286 |
+
def update_scatter_plot(*args):
    """Turn the raw analysis-tab control values into a scatter-plot figure.

    ``args`` holds one checkbox value per entry of
    ``analysis_dataset_checkboxes`` followed by eight filter values, in this
    order: hide_incomplete_models, min_average_performance (a percentage),
    show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard,
    show_external_models, search_query, max_num_parameters.

    Returns whatever ``create_model_performance_scatter`` returns for those
    settings.
    """
    # The first block of positional values belongs to the dataset checkboxes;
    # everything after it is the fixed-order filter tail.
    n_datasets = len(analysis_dataset_checkboxes)
    checkbox_states = args[:n_datasets]
    filters = args[n_datasets:]

    hide_incomplete_models = filters[0]
    # The control supplies a percentage; the plotting function takes a fraction.
    min_average_performance = filters[1] / 100.0
    show_napolab_thesis = filters[2]
    show_teenytinyllama = filters[3]
    show_portuguese_leaderboard = filters[4]
    show_external_models = filters[5]
    search_query = filters[6]
    max_num_parameters = filters[7]

    # Keep only the dataset names whose checkbox is ticked.
    selected_datasets = [
        dataset_name
        for (dataset_name, _), is_checked in zip(analysis_dataset_checkboxes, checkbox_states)
        if is_checked
    ]

    return create_model_performance_scatter(
        selected_datasets,
        show_napolab_thesis,
        show_teenytinyllama,
        show_portuguese_leaderboard,
        show_external_models,
        hide_incomplete_models,
        min_average_performance,
        search_query,
        max_num_parameters,
    )
|
| 1305 |
|
| 1306 |
# Connect every benchmark-table control to update_benchmark_table.
# update_benchmark_table reads its arguments positionally (dataset checkbox
# values first, then the filter controls), so all handlers must receive the
# same inputs in the same order. The list is built once and reused instead of
# being repeated for every control; iterating over it registers the handlers
# in the same order as before (dataset checkboxes, then each filter control).
table_inputs = [cb for _, cb in dataset_checkboxes] + [
    hide_incomplete_models,
    min_average_performance,
    show_napolab_thesis,
    show_teenytinyllama,
    show_portuguese_leaderboard,
    show_external_models,
    search_query,
    max_num_parameters,
]
for control in table_inputs:
    # A change to any control re-renders the table from the full control state.
    control.change(
        update_benchmark_table,
        inputs=table_inputs,
        outputs=benchmark_table
    )
|
| 1363 |
|
|
|
|
| 1380 |
# Connect every analysis-tab control to update_radar_chart.
# update_radar_chart reads its arguments positionally (dataset checkbox values
# first, then the filter controls), so all handlers must receive the same
# inputs in the same order. The list is built once and reused instead of being
# repeated for every control; iterating over it registers the handlers in the
# same order as before (dataset checkboxes, then each filter control).
radar_inputs = [cb for _, cb in analysis_dataset_checkboxes] + [
    hide_incomplete_models_analysis,
    min_average_performance_analysis,
    show_napolab_thesis_analysis,
    show_teenytinyllama_analysis,
    show_portuguese_leaderboard_analysis,
    show_external_models_analysis,
    search_query_analysis,
    max_num_parameters_analysis,
]
for control in radar_inputs:
    # A change to any control redraws the radar chart from the full control state.
    control.change(
        update_radar_chart,
        inputs=radar_inputs,
        outputs=model_analysis_chart
    )
|
| 1436 |
+
|
| 1437 |
+
# Connect all analysis controls to update scatter plot.
# update_scatter_plot unpacks its arguments positionally: one value per
# dataset checkbox, then hide_incomplete_models, min_average_performance,
# the four show_* toggles, search_query and max_num_parameters. The inputs
# list below follows that order and is built once instead of being repeated
# for every control; iterating over it registers the handlers in the same
# order as before (dataset checkboxes, then each filter control).
scatter_inputs = [cb for _, cb in analysis_dataset_checkboxes] + [
    hide_incomplete_models_analysis,
    min_average_performance_analysis,
    show_napolab_thesis_analysis,
    show_teenytinyllama_analysis,
    show_portuguese_leaderboard_analysis,
    show_external_models_analysis,
    search_query_analysis,
    max_num_parameters_analysis,
]
for control in scatter_inputs:
    # A change to any control redraws the scatter plot from the full control state.
    control.change(
        update_scatter_plot,
        inputs=scatter_inputs,
        outputs=model_scatter_plot
    )
|
| 1492 |
+
|
| 1493 |
+
# Connect events
def _initial_control_values():
    """Build the positional arguments passed to the update_* handlers on page load.

    The result has one boolean per dataset in ``NAPOLAB_DATASETS`` (taken in
    sorted key order), true when the dataset's display name is one of the
    preselected ones, followed by the filter values: hide incomplete models
    (True), minimum average performance (80, a percentage), the four source
    toggles (all True), an empty search query, and a max-parameter value of 0.
    """
    preselected_names = ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
    dataset_flags = [
        NAPOLAB_DATASETS[key].get('name', key) in preselected_names
        for key in sorted(NAPOLAB_DATASETS.keys())
    ]
    return dataset_flags + [True, 80, True, True, True, True, "", 0]

# Load model analysis chart on app start
app.load(lambda: update_radar_chart(*_initial_control_values()), outputs=model_analysis_chart)

# Load scatter plot on app start
app.load(lambda: update_scatter_plot(*_initial_control_values()), outputs=model_scatter_plot)

# Load benchmark table on app start
app.load(lambda: update_benchmark_table(*_initial_control_values()), outputs=benchmark_table)
|
| 1502 |
|
| 1503 |
if __name__ == "__main__":
|
| 1504 |
app.launch(server_name="0.0.0.0", server_port=7860)
|
external_models.csv
CHANGED
|
@@ -1,31 +1,31 @@
|
|
| 1 |
-
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
|
| 2 |
-
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
|
| 3 |
-
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
|
| 4 |
-
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
|
| 5 |
-
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
|
| 6 |
-
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
|
| 7 |
-
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
|
| 8 |
-
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
|
| 9 |
-
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
|
| 10 |
-
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
|
| 11 |
-
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
|
| 12 |
-
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
|
| 13 |
-
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
|
| 14 |
-
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
|
| 15 |
-
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
|
| 16 |
-
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
|
| 17 |
-
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
|
| 18 |
-
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
|
| 19 |
-
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
|
| 20 |
-
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
|
| 21 |
-
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
|
| 22 |
-
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
|
| 23 |
-
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
| 24 |
-
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
|
| 25 |
-
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
| 26 |
-
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
|
| 27 |
-
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
|
| 28 |
-
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
|
| 29 |
-
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
|
| 30 |
-
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
|
| 31 |
-
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
|
|
|
|
| 1 |
+
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
|
| 2 |
+
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
|
| 3 |
+
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
|
| 4 |
+
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
|
| 5 |
+
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
|
| 6 |
+
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
|
| 7 |
+
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
|
| 8 |
+
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
|
| 9 |
+
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
|
| 10 |
+
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
|
| 11 |
+
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
|
| 12 |
+
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
|
| 13 |
+
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
|
| 14 |
+
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
|
| 15 |
+
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
|
| 16 |
+
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
|
| 17 |
+
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
|
| 18 |
+
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
|
| 19 |
+
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
|
| 20 |
+
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
|
| 21 |
+
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
|
| 22 |
+
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
|
| 23 |
+
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
| 24 |
+
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
|
| 25 |
+
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
| 26 |
+
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
|
| 27 |
+
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
|
| 28 |
+
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
|
| 29 |
+
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
|
| 30 |
+
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
|
| 31 |
+
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
|
extract_portuguese_leaderboard.py
CHANGED
|
@@ -83,6 +83,7 @@ def extract_data_from_json(json_file_path):
|
|
| 83 |
# Extract model information
|
| 84 |
model_name = config_general.get('model_name', '')
|
| 85 |
model_private = config_general.get('model_private', False)
|
|
|
|
| 86 |
|
| 87 |
# Extract results
|
| 88 |
all_grouped = results.get('all_grouped', {})
|
|
@@ -98,6 +99,7 @@ def extract_data_from_json(json_file_path):
|
|
| 98 |
'json_file': str(json_file_path),
|
| 99 |
'model_name': model_name,
|
| 100 |
'model_private': model_private,
|
|
|
|
| 101 |
'assin2_rte': assin2_rte,
|
| 102 |
'assin2_sts': assin2_sts,
|
| 103 |
'faquad_nli': faquad_nli,
|
|
|
|
| 83 |
# Extract model information
|
| 84 |
model_name = config_general.get('model_name', '')
|
| 85 |
model_private = config_general.get('model_private', False)
|
| 86 |
+
model_num_parameters = config_general.get('model_num_parameters', 0)
|
| 87 |
|
| 88 |
# Extract results
|
| 89 |
all_grouped = results.get('all_grouped', {})
|
|
|
|
| 99 |
'json_file': str(json_file_path),
|
| 100 |
'model_name': model_name,
|
| 101 |
'model_private': model_private,
|
| 102 |
+
'model_num_parameters': model_num_parameters,
|
| 103 |
'assin2_rte': assin2_rte,
|
| 104 |
'assin2_sts': assin2_sts,
|
| 105 |
'faquad_nli': faquad_nli,
|
portuguese_leaderboard.csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|