Corey Morris
committed on
Commit
·
d506f10
1
Parent(s):
5b83d0b
WIP commit. Currently hitting an error in the `nlargest` call.
Browse files
- app.py +32 -1
- result_data_processor.py +2 -0
app.py
CHANGED
|
@@ -107,7 +107,8 @@ def create_line_chart(df, model_names, metrics):
|
|
| 107 |
|
| 108 |
def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters']):
|
| 109 |
# Calculate the absolute differences for each task between the target model and the closest models
|
| 110 |
-
|
|
|
|
| 111 |
# Unstack the differences and sort by the largest absolute difference
|
| 112 |
top_differences = differences.unstack().nlargest(num_differences)
|
| 113 |
# Convert the top differences to a DataFrame for display
|
|
@@ -120,6 +121,36 @@ def find_top_differences_table(df, target_model, closest_models, num_differences
|
|
| 120 |
return top_differences_table, unique_top_differences_tasks
|
| 121 |
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
data_provider = ResultDataProcessor()
|
| 124 |
|
| 125 |
# st.title('Model Evaluation Results including MMLU by task')
|
|
|
|
| 107 |
|
| 108 |
def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters']):
|
| 109 |
# Calculate the absolute differences for each task between the target model and the closest models
|
| 110 |
+
new_df = df.drop(columns=exclude_columns)
|
| 111 |
+
differences = new_df.loc[closest_models].sub(new_df.loc[target_model]).abs()
|
| 112 |
# Unstack the differences and sort by the largest absolute difference
|
| 113 |
top_differences = differences.unstack().nlargest(num_differences)
|
| 114 |
# Convert the top differences to a DataFrame for display
|
|
|
|
| 121 |
return top_differences_table, unique_top_differences_tasks
|
| 122 |
|
| 123 |
|
| 124 |
+
# def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters', 'organization']):
|
| 125 |
+
# # Drop specified columns and create a new DataFrame
|
| 126 |
+
# new_df = df.drop(columns=exclude_columns)
|
| 127 |
+
|
| 128 |
+
# # Compute differences between target model and closest models, taking absolute values
|
| 129 |
+
# differences = new_df.loc[closest_models].sub(new_df.loc[target_model]).abs()
|
| 130 |
+
|
| 131 |
+
# # Unstack the differences
|
| 132 |
+
# unstacked_differences = differences.unstack()
|
| 133 |
+
|
| 134 |
+
# # Convert object types to numeric, ignoring errors to leave non-convertible elements as NaN
|
| 135 |
+
# unstacked_differences = pd.to_numeric(unstacked_differences, errors='coerce')
|
| 136 |
+
|
| 137 |
+
# # Find the top num_differences
|
| 138 |
+
# top_differences = unstacked_differences.nlargest(num_differences)
|
| 139 |
+
|
| 140 |
+
# # Convert the top differences to a DataFrame for display
|
| 141 |
+
# top_differences_table = pd.DataFrame({
|
| 142 |
+
# 'Task': [idx[0] for idx in top_differences.index],
|
| 143 |
+
# 'Difference': top_differences.values
|
| 144 |
+
# })
|
| 145 |
+
|
| 146 |
+
# # Ensure that only unique tasks are returned
|
| 147 |
+
# unique_top_differences_tasks = list(set(top_differences_table['Task'].tolist()))
|
| 148 |
+
|
| 149 |
+
# return top_differences_table, unique_top_differences_tasks
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
|
| 154 |
data_provider = ResultDataProcessor()
|
| 155 |
|
| 156 |
# st.title('Model Evaluation Results including MMLU by task')
|
result_data_processor.py
CHANGED
|
@@ -137,6 +137,8 @@ class ResultDataProcessor:
|
|
| 137 |
# remove extreme outliers from column harness|truthfulqa:mc1
|
| 138 |
data = self._remove_mc1_outliers(data)
|
| 139 |
|
|
|
|
|
|
|
| 140 |
return data
|
| 141 |
|
| 142 |
def rank_data(self):
|
|
|
|
| 137 |
# remove extreme outliers from column harness|truthfulqa:mc1
|
| 138 |
data = self._remove_mc1_outliers(data)
|
| 139 |
|
| 140 |
+
data = data.drop(columns=['organization'])
|
| 141 |
+
|
| 142 |
return data
|
| 143 |
|
| 144 |
def rank_data(self):
|