join Trust & Safety table
- src/display/utils.py +14 -10
- src/populate.py +11 -5
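Net effect of the commit: the Trust & Safety level-2 metrics (Safety, Privacy, Truthfulness, CRM Bias) and their rolled-up Trust & Safety score become columns of the main AutoEvalColumn leaderboard, and populate.py computes the roll-up and joins it onto the accuracy table.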
src/display/utils.py
CHANGED

@@ -26,35 +26,39 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown",
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
-auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown",
+auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", False)])
 auto_eval_column_dict.append(
     [
         "accuracy_metric_instruction_following",
         ColumnContent,
-        ColumnContent("Instruction Following", "markdown",
+        ColumnContent("Instruction Following", "markdown", False),
     ]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown",
+    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", False)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown",
+    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", False)]
 )
 auto_eval_column_dict.append(
-    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown",
+    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", False)]
 )
-#
-#     ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
-# )
+# Speed (Latency) & Cost metrics
 auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 auto_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
 auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
+# Trust & Safety metrics
+auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
+auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
+auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
+auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
+auto_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias", "markdown", False)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -80,7 +84,7 @@ CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=
 ts_eval_column_dict = []
 # Init
 ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
-ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+# ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
 ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
 ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
 ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
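For context, the [attribute_name, attribute_type, default] triples above are the leaderboard-template idiom for registering columns. A minimal runnable sketch of the mechanism follows; the ColumnContent definition here is an assumption modeled on the stock Hugging Face leaderboard template, not copied from this repo, and only the append/make_dataclass pattern mirrors the diff.

from dataclasses import dataclass, make_dataclass

# Assumed shape of ColumnContent (stock HF leaderboard template); the third
# positional argument controls whether the column is displayed by default.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])

# Each [name, type, default] triple becomes one frozen-dataclass field, with the
# ColumnContent instance stored as that field's default value.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.ts.name)             # Trust & Safety
print(AutoEvalColumn.model.never_hidden)  # True

This is why populate.py can look up display labels through attributes such as AutoEvalColumn.accuracy_metric_average.name.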
src/populate.py
CHANGED

@@ -31,10 +31,9 @@ def get_leaderboard_df_crm(
 )
 
 leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
-
+leaderboard_ts_crm_bias_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_crm_bias.csv"))
 leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
-leaderboard_ts_df = leaderboard_ts_df.join(
-leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
+leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts_crm_bias_df.set_index("Model Name"), on="Model Name")
 privacy_cols = leaderboard_ts_df[
     [
         "Privacy Zero-Shot Match Avoidance",

@@ -47,7 +46,7 @@ def get_leaderboard_df_crm(
 leaderboard_ts_df["Privacy"] = privacy_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
 leaderboard_ts_df["Bias No CI"] = leaderboard_ts_df["CRM Bias"].transform(lambda x: x.split(" ")[0])
 
-
+ts_lvl2_cols = leaderboard_ts_df[
     [
         "Safety",
         "Privacy",

@@ -55,7 +54,14 @@ def get_leaderboard_df_crm(
         "Bias No CI",
     ]
 ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
-leaderboard_ts_df["Trust & Safety"] =
+leaderboard_ts_df["Trust & Safety"] = ts_lvl2_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
+
+leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+    leaderboard_ts_df[ts_cols].set_index(["Model Name"]),
+    on=["Model Name"],
+)
+
+leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
 
 leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
     by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False