Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -409,7 +409,7 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 409 |
pagerank_result = evalica.pagerank(
|
| 410 |
vote_df["left"], vote_df["right"], vote_df["winner"]
|
| 411 |
)
|
| 412 |
-
|
| 413 |
# Load conversation data from the Hugging Face repository
|
| 414 |
conversation_data = load_content_from_hf("SE-Arena/conversations")
|
| 415 |
conversation_df = pd.DataFrame(conversation_data)
|
|
@@ -418,16 +418,16 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 418 |
all_df = pd.merge(
|
| 419 |
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
|
| 420 |
)
|
| 421 |
-
|
| 422 |
# Calculate Conversation Efficiency Indices more efficiently
|
| 423 |
# Create dictionaries to store each model's accumulated per-round scores and their normalizing totals
|
| 424 |
model_rcs_sum = {}
|
| 425 |
model_rcs_max = {}
|
| 426 |
-
|
| 427 |
# Process each row once and accumulate scores
|
| 428 |
for _, row in all_df.iterrows():
|
| 429 |
# Determine scores based on winner
|
| 430 |
-
match row["winner"]:
|
| 431 |
case evalica.Winner.X:
|
| 432 |
left_score = 1.0
|
| 433 |
right_score = -1.0
|
|
@@ -437,29 +437,45 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 437 |
case _: # Draw
|
| 438 |
left_score = 0.1
|
| 439 |
right_score = 0.1
|
| 440 |
-
|
| 441 |
# Count rounds for each side
|
| 442 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
| 443 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
| 444 |
-
|
| 445 |
left_model = row["left"]
|
| 446 |
right_model = row["right"]
|
| 447 |
-
|
| 448 |
model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
|
| 449 |
-
model_rcs_max[right_model] =
|
| 450 |
-
|
|
|
|
|
|
|
| 451 |
# Calculate per-round scores
|
| 452 |
-
model_rcs_sum[left_model] =
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
|
| 455 |
-
cei_result = {
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
self_matches = vote_df[vote_df["left"] == vote_df["right"]]
|
| 459 |
model_matches = self_matches.groupby("left")
|
| 460 |
-
draw_counts = model_matches["winner"].apply(
|
|
|
|
|
|
|
| 461 |
total_counts = model_matches.size()
|
| 462 |
-
mcs_result = (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
|
| 464 |
# Combine all results into a single DataFrame
|
| 465 |
leaderboard_data = pd.DataFrame(
|
|
@@ -491,7 +507,7 @@ def get_leaderboard_data(vote_entry=None):
|
|
| 491 |
|
| 492 |
# Add a Rank column based on Elo scores
|
| 493 |
leaderboard_data["Rank"] = (
|
| 494 |
-
leaderboard_data["Elo Score"].rank(ascending=False).astype(int)
|
| 495 |
)
|
| 496 |
|
| 497 |
# Place rank in the first column
|
|
|
|
| 409 |
pagerank_result = evalica.pagerank(
|
| 410 |
vote_df["left"], vote_df["right"], vote_df["winner"]
|
| 411 |
)
|
| 412 |
+
|
| 413 |
# Load conversation data from the Hugging Face repository
|
| 414 |
conversation_data = load_content_from_hf("SE-Arena/conversations")
|
| 415 |
conversation_df = pd.DataFrame(conversation_data)
|
|
|
|
| 418 |
all_df = pd.merge(
|
| 419 |
vote_df, conversation_df, on=["timestamp", "left", "right"], how="inner"
|
| 420 |
)
|
| 421 |
+
|
| 422 |
# Calculate Conversation Efficiency Indices more efficiently
|
| 423 |
# Create dictionaries to store each model's accumulated per-round scores and their normalizing totals
|
| 424 |
model_rcs_sum = {}
|
| 425 |
model_rcs_max = {}
|
| 426 |
+
|
| 427 |
# Process each row once and accumulate scores
|
| 428 |
for _, row in all_df.iterrows():
|
| 429 |
# Determine scores based on winner
|
| 430 |
+
match row["winner"]:
|
| 431 |
case evalica.Winner.X:
|
| 432 |
left_score = 1.0
|
| 433 |
right_score = -1.0
|
|
|
|
| 437 |
case _: # Draw
|
| 438 |
left_score = 0.1
|
| 439 |
right_score = 0.1
|
| 440 |
+
|
| 441 |
# Count rounds for each side
|
| 442 |
left_round = sum(1 for msg in row["left_chat"] if msg["role"] == "assistant")
|
| 443 |
right_round = sum(1 for msg in row["right_chat"] if msg["role"] == "assistant")
|
| 444 |
+
|
| 445 |
left_model = row["left"]
|
| 446 |
right_model = row["right"]
|
| 447 |
+
|
| 448 |
model_rcs_max[left_model] = model_rcs_max.get(left_model, 0) + 1.0 / left_round
|
| 449 |
+
model_rcs_max[right_model] = (
|
| 450 |
+
model_rcs_max.get(right_model, 0) + 1.0 / right_round
|
| 451 |
+
)
|
| 452 |
+
|
| 453 |
# Calculate per-round scores
|
| 454 |
+
model_rcs_sum[left_model] = (
|
| 455 |
+
model_rcs_sum.get(left_model, 0) + left_score / left_round
|
| 456 |
+
)
|
| 457 |
+
model_rcs_sum[right_model] = (
|
| 458 |
+
model_rcs_sum.get(right_model, 0) + right_score / right_round
|
| 459 |
+
)
|
| 460 |
|
| 461 |
+
cei_result = {
|
| 462 |
+
model: model_rcs_sum[model] / model_rcs_max[model] for model in model_rcs_sum
|
| 463 |
+
}
|
| 464 |
+
cei_result = pd.Series(
|
| 465 |
+
{model: cei_result[model] for model in elo_result.scores.index}
|
| 466 |
+
)
|
| 467 |
|
| 468 |
self_matches = vote_df[vote_df["left"] == vote_df["right"]]
|
| 469 |
model_matches = self_matches.groupby("left")
|
| 470 |
+
draw_counts = model_matches["winner"].apply(
|
| 471 |
+
lambda x: (x == evalica.Winner.Draw).sum()
|
| 472 |
+
)
|
| 473 |
total_counts = model_matches.size()
|
| 474 |
+
mcs_result = (
|
| 475 |
+
(draw_counts / total_counts)
|
| 476 |
+
.round(2)
|
| 477 |
+
.reindex(elo_result.scores.index, fill_value="N/A")
|
| 478 |
+
)
|
| 479 |
|
| 480 |
# Combine all results into a single DataFrame
|
| 481 |
leaderboard_data = pd.DataFrame(
|
|
|
|
| 507 |
|
| 508 |
# Add a Rank column based on Elo scores
|
| 509 |
leaderboard_data["Rank"] = (
|
| 510 |
+
leaderboard_data["Elo Score"].rank(method="min", ascending=False).astype(int)
|
| 511 |
)
|
| 512 |
|
| 513 |
# Place rank in the first column
|