Spaces:
Running
Running
refine
Browse files
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
title: SWE-Model-Arena
|
| 3 |
emoji: 🎯
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
hf_oauth: true
|
| 10 |
pinned: false
|
|
@@ -48,7 +48,6 @@ Existing evaluation frameworks (e.g. [LMArena](https://lmarena.ai)) often don't
|
|
| 48 |
### Prerequisites
|
| 49 |
|
| 50 |
- A [Hugging Face](https://huggingface.co) account
|
| 51 |
-
- Basic understanding of software engineering workflows
|
| 52 |
|
| 53 |
### Usage
|
| 54 |
|
|
|
|
| 1 |
---
|
| 2 |
title: SWE-Model-Arena
|
| 3 |
emoji: 🎯
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.50.0
|
| 8 |
app_file: app.py
|
| 9 |
hf_oauth: true
|
| 10 |
pinned: false
|
|
|
|
| 48 |
### Prerequisites
|
| 49 |
|
| 50 |
- A [Hugging Face](https://huggingface.co) account
|
|
|
|
| 51 |
|
| 52 |
### Usage
|
| 53 |
|
app.py
CHANGED
|
@@ -423,6 +423,7 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
|
|
| 423 |
columns=[
|
| 424 |
"Rank",
|
| 425 |
"Model",
|
|
|
|
| 426 |
"Elo Score",
|
| 427 |
"Conversation Efficiency Index",
|
| 428 |
"Model Consistency Score",
|
|
@@ -564,9 +565,13 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
|
|
| 564 |
mcs_result = pd.Series(mcs_result)
|
| 565 |
|
| 566 |
# Combine all results into a single DataFrame
|
|
|
|
|
|
|
|
|
|
| 567 |
leaderboard_data = pd.DataFrame(
|
| 568 |
{
|
| 569 |
"Model": elo_result.scores.index,
|
|
|
|
| 570 |
"Elo Score": elo_result.scores.values,
|
| 571 |
"Conversation Efficiency Index": cei_result.values,
|
| 572 |
"Model Consistency Score": mcs_result.values,
|
|
@@ -600,8 +605,6 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
|
|
| 600 |
["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"]
|
| 601 |
]
|
| 602 |
|
| 603 |
-
# Note: Links are made clickable via JavaScript, keeping plain model names here
|
| 604 |
-
|
| 605 |
# Save leaderboard data if this is a new vote
|
| 606 |
if vote_entry is not None:
|
| 607 |
try:
|
|
@@ -632,49 +635,6 @@ def toggle_submit_button(text):
|
|
| 632 |
return gr.update(interactive=True) # Enable the button
|
| 633 |
|
| 634 |
|
| 635 |
-
# JavaScript to make model name links clickable in the leaderboard
|
| 636 |
-
# Uses event delegation on the table container for better stability
|
| 637 |
-
clickable_links_js = (
|
| 638 |
-
"""
|
| 639 |
-
function() {
|
| 640 |
-
const modelLinks = """
|
| 641 |
-
+ json.dumps(model_links)
|
| 642 |
-
+ """;
|
| 643 |
-
|
| 644 |
-
function makeLinksClickable() {
|
| 645 |
-
// Find all table cells in the Model column (2nd column)
|
| 646 |
-
const rows = document.querySelectorAll('table tbody tr');
|
| 647 |
-
rows.forEach(row => {
|
| 648 |
-
const cells = row.querySelectorAll('td');
|
| 649 |
-
if (cells.length >= 2) {
|
| 650 |
-
const modelCell = cells[1]; // Model is the 2nd column
|
| 651 |
-
const cellText = modelCell.textContent.trim();
|
| 652 |
-
|
| 653 |
-
// Check if this cell contains a model name and hasn't been converted yet
|
| 654 |
-
if (modelLinks[cellText] && !modelCell.querySelector('a')) {
|
| 655 |
-
// Create clickable link
|
| 656 |
-
const link = document.createElement('a');
|
| 657 |
-
link.href = modelLinks[cellText];
|
| 658 |
-
link.textContent = cellText;
|
| 659 |
-
link.target = '_blank';
|
| 660 |
-
link.style.color = '#2563eb';
|
| 661 |
-
link.style.textDecoration = 'underline';
|
| 662 |
-
link.style.fontWeight = '500';
|
| 663 |
-
|
| 664 |
-
// Replace text with link
|
| 665 |
-
modelCell.innerHTML = '';
|
| 666 |
-
modelCell.appendChild(link);
|
| 667 |
-
}
|
| 668 |
-
}
|
| 669 |
-
});
|
| 670 |
-
}
|
| 671 |
-
|
| 672 |
-
// Continuous polling approach - runs every 300ms
|
| 673 |
-
// This is the ONLY reliable way with virtual scrolling components
|
| 674 |
-
setInterval(makeLinksClickable, 300);
|
| 675 |
-
}
|
| 676 |
-
"""
|
| 677 |
-
)
|
| 678 |
|
| 679 |
# Function to check initial authentication status
|
| 680 |
def check_auth_on_load(request: gr.Request):
|
|
@@ -725,7 +685,7 @@ def check_auth_on_load(request: gr.Request):
|
|
| 725 |
|
| 726 |
|
| 727 |
# Gradio Interface
|
| 728 |
-
with gr.Blocks(js=clickable_links_js) as app:
|
| 729 |
user_authenticated = gr.State(False)
|
| 730 |
models_state = gr.State({})
|
| 731 |
conversation_state = gr.State({})
|
|
@@ -735,24 +695,28 @@ with gr.Blocks(js=clickable_links_js) as app:
|
|
| 735 |
|
| 736 |
with gr.Tab("🏆Leaderboard"):
|
| 737 |
# Add title and description as a Markdown component
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
|
|
|
|
|
|
| 744 |
)
|
|
|
|
| 745 |
# Initialize the leaderboard with the DataFrame containing the expected columns
|
| 746 |
leaderboard_component = Leaderboard(
|
| 747 |
value=get_leaderboard_data(use_cache=True),
|
| 748 |
select_columns=[
|
| 749 |
"Rank",
|
| 750 |
"Model",
|
|
|
|
| 751 |
"Elo Score",
|
| 752 |
"Conversation Efficiency Index",
|
| 753 |
"Model Consistency Score",
|
| 754 |
],
|
| 755 |
-
search_columns=["Model"],
|
| 756 |
filter_columns=[
|
| 757 |
ColumnFilter(
|
| 758 |
"Elo Score",
|
|
@@ -822,6 +786,7 @@ with gr.Blocks(js=clickable_links_js) as app:
|
|
| 822 |
datatype=[
|
| 823 |
"number",
|
| 824 |
"str",
|
|
|
|
| 825 |
"number",
|
| 826 |
"number",
|
| 827 |
"number",
|
|
@@ -832,8 +797,12 @@ with gr.Blocks(js=clickable_links_js) as app:
|
|
| 832 |
"number",
|
| 833 |
],
|
| 834 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
# Add a citation block in Markdown
|
| 836 |
-
|
| 837 |
"""
|
| 838 |
Made with ❤️ for SWE-Model-Arena. If this work is useful to you, please consider citing our vision paper:
|
| 839 |
```
|
|
@@ -850,26 +819,29 @@ with gr.Blocks(js=clickable_links_js) as app:
|
|
| 850 |
)
|
| 851 |
with gr.Tab("⚔️Arena"):
|
| 852 |
# Add title and description as a Markdown component
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
# ⚔️ SWE-Model-Arena: Explore and Test Top FMs with SE Tasks by Community Voting
|
| 856 |
|
| 857 |
-
|
|
|
|
|
|
|
| 858 |
- **Blind Comparison**: Submit a SE-related query to two anonymous FMs randomly selected from up to {len(available_models)} top models from OpenAI, Gemini, Grok, Claude, Deepseek, Qwen, Llama, Mistral, and others.
|
| 859 |
- **Interactive Voting**: Engage in multi-turn dialogues with both FMs and compare their responses. You can continue the conversation until you confidently choose the better model.
|
| 860 |
- **Fair Play Rules**: Votes are counted only if FM identities remain anonymous. Revealing a FM's identity disqualifies the session.
|
| 861 |
-
|
| 862 |
-
**Note:** Due to budget constraints, responses that take longer than {TIMEOUT} seconds to generate will be discarded.
|
| 863 |
-
""",
|
| 864 |
-
elem_classes="arena-intro",
|
| 865 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 866 |
# Add Hugging Face Sign In button and message
|
| 867 |
with gr.Row():
|
| 868 |
# Define the markdown text with or without the hint string
|
| 869 |
-
markdown_text = "
|
| 870 |
if SHOW_HINT_STRING:
|
| 871 |
-
markdown_text += f"\n{HINT_STRING}"
|
| 872 |
-
hint_markdown = gr.Markdown(markdown_text
|
| 873 |
with gr.Column():
|
| 874 |
login_button = gr.LoginButton(
|
| 875 |
"Sign in with Hugging Face", elem_id="oauth-button"
|
|
@@ -1538,12 +1510,14 @@ with gr.Blocks(js=clickable_links_js) as app:
|
|
| 1538 |
],
|
| 1539 |
)
|
| 1540 |
|
|
|
|
|
|
|
|
|
|
| 1541 |
# Add Terms of Service at the bottom
|
| 1542 |
-
|
|
|
|
| 1543 |
"""
|
| 1544 |
-
|
| 1545 |
-
|
| 1546 |
-
Users are required to agree to the following terms before using the service:
|
| 1547 |
|
| 1548 |
- The service is a **research preview**. It only provides limited safety measures and may generate offensive content.
|
| 1549 |
- It must not be used for any **illegal, harmful, violent, racist, or sexual** purposes.
|
|
|
|
| 423 |
columns=[
|
| 424 |
"Rank",
|
| 425 |
"Model",
|
| 426 |
+
"Website",
|
| 427 |
"Elo Score",
|
| 428 |
"Conversation Efficiency Index",
|
| 429 |
"Model Consistency Score",
|
|
|
|
| 565 |
mcs_result = pd.Series(mcs_result)
|
| 566 |
|
| 567 |
# Combine all results into a single DataFrame
|
| 568 |
+
# Add Website column by mapping model names to their links
|
| 569 |
+
website_values = [model_links.get(model, "N/A") for model in elo_result.scores.index]
|
| 570 |
+
|
| 571 |
leaderboard_data = pd.DataFrame(
|
| 572 |
{
|
| 573 |
"Model": elo_result.scores.index,
|
| 574 |
+
"Website": website_values,
|
| 575 |
"Elo Score": elo_result.scores.values,
|
| 576 |
"Conversation Efficiency Index": cei_result.values,
|
| 577 |
"Model Consistency Score": mcs_result.values,
|
|
|
|
| 605 |
["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"]
|
| 606 |
]
|
| 607 |
|
|
|
|
|
|
|
| 608 |
# Save leaderboard data if this is a new vote
|
| 609 |
if vote_entry is not None:
|
| 610 |
try:
|
|
|
|
| 635 |
return gr.update(interactive=True) # Enable the button
|
| 636 |
|
| 637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
# Function to check initial authentication status
|
| 640 |
def check_auth_on_load(request: gr.Request):
|
|
|
|
| 685 |
|
| 686 |
|
| 687 |
# Gradio Interface
|
| 688 |
+
with gr.Blocks(title="SWE-Model-Arena", theme=gr.themes.Soft()) as app:
|
| 689 |
user_authenticated = gr.State(False)
|
| 690 |
models_state = gr.State({})
|
| 691 |
conversation_state = gr.State({})
|
|
|
|
| 695 |
|
| 696 |
with gr.Tab("🏆Leaderboard"):
|
| 697 |
# Add title and description as a Markdown component
|
| 698 |
+
gr.Markdown("# 🏆 FM4SE Leaderboard")
|
| 699 |
+
gr.Markdown(
|
| 700 |
+
"Community-Driven Evaluation of Top Foundation Models (FMs) in Software Engineering (SE) Tasks"
|
| 701 |
+
)
|
| 702 |
+
gr.Markdown(
|
| 703 |
+
"*The SWE-Model-Arena is an open-source platform designed to evaluate foundation models through human preference, "
|
| 704 |
+
"fostering transparency and collaboration. This platform aims to empower the SE community to assess and compare the "
|
| 705 |
+
"performance of leading FMs in related tasks. For technical details, check out our [paper](https://arxiv.org/abs/2502.01860).*"
|
| 706 |
)
|
| 707 |
+
|
| 708 |
# Initialize the leaderboard with the DataFrame containing the expected columns
|
| 709 |
leaderboard_component = Leaderboard(
|
| 710 |
value=get_leaderboard_data(use_cache=True),
|
| 711 |
select_columns=[
|
| 712 |
"Rank",
|
| 713 |
"Model",
|
| 714 |
+
"Website",
|
| 715 |
"Elo Score",
|
| 716 |
"Conversation Efficiency Index",
|
| 717 |
"Model Consistency Score",
|
| 718 |
],
|
| 719 |
+
search_columns=["Model", "Website"],
|
| 720 |
filter_columns=[
|
| 721 |
ColumnFilter(
|
| 722 |
"Elo Score",
|
|
|
|
| 786 |
datatype=[
|
| 787 |
"number",
|
| 788 |
"str",
|
| 789 |
+
"str",
|
| 790 |
"number",
|
| 791 |
"number",
|
| 792 |
"number",
|
|
|
|
| 797 |
"number",
|
| 798 |
],
|
| 799 |
)
|
| 800 |
+
|
| 801 |
+
# Add a divider
|
| 802 |
+
gr.Markdown("---")
|
| 803 |
+
|
| 804 |
# Add a citation block in Markdown
|
| 805 |
+
gr.Markdown(
|
| 806 |
"""
|
| 807 |
Made with ❤️ for SWE-Model-Arena. If this work is useful to you, please consider citing our vision paper:
|
| 808 |
```
|
|
|
|
| 819 |
)
|
| 820 |
with gr.Tab("⚔️Arena"):
|
| 821 |
# Add title and description as a Markdown component
|
| 822 |
+
gr.Markdown("# ⚔️ SWE-Model-Arena")
|
| 823 |
+
gr.Markdown("Explore and Test Top FMs with SE Tasks by Community Voting")
|
|
|
|
| 824 |
|
| 825 |
+
gr.Markdown("### 📜 How It Works")
|
| 826 |
+
gr.Markdown(
|
| 827 |
+
f"""
|
| 828 |
- **Blind Comparison**: Submit a SE-related query to two anonymous FMs randomly selected from up to {len(available_models)} top models from OpenAI, Gemini, Grok, Claude, Deepseek, Qwen, Llama, Mistral, and others.
|
| 829 |
- **Interactive Voting**: Engage in multi-turn dialogues with both FMs and compare their responses. You can continue the conversation until you confidently choose the better model.
|
| 830 |
- **Fair Play Rules**: Votes are counted only if FM identities remain anonymous. Revealing a FM's identity disqualifies the session.
|
| 831 |
+
"""
|
|
|
|
|
|
|
|
|
|
| 832 |
)
|
| 833 |
+
gr.Markdown(f"*Note: Due to budget constraints, responses that take longer than {TIMEOUT} seconds to generate will be discarded.*")
|
| 834 |
+
|
| 835 |
+
# Add a divider
|
| 836 |
+
gr.Markdown("---")
|
| 837 |
+
|
| 838 |
# Add Hugging Face Sign In button and message
|
| 839 |
with gr.Row():
|
| 840 |
# Define the markdown text with or without the hint string
|
| 841 |
+
markdown_text = "### Please sign in first to vote!"
|
| 842 |
if SHOW_HINT_STRING:
|
| 843 |
+
markdown_text += f"\n*{HINT_STRING}*"
|
| 844 |
+
hint_markdown = gr.Markdown(markdown_text)
|
| 845 |
with gr.Column():
|
| 846 |
login_button = gr.LoginButton(
|
| 847 |
"Sign in with Hugging Face", elem_id="oauth-button"
|
|
|
|
| 1510 |
],
|
| 1511 |
)
|
| 1512 |
|
| 1513 |
+
# Add a divider
|
| 1514 |
+
gr.Markdown("---")
|
| 1515 |
+
|
| 1516 |
# Add Terms of Service at the bottom
|
| 1517 |
+
gr.Markdown("### Terms of Service")
|
| 1518 |
+
gr.Markdown(
|
| 1519 |
"""
|
| 1520 |
+
*Users are required to agree to the following terms before using the service:*
|
|
|
|
|
|
|
| 1521 |
|
| 1522 |
- The service is a **research preview**. It only provides limited safety measures and may generate offensive content.
|
| 1523 |
- It must not be used for any **illegal, harmful, violent, racist, or sexual** purposes.
|