zhiminy committed · Commit 2f04967 · 1 Parent(s): f8ff61a

Files changed (2):
  1. README.md +3 -4
  2. app.py +43 -69
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: SWE-Model-Arena
 emoji: 🎯
-colorFrom: blue
-colorTo: purple
+colorFrom: green
+colorTo: red
 sdk: gradio
-sdk_version: 5.49.1
+sdk_version: 5.50.0
 app_file: app.py
 hf_oauth: true
 pinned: false
@@ -48,7 +48,6 @@ Existing evaluation frameworks (e.g. [LMArena](https://lmarena.ai)) often don't
 ### Prerequisites
 
 - A [Hugging Face](https://huggingface.co) account
-- Basic understanding of software engineering workflows
 
 ### Usage
 
app.py CHANGED
@@ -423,6 +423,7 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
         columns=[
             "Rank",
             "Model",
+            "Website",
             "Elo Score",
             "Conversation Efficiency Index",
             "Model Consistency Score",
@@ -564,9 +565,13 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
     mcs_result = pd.Series(mcs_result)
 
     # Combine all results into a single DataFrame
+    # Add Website column by mapping model names to their links
+    website_values = [model_links.get(model, "N/A") for model in elo_result.scores.index]
+
     leaderboard_data = pd.DataFrame(
         {
             "Model": elo_result.scores.index,
+            "Website": website_values,
             "Elo Score": elo_result.scores.values,
             "Conversation Efficiency Index": cei_result.values,
             "Model Consistency Score": mcs_result.values,
@@ -600,8 +605,6 @@ def get_leaderboard_data(vote_entry=None, use_cache=True):
         ["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"]
     ]
 
-    # Note: Links are made clickable via JavaScript, keeping plain model names here
-
     # Save leaderboard data if this is a new vote
     if vote_entry is not None:
         try:
@@ -632,49 +635,6 @@ def toggle_submit_button(text):
     return gr.update(interactive=True)  # Enable the button
 
 
-# JavaScript to make model name links clickable in the leaderboard
-# Uses event delegation on the table container for better stability
-clickable_links_js = (
-    """
-    function() {
-        const modelLinks = """
-    + json.dumps(model_links)
-    + """;
-
-        function makeLinksClickable() {
-            // Find all table cells in the Model column (2nd column)
-            const rows = document.querySelectorAll('table tbody tr');
-            rows.forEach(row => {
-                const cells = row.querySelectorAll('td');
-                if (cells.length >= 2) {
-                    const modelCell = cells[1]; // Model is the 2nd column
-                    const cellText = modelCell.textContent.trim();
-
-                    // Check if this cell contains a model name and hasn't been converted yet
-                    if (modelLinks[cellText] && !modelCell.querySelector('a')) {
-                        // Create clickable link
-                        const link = document.createElement('a');
-                        link.href = modelLinks[cellText];
-                        link.textContent = cellText;
-                        link.target = '_blank';
-                        link.style.color = '#2563eb';
-                        link.style.textDecoration = 'underline';
-                        link.style.fontWeight = '500';
-
-                        // Replace text with link
-                        modelCell.innerHTML = '';
-                        modelCell.appendChild(link);
-                    }
-                }
-            });
-        }
-
-        // Continuous polling approach - runs every 300ms
-        // This is the ONLY reliable way with virtual scrolling components
-        setInterval(makeLinksClickable, 300);
-    }
-    """
-)
 
 # Function to check initial authentication status
 def check_auth_on_load(request: gr.Request):
@@ -725,7 +685,7 @@ def check_auth_on_load(request: gr.Request):
 
 
 # Gradio Interface
-with gr.Blocks(js=clickable_links_js) as app:
+with gr.Blocks(title="SWE-Model-Arena", theme=gr.themes.Soft()) as app:
     user_authenticated = gr.State(False)
     models_state = gr.State({})
    conversation_state = gr.State({})
@@ -735,24 +695,28 @@ with gr.Blocks(js=clickable_links_js) as app:
 
     with gr.Tab("🏆Leaderboard"):
         # Add title and description as a Markdown component
-        leaderboard_intro = gr.Markdown(
-            """
-            # 🏆 FM4SE Leaderboard: Community-Driven Evaluation of Top Foundation Models (FMs) in Software Engineering (SE) Tasks
-            The SWE-Model-Arena is an open-source platform designed to evaluate foundation models through human preference, fostering transparency and collaboration. This platform aims to empower the SE community to assess and compare the performance of leading FMs in related tasks. For technical details, check out our [paper](https://arxiv.org/abs/2502.01860).
-            """,
-            elem_classes="leaderboard-intro",
+        gr.Markdown("# 🏆 FM4SE Leaderboard")
+        gr.Markdown(
+            "Community-Driven Evaluation of Top Foundation Models (FMs) in Software Engineering (SE) Tasks"
+        )
+        gr.Markdown(
+            "*The SWE-Model-Arena is an open-source platform designed to evaluate foundation models through human preference, "
+            "fostering transparency and collaboration. This platform aims to empower the SE community to assess and compare the "
+            "performance of leading FMs in related tasks. For technical details, check out our [paper](https://arxiv.org/abs/2502.01860).*"
         )
+
        # Initialize the leaderboard with the DataFrame containing the expected columns
         leaderboard_component = Leaderboard(
             value=get_leaderboard_data(use_cache=True),
             select_columns=[
                 "Rank",
                 "Model",
+                "Website",
                 "Elo Score",
                 "Conversation Efficiency Index",
                 "Model Consistency Score",
             ],
-            search_columns=["Model"],
+            search_columns=["Model", "Website"],
             filter_columns=[
                 ColumnFilter(
                     "Elo Score",
@@ -822,6 +786,7 @@ with gr.Blocks(js=clickable_links_js) as app:
             datatype=[
                 "number",
                 "str",
+                "str",
                 "number",
                 "number",
                 "number",
@@ -832,8 +797,12 @@ with gr.Blocks(js=clickable_links_js) as app:
                 "number",
             ],
         )
+
+        # Add a divider
+        gr.Markdown("---")
+
         # Add a citation block in Markdown
-        citation_component = gr.Markdown(
+        gr.Markdown(
             """
             Made with ❤️ for SWE-Model-Arena. If this work is useful to you, please consider citing our vision paper:
             ```
@@ -850,26 +819,29 @@ with gr.Blocks(js=clickable_links_js) as app:
         )
     with gr.Tab("⚔️Arena"):
         # Add title and description as a Markdown component
-        arena_intro = gr.Markdown(
-            f"""
-            # ⚔️ SWE-Model-Arena: Explore and Test Top FMs with SE Tasks by Community Voting
+        gr.Markdown("# ⚔️ SWE-Model-Arena")
+        gr.Markdown("Explore and Test Top FMs with SE Tasks by Community Voting")
 
-            ## 📜How It Works
+        gr.Markdown("### 📜 How It Works")
+        gr.Markdown(
+            f"""
             - **Blind Comparison**: Submit a SE-related query to two anonymous FMs randomly selected from up to {len(available_models)} top models from OpenAI, Gemini, Grok, Claude, Deepseek, Qwen, Llama, Mistral, and others.
             - **Interactive Voting**: Engage in multi-turn dialogues with both FMs and compare their responses. You can continue the conversation until you confidently choose the better model.
             - **Fair Play Rules**: Votes are counted only if FM identities remain anonymous. Revealing a FM's identity disqualifies the session.
-
-            **Note:** Due to budget constraints, responses that take longer than {TIMEOUT} seconds to generate will be discarded.
-            """,
-            elem_classes="arena-intro",
+            """
         )
+        gr.Markdown(f"*Note: Due to budget constraints, responses that take longer than {TIMEOUT} seconds to generate will be discarded.*")
+
+        # Add a divider
+        gr.Markdown("---")
+
         # Add Hugging Face Sign In button and message
         with gr.Row():
             # Define the markdown text with or without the hint string
-            markdown_text = "## Please sign in first to vote!"
+            markdown_text = "### Please sign in first to vote!"
             if SHOW_HINT_STRING:
-                markdown_text += f"\n{HINT_STRING}"
-            hint_markdown = gr.Markdown(markdown_text, elem_classes="markdown-text")
+                markdown_text += f"\n*{HINT_STRING}*"
+            hint_markdown = gr.Markdown(markdown_text)
             with gr.Column():
                 login_button = gr.LoginButton(
                     "Sign in with Hugging Face", elem_id="oauth-button"
@@ -1538,12 +1510,14 @@ with gr.Blocks(js=clickable_links_js) as app:
            ],
        )
 
+    # Add a divider
+    gr.Markdown("---")
+
     # Add Terms of Service at the bottom
-    terms_of_service = gr.Markdown(
+    gr.Markdown("### Terms of Service")
+    gr.Markdown(
         """
-        ## Terms of Service
-
-        Users are required to agree to the following terms before using the service:
+        *Users are required to agree to the following terms before using the service:*
 
         - The service is a **research preview**. It only provides limited safety measures and may generate offensive content.
         - It must not be used for any **illegal, harmful, violent, racist, or sexual** purposes.
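
For reference, a minimal standalone sketch (not the app's actual code) of the pattern this commit moves to: deriving a plain "Website" column from a model-name-to-URL mapping inside `get_leaderboard_data`, rather than injecting links with the polling JavaScript that is removed above. The model names, URLs, and scores below are placeholders, and computing "Rank" from the Elo score is an assumption made for illustration; only the `model_links.get(model, "N/A")` mapping and the "Rank"-first column ordering mirror the diff.

```python
import pandas as pd

# Hypothetical stand-in for the app's model_links dict (model name -> homepage URL).
model_links = {
    "model-a": "https://example.com/model-a",
    "model-b": "https://example.com/model-b",
}

# Placeholder Elo scores indexed by model name, like elo_result.scores in the app.
elo_scores = pd.Series({"model-a": 1210.0, "model-b": 1185.5, "model-c": 1102.3})

# Map each model to its website, falling back to "N/A" as in the commit.
website_values = [model_links.get(model, "N/A") for model in elo_scores.index]

leaderboard_data = pd.DataFrame(
    {
        "Model": elo_scores.index,
        "Website": website_values,
        "Elo Score": elo_scores.values,
    }
)

# Rank by Elo score (assumption) and move "Rank" to the front, matching the
# column order the Leaderboard component is configured with
# ("Rank", "Model", "Website", "Elo Score", ...).
leaderboard_data["Rank"] = (
    leaderboard_data["Elo Score"].rank(ascending=False, method="min").astype(int)
)
leaderboard_data = leaderboard_data[
    ["Rank"] + [col for col in leaderboard_data.columns if col != "Rank"]
]

print(leaderboard_data)
```

Because "Website" is an ordinary string column, it can be listed in `search_columns` and typed as `"str"` in `datatype`, which is what lets the commit drop the 300 ms `setInterval` link-injection script entirely.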