Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import spaces | |
| import pandas as pd | |
| from typing import List, Dict, Tuple | |
| from flow_judge import Hf, FlowJudge, EvalInput | |
| from flow_judge.metrics import CustomMetric, RubricItem | |
| from huggingface_hub import snapshot_download | |
| from flow_judge.models.huggingface import Hf | |
| from examples import get_examples | |
# Hugging Face Hub repository id of the judge model; download_model() passes
# it to snapshot_download() as the repo_id.
MODEL_NAME = "flowaicom/Flow-Judge-v0.1"
def download_model() -> bool:
    """Download the ``MODEL_NAME`` snapshot into the default Hugging Face cache.

    Returns:
        True once ``snapshot_download`` has completed without raising.

    Raises:
        RuntimeError: If the download fails for any reason. The original
            exception is attached as ``__cause__`` so the root failure
            (network, auth, disk, ...) stays visible in the traceback.
    """
    try:
        print(f"Downloading model {MODEL_NAME}...")
        snapshot_download(repo_id=MODEL_NAME)
    except Exception as e:
        # Chain explicitly so the underlying error is reported as the cause.
        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e
    print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
    return True
# NOTE(review): `spaces` is imported at the top of the file but never used in
# the visible code. If this Space runs on ZeroGPU hardware this function
# presumably needs the `@spaces.GPU` decorator -- confirm against the
# deployment before adding it.
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]],
) -> Tuple[str, int]:
    """Score one task output with Flow Judge using a user-defined metric.

    Args:
        inputs_task: Task inputs as ``{"name": ..., "value": ...}`` dicts,
            e.g. ``[{'name': 'a', 'value': 'a'}]``. May be empty.
        output_name: Name of the output being judged.
        output_value: Text of the output being judged.
        evaluation_criteria: Criteria text passed to the custom metric.
        rubric_items: Rubric entries as ``{"name": ..., "value": ...}`` dicts
            where ``name`` is the score (must convert to ``int``) and
            ``value`` is its description.

    Returns:
        A ``(feedback, score)`` tuple taken from the judge's result.

    Raises:
        ValueError: If a rubric item's score is not an integer.
        RuntimeError: If the model cannot be initialized or the evaluation
            itself fails; the underlying exception is chained as the cause.
    """
    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    # Each input becomes its own single-key dict: [{name: value}, ...].
    eval_input = EvalInput(
        inputs=[{item["name"]: item["value"]} for item in inputs_task],
        output={output_name: output_value},
    )

    score_rubric_items = []
    for rubric_item in rubric_items:
        raw_score = rubric_item["name"]
        try:
            score = int(raw_score)
        except ValueError as e:
            # Scores come from a free-text box; say which one was wrong
            # instead of surfacing a bare int() parsing error.
            raise ValueError(
                f"Rubric score must be an integer, got {raw_score!r}"
            ) from e
        score_rubric_items.append(
            RubricItem(score=score, description=rubric_item["value"])
        )

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[item["name"] for item in inputs_task],
        required_output=output_name,
    )

    judge = FlowJudge(model=model, metric=custom_metric)
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score
def reset_all():
    """Build the values that put the whole form back into its blank state.

    The tuple has 17 entries, one per output wired to the "Clear All"
    button, in that order: the two state lists and their entry boxes,
    the criteria/output/feedback/score text fields, and finally six
    visibility updates that show the entry boxes and the two "Add" buttons.
    """
    # Inputs state + its two entry boxes, then rubric state + its two boxes.
    list_and_entry_resets = ([], "", "", [], "", "")
    # Remaining five text fields are simply emptied.
    text_field_resets = ("",) * 5
    # Entry boxes and "Add" buttons are made visible again.
    show_controls = tuple(gr.update(visible=True) for _ in range(6))
    return list_and_entry_resets + text_field_resets + show_controls
# Define presets
# Preset examples come from the local `examples` module; the UI creates one
# quickstart button per entry and reads its fields when the button is clicked.
EXAMPLES = get_examples()
# Banner image passed to gr.Image at the top of the page.
IMAGE_PATH = "./img/flow_judge_banner.png"
# Static HTML rendered next to the banner: title, project links and tagline.
HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monospace;">Flow Judge Demo</h1>
<p align="center" style="font-family: 'Courier New', Courier, monospace;">
<strong>
<a href="https://www.flow-ai.com/judge">Technical Report</a> |
<a href="https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec">Model Weights</a> |
<a href="https://github.com/flowaicom/lm-evaluation-harness/tree/Flow-Judge-v0.1_evals/lm_eval/tasks/flow_judge_evals">Evaluation Code</a> |
<a href="https://github.com/flowaicom/flow-judge/tree/main/examples">Tutorials</a>
</strong>
</p>
<p align="center" style="font-family: 'Courier New', Courier, monospace;">
<code>flow-judge</code> is a lightweight library for evaluating LLM applications with <code>Flow-Judge-v0.1</code>.
</p>"""
# Build the demo UI. Layout nesting below was reconstructed from the code's
# syntax -- NOTE(review): confirm the intended placement of the
# "criteria and rubric" section relative to the inputs/output row.
with gr.Blocks() as demo:
    # The model snapshot is fetched while the UI is being built, i.e. before
    # the app is launched.
    model_downloaded = download_model()

    # --- Banner and header -------------------------------------------------
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Image(
                IMAGE_PATH,
                show_label=False,
                interactive=False,
                show_share_button=False,
                show_fullscreen_button=False,
                show_download_button=False,
            )
        with gr.Column(scale=3):
            gr.HTML(HEADER)

    # --- Quickstart presets, split across three columns ---------------------
    gr.Markdown("# ⚡ **Quickstart Examples**")
    with gr.Row():
        with gr.Column(scale=1):
            preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//3]]
        with gr.Column(scale=1):
            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//3:2*len(EXAMPLES)//3]]
        with gr.Column(scale=1):
            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[2*len(EXAMPLES)//3:]]

    # --- Task inputs and task output ----------------------------------------
    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task inputs**")
            gr.Markdown("*<span style='color: gray;'>Define the input names and values. Inputs are optional if evaluation depends on the output only.</span>*")
            with gr.Group():
                # List of {"name": ..., "value": ...} dicts, one per added input.
                inputs_task = gr.State([])
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        new_input_name = gr.Textbox(
                            label="Name",
                            show_label=True,
                            autoscroll=False,
                            max_lines=1,
                            visible=True  # Initially visible
                        )
                    with gr.Column(scale=9):
                        new_input_value = gr.Textbox(
                            label="Value",
                            show_label=True,
                            autoscroll=False,
                            max_lines=3,
                            visible=True  # Initially visible
                        )

            def add_input(inputs_task, new_input_name, new_input_value):
                """Append the new input to the state and clear both entry boxes."""
                return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""

            # You have to pass the state here: without the render decorator
            # this function is never invoked and added inputs never appear.
            @gr.render(inputs=inputs_task)
            def render_inputs(inputs_list):  # Use different name than the state variable
                """Draw one read-only row (plus delete button) per stored input."""
                for task_input in inputs_list:
                    with gr.Group():
                        with gr.Row(equal_height=True):
                            with gr.Column(min_width=60, scale=2):
                                gr.Textbox(task_input['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
                            with gr.Column(scale=8):
                                gr.Textbox(task_input['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
                            with gr.Column(min_width=15, scale=1):
                                delete_btn = gr.Button("X", size="lg", variant="secondary")

                                # Default argument pins the row this button belongs to.
                                def delete(target=task_input):
                                    # Work on a copy: the state list may be the very
                                    # list object stored in a preset example.
                                    remaining = list(inputs_list)
                                    remaining.remove(target)
                                    return remaining

                                delete_btn.click(delete, None, [inputs_task])  # This is the state variable

            with gr.Group():
                add_input_btn = gr.Button("Add Input")  # Assign to variable
                add_input_btn.click(
                    add_input,
                    [inputs_task, new_input_name, new_input_value],
                    [inputs_task, new_input_name, new_input_value]
                )

        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task output**")
            gr.Markdown("*<span style='color: gray;'>Define the output name and value. Output is always required.</span>*")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
                    with gr.Column(scale=9):
                        output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)

    # --- Evaluation criteria and rubric --------------------------------------
    with gr.Column(scale=1):
        gr.Markdown("## **Evaluation criteria and rubric**")
        gr.Markdown("*<span style='color: gray;'>Define the evaluation criteria and rubric for the evaluation task. Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.</span>*\n\n*<span style='color: gray;'>❗You can experiment with other scoring scales. However, performance may vary.</span>*")
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Group():
                    # List of {"name": <score>, "value": <description>} dicts.
                    rubric_items = gr.State([])
                    with gr.Row(equal_height=True):
                        with gr.Column(min_width=60, scale=2):
                            new_rubric_name = gr.Textbox(
                                label="Score",
                                show_label=True,
                                interactive=True,
                                autoscroll=False,
                                max_lines=1,
                                visible=True  # Initially visible
                            )
                        with gr.Column(scale=9):
                            new_rubric_value = gr.Textbox(
                                label="Description",
                                show_label=True,
                                interactive=True,
                                autoscroll=False,
                                max_lines=3,
                                visible=True  # Initially visible
                            )

                def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
                    """Append the new rubric entry to the state and clear both entry boxes."""
                    return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""

                # You have to pass the state here: without the render decorator
                # this function is never invoked and rubric items never appear.
                @gr.render(inputs=rubric_items)
                def render_rubrics(rubric_items_list):  # Use different name than the state variable
                    """Draw one read-only row (plus delete button) per rubric item."""
                    for rubric_item in rubric_items_list:
                        with gr.Group():
                            with gr.Row(equal_height=True):
                                with gr.Column(min_width=60, scale=2):
                                    gr.Textbox(
                                        rubric_item['name'],
                                        label="Score",
                                        show_label=True,
                                        interactive=False
                                    )
                                with gr.Column(scale=8):
                                    gr.Textbox(
                                        rubric_item['value'],
                                        label="Description",
                                        show_label=True,
                                        interactive=False
                                    )
                                with gr.Column(min_width=15, scale=1):
                                    delete_btn = gr.Button("X", size="lg", variant="secondary")

                                    # Default argument pins the row this button belongs to.
                                    def delete(target=rubric_item):
                                        # Copy first so preset data is never edited in place.
                                        remaining = list(rubric_items_list)
                                        remaining.remove(target)
                                        return remaining

                                    delete_btn.click(delete, None, [rubric_items])  # This is the state variable

                with gr.Group():
                    add_rubric_btn = gr.Button("Add Rubric Item")  # Assign to variable
                    add_rubric_btn.click(
                        add_rubric_item,
                        [rubric_items, new_rubric_name, new_rubric_value],
                        [rubric_items, new_rubric_name, new_rubric_value]
                    )
            with gr.Column(scale=1):
                evaluation_criteria = gr.Textbox(label="Evaluation criteria")

    # --- Result panel ---------------------------------------------------------
    with gr.Row():
        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("# **Evaluation**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=1):
                        score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
                    with gr.Column(scale=9):
                        feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
                    with gr.Column(min_width=15, scale=1):
                        evaluate_btn = gr.Button("Evaluate", variant="primary")
                        reset_all_btn = gr.Button("Clear All", variant="stop")  # Add Reset All button

    # --- Event wiring -----------------------------------------------------------
    # Output order must match the tuple returned by reset_all().
    reset_all_btn.click(
        reset_all,
        inputs=[],
        outputs=[
            inputs_task,
            new_input_name,
            new_input_value,
            rubric_items,
            new_rubric_name,
            new_rubric_value,
            evaluation_criteria,
            output_name,
            output_value,
            feedback,
            score,
            new_input_name,  # Visibility for new_input_name
            new_input_value,  # Visibility for new_input_value
            new_rubric_name,  # Visibility for new_rubric_name
            new_rubric_value,  # Visibility for new_rubric_value
            add_input_btn,  # Visibility for Add Input button
            add_rubric_btn,  # Visibility for Add Rubric Item button
        ]
    )

    evaluate_btn.click(
        evaluate,
        inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
        outputs=[feedback, score]
    )

    def populate_fields(example_index: int):
        """Return the field values for preset ``example_index``.

        Fills the form from ``EXAMPLES[example_index]``, clears the previous
        feedback/score and hides the manual entry boxes and "Add" buttons.
        The order matches the outputs wired to each preset button.
        """
        example = EXAMPLES[example_index]
        return (
            # Hand out copies so session state never aliases the preset lists.
            list(example["inputs_task"]),
            example["output"]["name"],
            example["output"]["value"],
            example["evaluation_criteria"],
            list(example["rubric"]),
            "",  # Reset feedback
            "",  # Reset score
            gr.update(visible=False),  # Hide new_input_name
            gr.update(visible=False),  # Hide new_input_value
            gr.update(visible=False),  # Hide new_rubric_name
            gr.update(visible=False),  # Hide new_rubric_value
            gr.update(visible=False),  # Hide Add Input button
            gr.update(visible=False),  # Hide Add Rubric Item button
        )

    for i, button in enumerate(preset_buttons):
        # Bind the index as a default so every button keeps its own example.
        def populate_preset(ex_i=i):
            return populate_fields(ex_i)

        button.click(
            populate_preset,
            inputs=[],
            outputs=[
                inputs_task,
                output_name,
                output_value,
                evaluation_criteria,
                rubric_items,
                feedback,
                score,
                new_input_name,  # Visibility for new_input_name
                new_input_value,  # Visibility for new_input_value
                new_rubric_name,  # Visibility for new_rubric_name
                new_rubric_value,  # Visibility for new_rubric_value
                add_input_btn,  # Visibility for Add Input button
                add_rubric_btn,  # Visibility for Add Rubric Item button
            ]
        )

demo.launch(debug=True)