Spaces:

flowaicom
/

Flow-Judge-v0.1

Runtime error

App Files Files Community

bergr7f committed on Oct 13, 2024

Commit

06d4d61

1 Parent(s): 06826f1

upgrade UI and functionality

Browse files

Files changed (1) hide show

app.py +181 -154

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import gradio as gr
 import spaces
 import pandas as pd
-from typing import List, Dict
 from flow_judge import Hf, FlowJudge, EvalInput
 from flow_judge.metrics import CustomMetric, RubricItem
 from huggingface_hub import snapshot_download
@@ -17,205 +17,232 @@ def download_model():
         return True
     except Exception as e:
         raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}")
-EXAMPLES = [
-    {
-        "example_description": "Faithfulness of a answer",
-        "emoji": "🏈",
-        "task_inputs": [{"name": "Question", "value": "What is the capital of France?"}, {"name": "Context", "value": "Paris is the capital of Spain."}],
-        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
-        "evaluation_criteria": "Based on the provided context, does the response contain only information that is supported by or \
-directly inferable from the context?",
-        "rubric": ['The response contains statements or claims that cannot be directly found in or logically inferred \
-from the provided context. There is hallucinated or fabricated information present in the response \
-that does not have support in the given context.', 'The response contains only statements and claims that are directly stated in or logically \
-inferable from the provided context. There is no hallucinated or fabricated information present in \
-the response that cannot be traced back to or deduced from the context.']
-    }
-]
-def populate_fields(example_index: int):
-    example = EXAMPLES[example_index]
-    return (
-        [[input["name"], input["value"]] for input in example["task_inputs"]],
-        [[example["task_output"]["name"], example["task_output"]["value"]]],
-        example["evaluation_criteria"],
-        [[str(i), description] for i, description in enumerate(example["rubric"])]
-    )
 @spaces.GPU
-def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
-    # Load the model
     try:
         model = Hf(flash_attn=False)
     except Exception as e:
         raise RuntimeError(f"Failed to initialize Hf Model: {e}")
-    # Convert inputs to the expected format
     eval_input = EvalInput(
-        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
-        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
     )
-    # Parse the rubric into RubricItems
-    rubric_items = [
-        RubricItem(score=int(row['Score']), description=row['Description'])
-        for _, row in rubric.iterrows()
     ]
-    # Create the CustomMetric
     custom_metric = CustomMetric(
         name="custom-metric",
         criteria=evaluation_criteria,
-        rubric=rubric_items,
-        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
-        required_output=task_output.iloc[0]['Name']
     )
-    # Create a FlowJudge instance
     judge = FlowJudge(model=model, metric=custom_metric)
-    # Evaluate using FlowJudge
     try:
         result = judge.evaluate(eval_input)
     except Exception as e:
         raise RuntimeError(f"Failed to evaluate: {e}")
-    # Extract feedback and score from the result
-    feedback = result.feedback
-    score = result.score
-    return feedback, score
-def reset_fields():
-    return (
-        [["", ""]],  # task_inputs
-        [["", ""]],  # task_output
-        "",          # evaluation_criteria
-        [["", ""]],  # rubric
-        "",          # feedback
-        ""           # score
-    )
-def reset_task():
     return (
-        [["", ""]],  # task_inputs
-        [["", ""]]   # task_output
     )
-def reset_evaluation_criteria():
-    return (
-        "",          # evaluation_criteria
-        [["", ""]]   # rubric
-    )
 with gr.Blocks() as demo:
     model_downloaded = download_model()
-    with gr.Row():
-        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
     with gr.Row(equal_height=False):
         with gr.Column(scale=1):
             gr.Markdown("**Inputs**")
-            task_inputs = gr.Dataframe(
-                headers=["Name", "Value"],
-                col_count=(2, "fixed"),
-                datatype=["str", "str"],
-                row_count=1,
-                column_widths=["30%", "70%"]
             )
-            add_input_btn = gr.Button("Add Input")
             gr.Markdown("**Output**")
-            task_output = gr.Dataframe(
-                headers=["Name", "Value"],
-                col_count=(2, "fixed"),
-                datatype=["str", "str"],
-                row_count=1,
-                column_widths=["30%", "70%"]
-            )
-            reset_task_btn = gr.Button("Clear Inputs and Output")
         with gr.Column(scale=1):
             gr.Markdown("**Evaluation criteria and rubric**")
             evaluation_criteria = gr.Textbox(label="Evaluation criteria")
-            rubric = gr.Dataframe(
-                headers=["Score", "Description"],
-                col_count=(2, "fixed"),
-                datatype=["str", "str"],
-                row_count=1,
-                column_widths=["10%", "90%"]
             )
-            add_score_btn = gr.Button("Add Score")
-            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")
     with gr.Row():
-        with gr.Column(scale=1, variant="compact"):
             gr.Markdown("**Evaluation**")
-            feedback = gr.Textbox(label="Feedback")
-            score = gr.Textbox(label="Score")
-            evaluate_btn = gr.Button("Evaluate")
-    with gr.Row():
-        # Add the reset buttons
-        reset_all_btn = gr.Button("Clear All")
-    # Event handlers
-    add_input_btn.click(
-        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
-                                headers=["Name", "Value"],
-                                col_count=(2, "fixed"),
-                                datatype=["str", "str"],
-                                row_count=1,
-                                column_widths=["30%", "70%"]),
-        inputs=task_inputs,
-        outputs=task_inputs
-    )
-    add_score_btn.click(
-        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
-                                headers=["Score", "Description"],
-                                col_count=(2, "fixed"),
-                                datatype=["str", "str"],
-                                row_count=1,
-                                column_widths=["10%", "90%"]),
-        inputs=rubric,
-        outputs=rubric
-    )
-    for i, button in enumerate(example_buttons):
-        button.click(
-            populate_fields,
-            inputs=[gr.State(i)],  # Pass the example index as a state
-            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
-        )
     evaluate_btn.click(
         evaluate,
-        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
         outputs=[feedback, score]
     )
-    reset_task_btn.click(
-        reset_task,
-        inputs=[],
-        outputs=[task_inputs, task_output]
-    )
-    reset_criteria_btn.click(
-        reset_evaluation_criteria,
-        inputs=[],
-        outputs=[evaluation_criteria, rubric]
-    )
-    reset_all_btn.click(
-        reset_fields,
-        inputs=[],
-        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
     )
-if __name__ == "__main__":
-    demo.launch(debug=True)

 import gradio as gr
 import spaces
 import pandas as pd
+from typing import List, Dict, Tuple
 from flow_judge import Hf, FlowJudge, EvalInput
 from flow_judge.metrics import CustomMetric, RubricItem
 from huggingface_hub import snapshot_download
         return True
     except Exception as e:
         raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}")
 @spaces.GPU
+def evaluate(
+    inputs_task: List[Dict[str, str]],
+    output_name: str,
+    output_value: str,
+    evaluation_criteria: str,
+    rubric_items: List[Dict[str, str]]
+) -> Tuple[str, int]:
+    # [{'name': 'a', 'value': 'a'}]
     try:
         model = Hf(flash_attn=False)
     except Exception as e:
         raise RuntimeError(f"Failed to initialize Hf Model: {e}")
     eval_input = EvalInput(
+        inputs=[{input['name']: input['value']} for input in inputs_task],
+        output={output_name: output_value}
     )
+    score_rubric_items = [
+        RubricItem(
+            score=int(rubric_item['name']),
+            description=rubric_item['value']
+        )
+        for rubric_item in rubric_items
     ]
     custom_metric = CustomMetric(
         name="custom-metric",
         criteria=evaluation_criteria,
+        rubric=score_rubric_items,
+        required_inputs=[input['name'] for input in inputs_task],
+        required_output=output_name
     )
     judge = FlowJudge(model=model, metric=custom_metric)
     try:
         result = judge.evaluate(eval_input)
     except Exception as e:
         raise RuntimeError(f"Failed to evaluate: {e}")
+    return result.feedback, result.score
+def reset_all():  # Blank value for each "Clear All" output, in the order of reset_all_btn.click's outputs list
     return (
+        [], "", "", [], "", "",  # inputs_task, new_input_name, new_input_value, rubric_items, new_rubric_name, new_rubric_value
+        "", "", "", "", ""      # evaluation_criteria, output_name, output_value, feedback, score
     )
+# Presets shown as one-click buttons; "inputs_task"/"rubric" hold {"name", "value"} dicts, and a rubric "name" is the score as a string (evaluate() parses it with int()).
+EXAMPLES = [
+    {
+        "description": "Example 1: Basic Evaluation",
+        "inputs_task": [{"name": "Question", "value": "What is the capital of France?"}],
+        "output": {"name": "Answer", "value": "The capital of France is Paris."},
+        "evaluation_criteria": "Ensure the answer is accurate and based on the input question.",
+        "rubric": [
+            {"name": "1", "value": "Incorrect answer."},
+            {"name": "2", "value": "Partially correct answer."},
+            {"name": "3", "value": "Completely correct answer."}
+        ]
+    },
+    {
+        "description": "Example 2: Contextual Understanding",
+        "inputs_task": [
+            {"name": "Statement", "value": "All swans are white."}
+        ],
+        "output": {"name": "Conclusion", "value": "There are no black swans."},
+        "evaluation_criteria": "Verify the conclusion logically follows from the statement.",
+        "rubric": [
+            {"name": "1", "value": "Conclusion does not follow."},
+            {"name": "2", "value": "Conclusion somewhat follows."},
+            {"name": "3", "value": "Conclusion logically follows."}
+        ]
+    }
+]
 with gr.Blocks() as demo:
     model_downloaded = download_model()
+    # with gr.Row():
+    #     example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
     with gr.Row(equal_height=False):
         with gr.Column(scale=1):
             gr.Markdown("**Inputs**")
+            inputs_task = gr.State([])
+            new_input_name = gr.Textbox(label="Name")
+            new_input_value = gr.Textbox(label="Value")
+            def add_input(inputs_task, new_input_name, new_input_value):
+                return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""
+            @gr.render(inputs=inputs_task) # You have to pass the state here
+            def render_inputs(inputs_list): # Use different name than the state variable
+                for input in inputs_list:
+                    with gr.Group():
+                        with gr.Row(equal_height=True):
+                            with gr.Column(min_width=60, scale=2):
+                                gr.Textbox(input['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
+                            with gr.Column(scale=8):
+                                gr.Textbox(input['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
+                            with gr.Column(min_width=15, scale=1):
+                                delete_btn = gr.Button("X", size="lg", variant="secondary")
+                                def delete(input=input):
+                                    inputs_list.remove(input)
+                                    return inputs_list
+                            delete_btn.click(delete, None, [inputs_task]) # This is the state variable
+            gr.Button("Add Input").click(
+                add_input,
+                [inputs_task, new_input_name, new_input_value],
+                [inputs_task, new_input_name, new_input_value]
             )
             gr.Markdown("**Output**")
+            with gr.Group():
+                with gr.Row(equal_height=True):
+                    with gr.Column(min_width=60, scale=1):
+                        output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
+                    with gr.Column(scale=6):
+                        output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)
         with gr.Column(scale=1):
             gr.Markdown("**Evaluation criteria and rubric**")
             evaluation_criteria = gr.Textbox(label="Evaluation criteria")
+            gr.Markdown("**Score rubrics**")
+            rubric_items = gr.State([])
+            new_rubric_name = gr.Textbox(label="Score", show_label=True, interactive=True, autoscroll=False, max_lines=1)
+            new_rubric_value = gr.Textbox(label="Description", show_label=True, interactive=True, autoscroll=False, max_lines=3)
+            def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
+                return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""
+            @gr.render(inputs=rubric_items) # You have to pass the state here
+            def render_rubrics(rubric_items_list): # Use different name than the state variable
+                for rubric_item in rubric_items_list:
+                    with gr.Group():
+                        with gr.Row(equal_height=True):
+                            with gr.Column(min_width=30, scale=1):
+                                gr.Textbox(rubric_item['name'], label="Score", show_label=True, interactive=False)
+                            with gr.Column(scale=8):
+                                gr.Textbox(rubric_item['value'], label="Description", show_label=True, interactive=False)
+                            with gr.Column(min_width=15, scale=1):
+                                delete_btn = gr.Button("X", size="lg", variant="secondary")
+                                def delete(rubric_item=rubric_item):
+                                    rubric_items_list.remove(rubric_item)
+                                    return rubric_items_list
+                                delete_btn.click(delete, None, [rubric_items]) # This is the state variable
+            gr.Button("Add Rubric Item").click(
+                add_rubric_item,
+                [rubric_items, new_rubric_name, new_rubric_value],
+                [rubric_items, new_rubric_name, new_rubric_value]
             )
     with gr.Row():
+        with gr.Column(scale=1, variant="panel"):
             gr.Markdown("**Evaluation**")
+            with gr.Group():
+                with gr.Row(equal_height=True):
+                    with gr.Column(min_width=15, scale=1):
+                        score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
+                    with gr.Column(scale=5):
+                        feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
+                    with gr.Column(min_width=15, scale=1):
+                        evaluate_btn = gr.Button("Evaluate", variant="primary")
+                        reset_all_btn = gr.Button("Clear All", variant="stop")  # Add Reset All button
+                        reset_all_btn.click(
+                            reset_all,
+                            inputs=[],
+                            outputs=[
+                                inputs_task,
+                                new_input_name,
+                                new_input_value,
+                                rubric_items,
+                                new_rubric_name,
+                                new_rubric_value,
+                                evaluation_criteria,    # Reset evaluation criteria
+                                output_name,            # Reset output name
+                                output_value,           # Reset output value
+                                feedback,               # Reset feedback
+                                score                   # Reset score
+                            ]
+                        )
     evaluate_btn.click(
         evaluate,
+        inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
         outputs=[feedback, score]
     )
+    preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES]
+    for i, button in enumerate(preset_buttons):
+        def populate_preset(ex_i=i):
+            return populate_fields(ex_i)
+        button.click(
+            populate_preset,  # Use the closure to pass the current index
+            inputs=[],  # No direct inputs needed
+            outputs=[
+                inputs_task,
+                output_name,
+                output_value,
+                evaluation_criteria,
+                rubric_items
+            ]
+        )
+def populate_fields(example_index: int):
+    example = EXAMPLES[example_index]
+    return (
+        example["inputs_task"],
+        example["output"]["name"],
+        example["output"]["value"],
+        example["evaluation_criteria"],
+        example["rubric"]
     )
+demo.launch(debug=True)  # Module-level call with no __main__ guard: runs on import as well as when executed as a script.