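# Gradio demo for FlowJudge: evaluate a model response against user-defined
# evaluation criteria and a scoring rubric, using the flow_judge library's
# Hf model backend as the judge.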
import gradio as gr
import spaces  # Hugging Face `spaces` package, kept for Spaces/ZeroGPU compatibility
import pandas as pd
from flow_judge import Hf, FlowJudge, EvalInput
from flow_judge.metrics import CustomMetric, RubricItem
def load_model():
    """Initialize the FlowJudge Hf model backend."""
    try:
        model = Hf(flash_attn=False)
        return model
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf model: {e}")
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only information "
            "that is supported by or directly inferable from the context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be directly found in or "
            "logically inferred from the provided context. There is hallucinated or fabricated "
            "information present in the response that does not have support in the given context.",
            "The response contains only statements and claims that are directly stated in or "
            "logically inferable from the provided context. There is no hallucinated or fabricated "
            "information present in the response that cannot be traced back to or deduced from the context.",
        ],
    }
]
def populate_fields(example_index: int):
    """Fill the UI fields from one of the predefined examples."""
    example = EXAMPLES[example_index]
    return (
        [[task_input["name"], task_input["value"]] for task_input in example["task_inputs"]],
        [[example["task_output"]["name"], example["task_output"]["value"]]],
        example["evaluation_criteria"],
        [[str(i), description] for i, description in enumerate(example["rubric"])],
    )
def evaluate(model, task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
    # Convert inputs to the expected format
    eval_input = EvalInput(
        inputs=[{row["Name"]: row["Value"]} for _, row in task_inputs.iterrows()],
        output={row["Name"]: row["Value"] for _, row in task_output.iterrows()},
    )
    # Parse the rubric into RubricItems
    rubric_items = [
        RubricItem(score=int(row["Score"]), description=row["Description"])
        for _, row in rubric.iterrows()
    ]
    # Create the CustomMetric
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row["Name"] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]["Name"],
    )
    # Create a FlowJudge instance
    judge = FlowJudge(model=model, metric=custom_metric)
    # Evaluate using FlowJudge
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}")
    # Extract feedback and score from the result
    feedback = result.feedback
    score = result.score
    return feedback, score
def reset_fields():
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
        "",          # evaluation_criteria
        [["", ""]],  # rubric
        "",          # feedback
        "",          # score
    )

def reset_task():
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
    )

def reset_evaluation_criteria():
    return (
        "",          # evaluation_criteria
        [["", ""]],  # rubric
    )
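# --- Gradio UI: example loader buttons, editable input/output and rubric tables,
# --- and the evaluation panel wired to the callbacks defined above.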
with gr.Blocks() as demo:
    model = load_model()

    with gr.Row():
        example_buttons = [
            gr.Button(f"{example['emoji']} Example {i + 1}")
            for i, example in enumerate(EXAMPLES)
        ]

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"],
            )
            add_input_btn = gr.Button("Add Input")
            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"],
            )
            reset_task_btn = gr.Button("Clear Inputs and Output")
        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"],
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")

    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")

    with gr.Row():
        # Reset button for all fields
        reset_all_btn = gr.Button("Clear All")
    # Event handlers
    add_input_btn.click(
        lambda df: gr.Dataframe(
            value=df.values.tolist() + [["", ""]],
            headers=["Name", "Value"],
            col_count=(2, "fixed"),
            datatype=["str", "str"],
            row_count=1,
            column_widths=["30%", "70%"],
        ),
        inputs=task_inputs,
        outputs=task_inputs,
    )

    add_score_btn.click(
        lambda df: gr.Dataframe(
            value=df.values.tolist() + [["", ""]],
            headers=["Score", "Description"],
            col_count=(2, "fixed"),
            datatype=["str", "str"],
            row_count=1,
            column_widths=["10%", "90%"],
        ),
        inputs=rubric,
        outputs=rubric,
    )

    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],  # Pass the example index as state
            outputs=[task_inputs, task_output, evaluation_criteria, rubric],
        )
    evaluate_btn.click(
        evaluate,
        # The loaded model is not a UI component, so pass it through gr.State
        inputs=[gr.State(model), task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score],
    )
    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output],
    )

    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric],
    )

    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score],
    )

if __name__ == "__main__":
    demo.launch(debug=True)
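# Note (assumption): to run this demo locally, install the dependencies first,
# e.g. `pip install gradio pandas flow-judge` (the PyPI name for the flow_judge
# package is assumed here), then start the app with `python app.py`.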