Spaces:

flowaicom
/

Flow-Judge-v0.1

Runtime error

App Files Files Community

Flow-Judge-v0.1 / app.py

bergr7f

feat: UI upgrades

aaa8f14 about 1 year ago

raw

history blame contribute delete

15.5 kB

	import gradio as gr
	import spaces
	import pandas as pd
	from typing import List, Dict, Tuple
	from flow_judge import Hf, FlowJudge, EvalInput
	from flow_judge.metrics import CustomMetric, RubricItem
	from huggingface_hub import snapshot_download
	from flow_judge.models.huggingface import Hf
	from examples import get_examples

	# Hugging Face Hub repo id of the judge model; passed to snapshot_download()
	# by download_model().
	MODEL_NAME = "flowaicom/Flow-Judge-v0.1"

	def download_model():
	    """Fetch the ``MODEL_NAME`` repository snapshot via ``snapshot_download``.

	    Returns:
	        True once the download call has completed.

	    Raises:
	        RuntimeError: If the download fails. The original exception is
	            chained as ``__cause__`` so its traceback is preserved.
	    """
	    print(f"Downloading model {MODEL_NAME}...")
	    # Keep the try body down to the single call that can actually fail.
	    try:
	        snapshot_download(repo_id=MODEL_NAME)
	    except Exception as e:
	        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e
	    print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
	    return True



	@spaces.GPU
	def evaluate(
	    inputs_task: List[Dict[str, str]],
	    output_name: str,
	    output_value: str,
	    evaluation_criteria: str,
	    rubric_items: List[Dict[str, str]]
	) -> Tuple[str, int]:
	    """Judge one task output against a custom rubric with Flow-Judge.

	    Args:
	        inputs_task: Task inputs, one ``{"name": ..., "value": ...}`` dict per
	            input, e.g. ``[{'name': 'a', 'value': 'a'}]``.
	        output_name: Name of the output being judged.
	        output_value: Text of the output being judged.
	        evaluation_criteria: Criteria text passed to the metric.
	        rubric_items: Rubric rows in the same name/value shape; ``name`` holds
	            the score and must parse as an integer, ``value`` its description.

	    Returns:
	        ``(feedback, score)`` taken from the judge's result.

	    Raises:
	        ValueError: If a rubric score is not an integer.
	        RuntimeError: If the model cannot be initialised or the evaluation
	            fails; the underlying exception is chained as ``__cause__``.
	    """
	    # Parse the rubric first so a malformed score is reported with a clear
	    # message before the model is loaded.
	    score_rubric_items = []
	    for rubric_item in rubric_items:
	        try:
	            score = int(rubric_item['name'])
	        except (TypeError, ValueError) as e:
	            raise ValueError(
	                f"Rubric score must be an integer, got {rubric_item['name']!r}"
	            ) from e
	        score_rubric_items.append(
	            RubricItem(score=score, description=rubric_item['value'])
	        )

	    try:
	        model = Hf(flash_attn=False)
	    except Exception as e:
	        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

	    # Each input becomes its own single-key {name: value} dict.
	    eval_input = EvalInput(
	        inputs=[{entry['name']: entry['value']} for entry in inputs_task],
	        output={output_name: output_value}
	    )

	    custom_metric = CustomMetric(
	        name="custom-metric",
	        criteria=evaluation_criteria,
	        rubric=score_rubric_items,
	        required_inputs=[entry['name'] for entry in inputs_task],
	        required_output=output_name
	    )

	    judge = FlowJudge(model=model, metric=custom_metric)

	    try:
	        result = judge.evaluate(eval_input)
	    except Exception as e:
	        raise RuntimeError(f"Failed to evaluate: {e}") from e

	    return result.feedback, result.score

	def reset_all():
	    """Build the 17 values that return the form to its blank state.

	    The tuple is laid out in the order of the ``reset_all_btn.click`` outputs:
	    six cleared values for the input and rubric lists with their entry
	    fields, five cleared text fields, then six visibility updates that show
	    the entry fields and the two "Add" buttons again.
	    """
	    # Inputs list + its two entry fields, rubric list + its two entry fields.
	    cleared_lists_and_entries = ([], "", "", [], "", "")
	    # The five remaining text fields.
	    cleared_text_fields = ("",) * 5
	    # Entry fields (4) and "Add" buttons (2) become visible again.
	    shown_again = tuple(gr.update(visible=True) for _ in range(6))
	    return cleared_lists_and_entries + cleared_text_fields + shown_again

	# Define presets
	# Each entry is read below through its "description", "inputs_task",
	# "output", "evaluation_criteria" and "rubric" keys.
	EXAMPLES = get_examples()

	# Banner image passed to gr.Image in the page header.
	IMAGE_PATH = "./img/flow_judge_banner.png"

	# HTML intro passed to gr.HTML in the page header. The link separators are
	# plain "|" characters: a backslash before "|" is not a valid Python escape
	# sequence and would also show up as a literal backslash in the page.
	HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monospace;">Flow Judge Demo</h1>

	<p align="center" style="font-family: 'Courier New', Courier, monospace;">
	<strong>
	<a href="https://www.flow-ai.com/judge">Technical Report</a> |
	<a href="https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec">Model Weights</a> |
	<a href="https://github.com/flowaicom/lm-evaluation-harness/tree/Flow-Judge-v0.1_evals/lm_eval/tasks/flow_judge_evals">Evaluation Code</a> |
	<a href="https://github.com/flowaicom/flow-judge/tree/main/examples">Tutorials</a>
	</strong>
	</p>

	<p align="center" style="font-family: 'Courier New', Courier, monospace;">
	<code>flow-judge</code> is a lightweight library for evaluating LLM applications with <code>Flow-Judge-v0.1</code>.
	</p>"""


	# Build the Gradio UI.
	# NOTE(review): the nesting of the `with` blocks below cannot be recovered
	# from this copy of the file (every line sits at the same indent) — restore
	# the indentation from the original app.py before running.
	with gr.Blocks() as demo:
	# Calls download_model() before the layout is declared; `model_downloaded`
	# is not read again in the visible code.
	model_downloaded = download_model()

	# Header: banner image and the HTML intro.
	with gr.Row(equal_height=False):
	with gr.Column(scale=2):
	gr.Image(IMAGE_PATH, show_label=False, interactive=False, show_share_button=False, show_fullscreen_button=False, show_download_button=False)
	with gr.Column(scale=3):
	gr.HTML(HEADER)
	# Quickstart presets: one button per EXAMPLES entry, split into thirds
	# across three columns. `preset_buttons` keeps them in EXAMPLES order.
	gr.Markdown("# ⚡ Quickstart Examples")
	with gr.Row():
	with gr.Column(scale=1):
	preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//3]]
	with gr.Column(scale=1):
	preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//3:2*len(EXAMPLES)//3]]
	with gr.Column(scale=1):
	preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[2*len(EXAMPLES)//3:]]

	# Section: evaluation task inputs.
	with gr.Row(equal_height=False):
	with gr.Column(scale=1):
	gr.Markdown("## Evaluation task inputs")
	gr.Markdown("<span style='color: gray;'>Define the input names and values. Inputs are optional if evaluation depends on the output only.</span>")
	with gr.Group():
	# State holding the list of {"name": ..., "value": ...} input entries
	# (starts empty).
	inputs_task = gr.State([])
	# Entry fields for the next input to add.
	with gr.Row(equal_height=True):
	with gr.Column(min_width=60, scale=2):
	new_input_name = gr.Textbox(
	label="Name",
	show_label=True,
	autoscroll=False,
	max_lines=1,
	visible=True # Initially visible
	)
	with gr.Column(scale=9):
	new_input_value = gr.Textbox(
	label="Value",
	show_label=True,
	autoscroll=False,
	max_lines=3,
	visible=True # Initially visible
	)

	def add_input(inputs_task, new_input_name, new_input_value):
	    """Append a name/value entry to the inputs and clear both entry fields.

	    Returns a new list (the incoming one is left untouched) followed by two
	    empty strings, matching the three outputs wired to the "Add Input" button.
	    """
	    new_entry = {"name": new_input_name, "value": new_input_value}
	    extended = inputs_task + [new_entry]
	    return extended, "", ""

	@gr.render(inputs=inputs_task)  # You have to pass the state here
	def render_inputs(inputs_list):  # Use different name than the state variable
	    """Render one read-only row (name, value, delete button) per input entry.

	    Args:
	        inputs_list: Current value of the ``inputs_task`` state, a list of
	            ``{"name": ..., "value": ...}`` dicts.
	    """
	    for entry in inputs_list:
	        with gr.Group():
	            with gr.Row(equal_height=True):
	                with gr.Column(min_width=60, scale=2):
	                    gr.Textbox(entry['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
	                with gr.Column(scale=8):
	                    gr.Textbox(entry['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
	                with gr.Column(min_width=15, scale=1):
	                    delete_btn = gr.Button("X", size="lg", variant="secondary")

	                    # `entry=entry` binds this row's dict at definition time
	                    # (avoids the late-binding closure pitfall).
	                    def delete(entry=entry):
	                        # Return a new list without the clicked row instead of
	                        # calling .remove() on the state value: the list may be
	                        # a preset owned by EXAMPLES, and matching by identity
	                        # drops exactly this row even when duplicates exist.
	                        return [item for item in inputs_list if item is not entry]

	                    delete_btn.click(delete, None, [inputs_task])  # This is the state variable

	# NOTE(review): `with` nesting was lost in this copy of the file; the lines
	# below are kept exactly as found.
	# "Add Input" appends the typed name/value to inputs_task and clears both
	# entry fields (see add_input).
	with gr.Group():
	add_input_btn = gr.Button("Add Input") # Assign to variable
	add_input_btn.click(
	add_input,
	[inputs_task, new_input_name, new_input_value],
	[inputs_task, new_input_name, new_input_value]
	)

	# Section: the single output being evaluated.
	with gr.Column(scale=1):
	gr.Markdown("## Evaluation task output")
	gr.Markdown("<span style='color: gray;'>Define the output name and value. Output is always required.</span>")
	with gr.Group():
	with gr.Row(equal_height=True):
	with gr.Column(min_width=60, scale=2):
	output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
	with gr.Column(scale=9):
	output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)


	# Section: evaluation criteria and scoring rubric.
	with gr.Column(scale=1):
	gr.Markdown("## Evaluation criteria and rubric")
	gr.Markdown("<span style='color: gray;'>Define the evaluation criteria and rubric for the evaluation task. Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.</span>\n\n<span style='color: gray;'>❗You can experiment with other scoring scales. However, performance may vary.</span>")

	with gr.Row():
	with gr.Column(scale=1):
	with gr.Group():
	# State holding the rubric rows as {"name": <score>, "value": <description>}
	# dicts (starts empty).
	rubric_items = gr.State([])
	# Entry fields for the next rubric row to add.
	with gr.Row(equal_height=True):
	with gr.Column(min_width=60, scale=2):
	new_rubric_name = gr.Textbox(
	label="Score",
	show_label=True,
	interactive=True,
	autoscroll=False,
	max_lines=1,
	visible=True # Initially visible
	)
	with gr.Column(scale=9):
	new_rubric_value = gr.Textbox(
	label="Description",
	show_label=True,
	interactive=True,
	autoscroll=False,
	max_lines=3,
	visible=True # Initially visible
	)

	def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
	    """Append a score/description row to the rubric and clear both entry fields.

	    The row is stored as ``{"name": <score>, "value": <description>}``. The
	    incoming list is not modified; the two trailing empty strings reset the
	    "Score" and "Description" textboxes.
	    """
	    row = dict(name=new_rubric_name, value=new_rubric_value)
	    return rubric_items + [row], "", ""

	@gr.render(inputs=rubric_items)  # You have to pass the state here
	def render_rubrics(rubric_items_list):  # Use different name than the state variable
	    """Render one read-only row (score, description, delete button) per rubric item.

	    Args:
	        rubric_items_list: Current value of the ``rubric_items`` state, a list
	            of ``{"name": <score>, "value": <description>}`` dicts.
	    """
	    for rubric_item in rubric_items_list:
	        with gr.Group():
	            with gr.Row(equal_height=True):
	                with gr.Column(min_width=60, scale=2):
	                    gr.Textbox(
	                        rubric_item['name'],
	                        label="Score",
	                        show_label=True,
	                        interactive=False
	                    )
	                with gr.Column(scale=8):
	                    gr.Textbox(
	                        rubric_item['value'],
	                        label="Description",
	                        show_label=True,
	                        interactive=False
	                    )
	                with gr.Column(min_width=15, scale=1):
	                    delete_btn = gr.Button("X", size="lg", variant="secondary")

	                    # `rubric_item=rubric_item` binds this row's dict at
	                    # definition time (avoids the late-binding closure pitfall).
	                    def delete(rubric_item=rubric_item):
	                        # Return a new list without the clicked row instead of
	                        # calling .remove() on the state value: the list may be
	                        # a preset owned by EXAMPLES, and matching by identity
	                        # drops exactly this row even when duplicates exist.
	                        return [item for item in rubric_items_list if item is not rubric_item]

	                    delete_btn.click(delete, None, [rubric_items])  # This is the state variable

	# NOTE(review): `with` nesting was lost in this copy of the file; the lines
	# below are kept exactly as found.
	# "Add Rubric Item" appends the typed score/description to rubric_items and
	# clears both entry fields (see add_rubric_item).
	with gr.Group():
	add_rubric_btn = gr.Button("Add Rubric Item") # Assign to variable
	add_rubric_btn.click(
	add_rubric_item,
	[rubric_items, new_rubric_name, new_rubric_value],
	[rubric_items, new_rubric_name, new_rubric_value]
	)
	# Free-text evaluation criteria, passed to evaluate().
	with gr.Column(scale=1):
	evaluation_criteria = gr.Textbox(label="Evaluation criteria")

	# Results panel: read-only score and feedback boxes plus the Evaluate button.
	with gr.Row():
	with gr.Column(scale=1, variant="panel"):
	gr.Markdown("# Evaluation")
	with gr.Group():
	with gr.Row(equal_height=True):
	with gr.Column(min_width=60, scale=1):
	score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
	with gr.Column(scale=9):
	feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
	with gr.Column(min_width=15, scale=1):
	evaluate_btn = gr.Button("Evaluate", variant="primary")

	# "Clear All": the 17 outputs line up one-to-one with the tuple returned by
	# reset_all(). The first eleven receive cleared values; the last six list
	# components a second time (four of them repeated from above) so they can
	# receive the visibility updates.
	reset_all_btn = gr.Button("Clear All", variant="stop") # Add Reset All button
	reset_all_btn.click(
	reset_all,
	inputs=[],
	outputs=[
	inputs_task,
	new_input_name,
	new_input_value,
	rubric_items,
	new_rubric_name,
	new_rubric_value,
	evaluation_criteria,
	output_name,
	output_value,
	feedback,
	score,
	new_input_name, # Visibility for new_input_name
	new_input_value, # Visibility for new_input_value
	new_rubric_name, # Visibility for new_rubric_name
	new_rubric_value, # Visibility for new_rubric_value
	add_input_btn, # Visibility for Add Input button
	add_rubric_btn, # Visibility for Add Rubric Item button
	]
	)

	# Run evaluate() on the current form contents; its (feedback, score) return
	# value fills the two result boxes in that order.
	evaluate_btn.click(
	evaluate,
	inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
	outputs=[feedback, score]
	)

	# Components filled by a preset, in the order populate_fields() returns its
	# values: five form values, the two result boxes, then six visibility targets.
	preset_outputs = [
	    inputs_task,
	    output_name,
	    output_value,
	    evaluation_criteria,
	    rubric_items,
	    feedback,
	    score,
	    new_input_name,  # Visibility for new_input_name
	    new_input_value,  # Visibility for new_input_value
	    new_rubric_name,  # Visibility for new_rubric_name
	    new_rubric_value,  # Visibility for new_rubric_value
	    add_input_btn,  # Visibility for Add Input button
	    add_rubric_btn,  # Visibility for Add Rubric Item button
	]

	# Wire each quickstart button to the preset at the same position in EXAMPLES.
	for preset_index, preset_button in enumerate(preset_buttons):
	    # The default argument pins this iteration's index; a plain closure would
	    # see only the final value of the loop variable.
	    def populate_preset(ex_i=preset_index):
	        return populate_fields(ex_i)

	    preset_button.click(
	        populate_preset,
	        inputs=[],
	        outputs=list(preset_outputs),
	    )

	def populate_fields(example_index: int):
	    """Build the 13 values that load preset ``example_index`` into the form.

	    Args:
	        example_index: Position of the preset in ``EXAMPLES``.

	    Returns:
	        A tuple laid out in the order of the preset buttons' outputs: the
	        preset's inputs, output name, output value, evaluation criteria and
	        rubric; two empty strings that clear feedback and score; and six
	        visibility updates that hide the entry fields and "Add" buttons.

	    Raises:
	        IndexError: If ``example_index`` is out of range for ``EXAMPLES``.
	    """
	    example = EXAMPLES[example_index]
	    # Return copies of the preset lists so later edits to the form state
	    # cannot alter the shared EXAMPLES entry.
	    # NOTE(review): assumes Gradio keeps the returned list object as the state
	    # value; the copy is harmless if it does not.
	    inputs_copy = [dict(item) for item in example["inputs_task"]]
	    rubric_copy = [dict(item) for item in example["rubric"]]
	    return (
	        inputs_copy,
	        example["output"]["name"],
	        example["output"]["value"],
	        example["evaluation_criteria"],
	        rubric_copy,
	        "",  # Reset feedback
	        "",  # Reset score
	        gr.update(visible=False),  # Hide new_input_name
	        gr.update(visible=False),  # Hide new_input_value
	        gr.update(visible=False),  # Hide new_rubric_name
	        gr.update(visible=False),  # Hide new_rubric_value
	        gr.update(visible=False),  # Hide Add Input button
	        gr.update(visible=False),  # Hide Add Rubric Item button
	    )

	# Launch the app with Gradio's debug mode enabled.
	demo.launch(debug=True)