Spaces:
Configuration error
Configuration error
might have found a bug in binning
Browse files- app.py +166 -3
- local_app.py +43 -59
app.py
CHANGED
|
@@ -1,8 +1,171 @@
|
|
| 1 |
import evaluate
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import evaluate
|
| 2 |
+
import json
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import gradio as gr
|
| 6 |
|
| 7 |
+
import numpy as np
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import ast
|
| 10 |
+
import matplotlib.pyplot as plt
|
| 11 |
+
import matplotlib.patches as mpatches
|
| 12 |
|
| 13 |
+
# Render figures at 300 dpi.
plt.rcParams["figure.dpi"] = 300

# Switch to the non-interactive Agg backend; background on the error this avoids:
# https://stackoverflow.com/questions/14694408/runtimeerror-main-thread-is-not-in-main-loop
plt.switch_backend("agg")
|
| 17 |
|
| 18 |
|
| 19 |
+
def default_plot():
    """Create the empty two-panel canvas used for a reliability diagram.

    The upper axes take two of the three grid rows and carry a dotted
    identity line labelled "Perfect"; the lower axes take the remaining
    row and are labelled for per-bin counts. Both x-axes are fixed to
    [-0.05, 1.05].

    Returns:
        A ``(fig, ax1, ax2)`` tuple: the figure, the upper axes and the
        lower axes.
    """
    fig = plt.figure()
    grid_shape = (3, 1)
    ax1 = plt.subplot2grid(grid_shape, (0, 0), rowspan=2)
    ax2 = plt.subplot2grid(grid_shape, (2, 0))

    # Identity line y = x sampled at 10 points on [0, 1].
    diagonal = np.linspace(0, 1, 10)
    ax1.plot(diagonal, diagonal, color="darkgreen", ls="dotted", label="Perfect")

    # Bin differences (upper panel)
    ax1.set_title("Reliability Diagram")
    ax1.set_ylabel("Conditional Expectation")
    ax1.set_ylim([0, 1.05])
    ax1.set_xlim([-0.05, 1.05])  # respective to bin range

    # Bin frequencies (lower panel)
    ax2.set_xlabel("Confidence")
    ax2.set_ylabel("Count")
    ax2.set_xlim([-0.05, 1.05])  # respective to bin range

    return fig, ax1, ax2
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def reliability_plot(results):
    """Draw a reliability diagram from a detailed metric result.

    Args:
        results: mapping read with the keys ``"y_bar"``, ``"p_bar"``,
            ``"bin_freq"``, ``"accuracy"`` and ``"p_bar_cont"``.
            ``"y_bar"`` supplies the right-hand bin edges (a left edge of 0
            is prepended); ``"p_bar"`` is used without its last entry as the
            per-bin bar height, with NaN entries drawn as 0; ``"bin_freq"``
            must hold one count per non-NaN entry of ``"p_bar"[:-1]``.
            NOTE(review): presumably y_bar is the calibrated accuracy proxy
            and p_bar the empirical accuracy per bin -- confirm against the
            metric implementation.

    Returns:
        The matplotlib figure holding both panels.
    """
    # DEV: might still need to write tests in case of equal mass binning
    # DEV: nicer would be to plot like a polygon
    # see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py

    def over_under_confidence(results):
        """Return one face colour per bin from comparing p_bar with y_bar."""
        # NOTE(review): p_bar < y_bar is coloured with the legend entry
        # "Underconfident"; confirm this matches the meaning of the two keys.
        colors = []
        for perfect, empirical in zip(results["y_bar"], results["p_bar"]):
            if np.allclose(perfect, empirical):
                colors.append("limegreen")
            elif empirical < perfect:
                colors.append("dodgerblue")
            else:
                colors.append("orangered")
        return colors

    fig, ax1, ax2 = default_plot()

    bins_with_left_edge = np.insert(results["y_bar"], 0, 0, axis=0)
    left_edges = bins_with_left_edge[:-1]
    bin_widths = np.diff(bins_with_left_edge)

    # Bin differences
    # Bars are drawn on explicit edges rather than with ``hist``: the y_bar
    # values coincide with the bin edges and histogram bins are half-open on
    # the right, so ``hist`` would put every value into the bin to the right
    # of the one it labels (leaving the first bin empty and merging the last
    # two), and the per-bin colours would no longer line up with the bars.
    heights = np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0)
    ax1.bar(
        left_edges,
        heights,
        width=bin_widths,
        align="edge",
        color=over_under_confidence(results),  # color based on over/underconfidence
    )

    ax1handles = [
        mpatches.Patch(color="orangered", label="Overconfident"),
        mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
        mpatches.Patch(color="dodgerblue", label="Underconfident"),
    ]

    # Bin frequencies
    # Bins whose p_bar entry is NaN keep a count of zero; the remaining bins
    # receive the entries of results["bin_freq"] in order.
    anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
    n_bins = len(results["y_bar"])
    bin_freqs = np.zeros(n_bins)
    bin_freqs[anindices] = results["bin_freq"]
    ax2.bar(left_edges, bin_freqs, width=bin_widths, align="edge", color="midnightblue")

    acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
    conf_plt = ax2.axvline(
        x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
    )

    ax1.legend(loc="lower right", handles=ax1handles)
    ax2.legend(handles=[acc_plt, conf_plt])
    ax1.set_xticks(bins_with_left_edge)
    ax2.set_xticks(bins_with_left_edge)
    # Lay out this figure explicitly instead of pyplot's implicit current figure.
    fig.tight_layout()
    return fig
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def compute_and_plot(data, n_bins, bin_range, scheme, proxy, p):
    """Compute the metric on the submitted table and plot the result.

    Args:
        data: table indexed by ``"predictions"`` and ``"references"``. When
            it is a pandas DataFrame, rows containing missing values are
            ignored. Predictions that are not already lists are parsed with
            ``ast.literal_eval``.
        n_bins: number of bins; converted to ``int`` before it is forwarded.
        bin_range: accepted for interface compatibility but currently not
            forwarded to the metric.
        scheme: forwarded unchanged as the metric's ``scheme`` argument.
        proxy: forwarded unchanged as the metric's ``proxy`` argument.
        p: forwarded unchanged as the metric's ``p`` argument.

    Returns:
        A ``(ece, figure)`` tuple: ``results["ECE"]`` and the figure built by
        ``reliability_plot``.
    """
    # DEV: check on invalid datatypes with better warnings

    if isinstance(data, pd.DataFrame):
        # Work on a filtered copy so the caller's DataFrame is left untouched.
        data = data.dropna()

    predictions = [
        prediction if isinstance(prediction, list) else ast.literal_eval(prediction)
        for prediction in data["predictions"]
    ]
    references = list(data["references"])

    results = metric._compute(
        predictions,
        references,
        n_bins=int(n_bins),  # the UI widget may deliver a float
        scheme=scheme,
        proxy=proxy,
        p=p,
        detail=True,
    )
    plot = reliability_plot(results)
    return results["ECE"], plot
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# Input widgets, listed in the positional order of compute_and_plot's
# parameters that follow the data table.
sliders = [
    # At least one bin is needed, and the bin count must be a whole number.
    gr.Slider(1, 100, value=10, step=1, label="n_bins"),
    gr.Slider(
        0, 100, value=None, label="bin_range", visible=False
    ),  # DEV: need to have a double slider
    gr.Dropdown(choices=["equal-range", "equal-mass"], value="equal-range", label="scheme"),
    gr.Dropdown(choices=["upper-edge", "center"], value="upper-edge", label="proxy"),
    gr.Dropdown(choices=[1, 2, np.inf], value=1, label="p"),
]

slider_defaults = [slider.value for slider in sliders]

# example data
component = gr.inputs.Dataframe(
    headers=["predictions", "references"], col_count=2, datatype="number", type="pandas"
)

# Each row pairs a list of predicted values with a reference value.
component.value = [
    [[0.63, 0.2, 0.2], 0],
    [[0.73, 0.1, 0.2], 2],
    [[0, 0.95, 0.05], 1],
]
sample_data = [[component] + slider_defaults]

local_path = Path(sys.path[0])
metric = evaluate.load("jordyvl/ece")
outputs = [gr.outputs.Textbox(label="ECE"), gr.Plot(label="Reliability diagram")]
# outputs[1].value = default_plot().__dict__ #DEV: Does not work in gradio; needs to be JSON encoded


# Build the interface and start serving immediately; note that ``iface`` is
# bound to the return value of ``launch()``, not to the Interface object.
iface = gr.Interface(
    fn=compute_and_plot,
    inputs=[component] + sliders,
    outputs=outputs,
    description=metric.info.description,
    article=evaluate.utils.parse_readme(local_path / "README.md"),
    title=f"Metric: {metric.name}",
    # examples=sample_data; #DEV: ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
).launch()
|
local_app.py
CHANGED
|
@@ -7,7 +7,8 @@ import gradio as gr
|
|
| 7 |
import numpy as np
|
| 8 |
import pandas as pd
|
| 9 |
import ast
|
| 10 |
-
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
import matplotlib.pyplot as plt
|
|
@@ -55,7 +56,7 @@ sample_data = [[component] + slider_defaults] ##json.dumps(df)
|
|
| 55 |
|
| 56 |
local_path = Path(sys.path[0])
|
| 57 |
metric = evaluate.load("jordyvl/ece")
|
| 58 |
-
#ECE()
|
| 59 |
# module = evaluate.load("jordyvl/ece")
|
| 60 |
# launch_gradio_widget(module)
|
| 61 |
|
|
@@ -76,97 +77,80 @@ def default_plot():
|
|
| 76 |
ls="dotted",
|
| 77 |
label="Perfect",
|
| 78 |
)
|
|
|
|
|
|
|
| 79 |
ax1.set_ylabel("Conditional Expectation")
|
| 80 |
-
ax1.set_ylim([
|
| 81 |
-
ax1.legend(loc="lower right")
|
| 82 |
ax1.set_title("Reliability Diagram")
|
|
|
|
| 83 |
|
| 84 |
# Bin frequencies
|
| 85 |
ax2.set_xlabel("Confidence")
|
| 86 |
ax2.set_ylabel("Count")
|
| 87 |
ax2.legend(loc="upper left") # , ncol=2
|
| 88 |
-
|
| 89 |
-
return fig
|
| 90 |
-
|
| 91 |
|
| 92 |
-
|
| 93 |
-
colors = []
|
| 94 |
-
for j, bin in enumerate(results["y_bar"]):
|
| 95 |
-
perfect = results["y_bar"][j]
|
| 96 |
-
empirical = results["p_bar"][j]
|
| 97 |
-
bin_color = (
|
| 98 |
-
"limegreen"
|
| 99 |
-
if perfect == empirical
|
| 100 |
-
else "dodgerblue"
|
| 101 |
-
if empirical < perfect
|
| 102 |
-
else "orangered"
|
| 103 |
-
)
|
| 104 |
-
colors.append(bin_color)
|
| 105 |
-
return colors
|
| 106 |
|
| 107 |
|
| 108 |
def reliability_plot(results):
|
| 109 |
# DEV: might still need to write tests in case of equal mass binning
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
ax2 = plt.subplot2grid((3, 1), (2, 0))
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
B, bins, patches = ax1.hist(
|
| 122 |
-
results["y_bar"],
|
|
|
|
|
|
|
| 123 |
)
|
| 124 |
colors = over_under_confidence(results)
|
| 125 |
for b in range(len(B)):
|
| 126 |
patches[b].set_facecolor(colors[b]) # color based on over/underconfidence
|
| 127 |
|
| 128 |
-
ranged = np.linspace(bin_range[0], bin_range[1], n_bins)
|
| 129 |
-
ax1.plot(
|
| 130 |
-
ranged,
|
| 131 |
-
ranged,
|
| 132 |
-
color="limegreen",
|
| 133 |
-
ls="dotted",
|
| 134 |
-
label="Perfect",
|
| 135 |
-
)
|
| 136 |
ax1handles = [
|
| 137 |
mpatches.Patch(color="orangered", label="Overconfident"),
|
| 138 |
mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
|
| 139 |
mpatches.Patch(color="dodgerblue", label="Underconfident"),
|
| 140 |
]
|
| 141 |
|
|
|
|
| 142 |
anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
|
|
|
|
| 143 |
bin_freqs = np.zeros(n_bins)
|
| 144 |
bin_freqs[anindices] = results["bin_freq"]
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
|
| 149 |
|
| 150 |
acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
|
| 151 |
conf_plt = ax2.axvline(
|
| 152 |
x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
|
| 153 |
)
|
| 154 |
-
ax2.legend(handles=[acc_plt, conf_plt])
|
| 155 |
|
| 156 |
-
# Bin differences
|
| 157 |
-
ax1.set_ylabel("Conditional Expectation")
|
| 158 |
-
ax1.set_ylim([0, 1.05]) # respective to bin range
|
| 159 |
ax1.legend(loc="lower right", handles=ax1handles)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
# Bin frequencies
|
| 165 |
-
ax2.set_xlabel("Confidence")
|
| 166 |
-
ax2.set_ylabel("Count")
|
| 167 |
-
ax2.legend(loc="upper left") # , ncol=2
|
| 168 |
-
# ax2.set_xticks([0, ]+results["y_bar"])
|
| 169 |
-
ax2.set_xlim([-0.05, 1.05]) # respective to bin range
|
| 170 |
plt.tight_layout()
|
| 171 |
return fig
|
| 172 |
|
|
@@ -208,4 +192,4 @@ iface = gr.Interface(
|
|
| 208 |
article=evaluate.utils.parse_readme(local_path / "README.md"),
|
| 209 |
title=f"Metric: {metric.name}",
|
| 210 |
# examples=sample_data; # ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
|
| 211 |
-
).launch()
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
import pandas as pd
|
| 9 |
import ast
|
| 10 |
+
|
| 11 |
+
# from ece import ECE # loads local instead
|
| 12 |
|
| 13 |
|
| 14 |
import matplotlib.pyplot as plt
|
|
|
|
| 56 |
|
| 57 |
local_path = Path(sys.path[0])
|
| 58 |
metric = evaluate.load("jordyvl/ece")
|
| 59 |
+
# ECE()
|
| 60 |
# module = evaluate.load("jordyvl/ece")
|
| 61 |
# launch_gradio_widget(module)
|
| 62 |
|
|
|
|
| 77 |
ls="dotted",
|
| 78 |
label="Perfect",
|
| 79 |
)
|
| 80 |
+
|
| 81 |
+
# Bin differences
|
| 82 |
ax1.set_ylabel("Conditional Expectation")
|
| 83 |
+
ax1.set_ylim([0, 1.05]) # respective to bin range
|
|
|
|
| 84 |
ax1.set_title("Reliability Diagram")
|
| 85 |
+
ax1.set_xlim([-0.05, 1.05]) # respective to bin range
|
| 86 |
|
| 87 |
# Bin frequencies
|
| 88 |
ax2.set_xlabel("Confidence")
|
| 89 |
ax2.set_ylabel("Count")
|
| 90 |
ax2.legend(loc="upper left") # , ncol=2
|
| 91 |
+
ax2.set_xlim([-0.05, 1.05]) # respective to bin range
|
|
|
|
|
|
|
| 92 |
|
| 93 |
+
return fig, ax1, ax2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
def reliability_plot(results):
|
| 97 |
# DEV: might still need to write tests in case of equal mass binning
|
| 98 |
+
# DEV: nicer would be to plot like a polygon
|
| 99 |
+
# see: https://github.com/markus93/fit-on-the-test/blob/main/Experiments_Synthetic/binnings.py
|
|
|
|
| 100 |
|
| 101 |
+
def over_under_confidence(results):
|
| 102 |
+
colors = []
|
| 103 |
+
for j, bin in enumerate(results["y_bar"]):
|
| 104 |
+
perfect = results["y_bar"][j]
|
| 105 |
+
empirical = results["p_bar"][j]
|
| 106 |
+
|
| 107 |
+
bin_color = (
|
| 108 |
+
"limegreen"
|
| 109 |
+
if np.allclose(perfect, empirical)
|
| 110 |
+
else "dodgerblue"
|
| 111 |
+
if empirical < perfect
|
| 112 |
+
else "orangered"
|
| 113 |
+
)
|
| 114 |
+
colors.append(bin_color)
|
| 115 |
+
return colors
|
| 116 |
+
|
| 117 |
+
fig, ax1, ax2 = default_plot()
|
| 118 |
+
|
| 119 |
+
# Bin differences
|
| 120 |
+
bins_with_left_edge = np.insert(results["y_bar"], 0, 0, axis=0)
|
| 121 |
B, bins, patches = ax1.hist(
|
| 122 |
+
results["y_bar"],
|
| 123 |
+
weights=np.nan_to_num(results["p_bar"][:-1], copy=True, nan=0),
|
| 124 |
+
bins=bins_with_left_edge,
|
| 125 |
)
|
| 126 |
colors = over_under_confidence(results)
|
| 127 |
for b in range(len(B)):
|
| 128 |
patches[b].set_facecolor(colors[b]) # color based on over/underconfidence
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
ax1handles = [
|
| 131 |
mpatches.Patch(color="orangered", label="Overconfident"),
|
| 132 |
mpatches.Patch(color="limegreen", label="Perfect", linestyle="dotted"),
|
| 133 |
mpatches.Patch(color="dodgerblue", label="Underconfident"),
|
| 134 |
]
|
| 135 |
|
| 136 |
+
# Bin frequencies
|
| 137 |
anindices = np.where(~np.isnan(results["p_bar"][:-1]))[0]
|
| 138 |
+
n_bins = len(results["y_bar"])
|
| 139 |
bin_freqs = np.zeros(n_bins)
|
| 140 |
bin_freqs[anindices] = results["bin_freq"]
|
| 141 |
+
B, newbins, patches = ax2.hist(
|
| 142 |
+
results["y_bar"], weights=bin_freqs, color="midnightblue", bins=bins_with_left_edge
|
| 143 |
+
)
|
|
|
|
| 144 |
|
| 145 |
acc_plt = ax2.axvline(x=results["accuracy"], ls="solid", lw=3, c="black", label="Accuracy")
|
| 146 |
conf_plt = ax2.axvline(
|
| 147 |
x=results["p_bar_cont"], ls="dotted", lw=3, c="#444", label="Avg. confidence"
|
| 148 |
)
|
|
|
|
| 149 |
|
|
|
|
|
|
|
|
|
|
| 150 |
ax1.legend(loc="lower right", handles=ax1handles)
|
| 151 |
+
ax2.legend(handles=[acc_plt, conf_plt])
|
| 152 |
+
ax1.set_xticks(bins_with_left_edge)
|
| 153 |
+
ax2.set_xticks(bins_with_left_edge)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
plt.tight_layout()
|
| 155 |
return fig
|
| 156 |
|
|
|
|
| 192 |
article=evaluate.utils.parse_readme(local_path / "README.md"),
|
| 193 |
title=f"Metric: {metric.name}",
|
| 194 |
# examples=sample_data; # ValueError: Examples argument must either be a directory or a nested list, where each sublist represents a set of inputs.
|
| 195 |
+
).launch()
|