Spaces:
Running
Running
added download button, plot title, cleaned prompt to not include too much "experiential", interactive max sentences for LLM
Browse files
app.py
CHANGED
|
@@ -26,6 +26,9 @@ from huggingface_hub import InferenceClient # for the LLM API command
|
|
| 26 |
|
| 27 |
from typing import Any
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
|
|
@@ -312,11 +315,13 @@ The title should be:
|
|
| 312 |
3. DISTINCTIVE enough that it wouldn't apply equally well to other "phenomenological" clusters
|
| 313 |
4. TECHNICALLY PRECISE, using domain-specific terminology where appropriate
|
| 314 |
5. CONCEPTUALLY FOCUSED on the core specificities of this type of experience
|
|
|
|
| 315 |
|
| 316 |
|
| 317 |
Constraints:
|
| 318 |
- Output ONLY the label (no explanation).
|
| 319 |
- 3–7 words.
|
|
|
|
| 320 |
- No punctuation, no quotes, no extra text.
|
| 321 |
- Do not explain your reasoning
|
| 322 |
"""
|
|
@@ -339,6 +344,23 @@ def _clean_label(x: str) -> str:
|
|
| 339 |
x = re.sub(r"[.:\-–—]+$", "", x).strip() # remove trailing punctuation
|
| 340 |
# enforce "no punctuation" lightly (optional):
|
| 341 |
x = re.sub(r"[^\w\s]", "", x).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
return x or "Unlabelled"
|
| 343 |
|
| 344 |
|
|
@@ -350,7 +372,7 @@ def generate_labels_via_chat_completion(
|
|
| 350 |
config_hash: str,
|
| 351 |
model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
|
| 352 |
max_topics: int = 50,
|
| 353 |
-
max_docs_per_topic: int =
|
| 354 |
doc_char_limit: int = 400,
|
| 355 |
temperature: float = 0.2, #deterministic, stable outputs.
|
| 356 |
force: bool = False) -> dict[int, str]:
|
|
@@ -1057,7 +1079,7 @@ else:
|
|
| 1057 |
|
| 1058 |
with st.sidebar.expander("BERTopic"):
|
| 1059 |
nr_topics = st.text_input("nr_topics", value="auto")
|
| 1060 |
-
top_n_words = st.slider("top_n_words", 5, 25, 10)
|
| 1061 |
|
| 1062 |
current_config = {
|
| 1063 |
"embedding_model": selected_embedding_model,
|
|
@@ -1246,12 +1268,21 @@ else:
|
|
| 1246 |
st.caption("No example prompt stored yet – run LLM labelling to populate this.")
|
| 1247 |
|
| 1248 |
cA, cB, cC = st.columns([1, 1, 2])
|
| 1249 |
-
max_topics = cA.slider("Max topics", 5, 120, 40, 5)
|
| 1250 |
-
# topic_info = tm.get_topic_info()
|
| 1251 |
-
# n_topics_no_outliers = int((topic_info.Topic != -1).sum())
|
| 1252 |
-
# max_topics = n_topics_no_outliers
|
| 1253 |
-
# st.caption(f"Will label all topics (excluding outliers): {max_topics}")
|
| 1254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1255 |
|
| 1256 |
force = cB.checkbox("Force regenerate", value=False)
|
| 1257 |
|
|
@@ -1264,6 +1295,7 @@ else:
|
|
| 1264 |
config_hash=cfg_hash,
|
| 1265 |
model_id=model_id,
|
| 1266 |
max_topics=max_topics,
|
|
|
|
| 1267 |
force=force,
|
| 1268 |
)
|
| 1269 |
st.session_state.llm_names = llm_names
|
|
@@ -1290,24 +1322,8 @@ else:
|
|
| 1290 |
api_map = st.session_state.get("llm_names", {}) or {}
|
| 1291 |
llm_names = {**default_map, **api_map}
|
| 1292 |
|
| 1293 |
-
# #option to choose to include outliers or not
|
| 1294 |
-
# include_outliers_plot = st.checkbox("Include outliers in plot (-1)", value=False)
|
| 1295 |
-
# topics_arr = np.asarray(tm.topics_)
|
| 1296 |
-
# labs_all = [final_name_map.get(int(t), "Unlabelled") for t in topics_arr]
|
| 1297 |
-
# if include_outliers_plot:
|
| 1298 |
-
# final_name_map[-1] = "Outliers"
|
| 1299 |
-
# reduced_plot = reduced
|
| 1300 |
-
# labs = labs_all
|
| 1301 |
-
# else:
|
| 1302 |
-
# mask = topics_arr != -1
|
| 1303 |
-
# reduced_plot = reduced[mask]
|
| 1304 |
-
# labs = list(np.asarray(labs_all)[mask])
|
| 1305 |
-
|
| 1306 |
|
| 1307 |
|
| 1308 |
-
|
| 1309 |
-
#try remove -1 outliers topics
|
| 1310 |
-
|
| 1311 |
# FIX: Force outliers (Topic -1) to be "Unlabelled" so we can hide them
|
| 1312 |
labs = []
|
| 1313 |
for t in tm.topics_:
|
|
@@ -1318,6 +1334,10 @@ else:
|
|
| 1318 |
|
| 1319 |
# VISUALISATION
|
| 1320 |
st.subheader("Experiential Topics Visualisation")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1321 |
|
| 1322 |
# We pass 'noise_label' and 'noise_color' to grey out the outliers
|
| 1323 |
fig, _ = datamapplot.create_plot(
|
|
@@ -1328,16 +1348,41 @@ else:
|
|
| 1328 |
label_font_size=11, # Optional: Adjust font size
|
| 1329 |
arrowprops={"arrowstyle": "-", "color": "#333333"} # Optional: darker, simpler arrows
|
| 1330 |
)
|
|
|
|
| 1331 |
st.pyplot(fig)
|
| 1332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1333 |
|
| 1334 |
-
#
|
| 1335 |
-
|
|
|
|
|
|
|
| 1336 |
|
| 1337 |
-
|
| 1338 |
-
|
| 1339 |
-
|
| 1340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1341 |
|
| 1342 |
|
| 1343 |
|
|
@@ -1346,27 +1391,6 @@ else:
|
|
| 1346 |
|
| 1347 |
st.subheader("Export results (one row per topic)")
|
| 1348 |
|
| 1349 |
-
# full_reps = tm.get_topics(full=True)
|
| 1350 |
-
# llm_reps = full_reps.get("LLM", {})
|
| 1351 |
-
|
| 1352 |
-
# llm_names = {}
|
| 1353 |
-
# for tid, vals in llm_reps.items():
|
| 1354 |
-
# try:
|
| 1355 |
-
# llm_names[tid] = (
|
| 1356 |
-
# (vals[0][0] or "").strip().strip('"').strip(".")
|
| 1357 |
-
# )
|
| 1358 |
-
# except Exception:
|
| 1359 |
-
# llm_names[tid] = "Unlabelled"
|
| 1360 |
-
|
| 1361 |
-
# if not llm_names:
|
| 1362 |
-
# st.caption("Note: Using default keyword-based topic names.")
|
| 1363 |
-
# llm_names = (
|
| 1364 |
-
# tm.get_topic_info().set_index("Topic")["Name"].to_dict()
|
| 1365 |
-
# )
|
| 1366 |
-
|
| 1367 |
-
# default_map = tm.get_topic_info().set_index("Topic")["Name"].to_dict()
|
| 1368 |
-
# api_map = st.session_state.get("llm_names", {}) or {}
|
| 1369 |
-
# llm_names = {**default_map, **api_map}
|
| 1370 |
|
| 1371 |
model_docs = getattr(tm, "docs_", None)
|
| 1372 |
if model_docs is not None and len(docs) != len(model_docs):
|
|
@@ -1376,8 +1400,6 @@ else:
|
|
| 1376 |
"so you may want to re-run topic modelling before exporting."
|
| 1377 |
)
|
| 1378 |
|
| 1379 |
-
# doc_info = tm.get_document_info()[["Document", "Topic"]]
|
| 1380 |
-
|
| 1381 |
doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
|
| 1382 |
|
| 1383 |
include_outliers = st.checkbox(
|
|
|
|
| 26 |
|
| 27 |
from typing import Any
|
| 28 |
|
| 29 |
+
from io import BytesIO #Download button for the clustering image
|
| 30 |
+
|
| 31 |
+
|
| 32 |
|
| 33 |
|
| 34 |
|
|
|
|
| 315 |
3. DISTINCTIVE enough that it wouldn't apply equally well to other "phenomenological" clusters
|
| 316 |
4. TECHNICALLY PRECISE, using domain-specific terminology where appropriate
|
| 317 |
5. CONCEPTUALLY FOCUSED on the core specificities of this type of experience
|
| 318 |
+
6. CONCISE and NOUN-PHRASE LIKE (e.g. "body boundary dissolution", not "Experience of body boundary dissolution").
|
| 319 |
|
| 320 |
|
| 321 |
Constraints:
|
| 322 |
- Output ONLY the label (no explanation).
|
| 323 |
- 3–7 words.
|
| 324 |
+
- Avoid generic wrappers such as "experience of", "phenomenon of", "state of" unless they are absolutely necessary.
|
| 325 |
- No punctuation, no quotes, no extra text.
|
| 326 |
- Do not explain your reasoning
|
| 327 |
"""
|
|
|
|
| 344 |
x = re.sub(r"[.:\-–—]+$", "", x).strip() # remove trailing punctuation
|
| 345 |
# enforce "no punctuation" lightly (optional):
|
| 346 |
x = re.sub(r"[^\w\s]", "", x).strip()
|
| 347 |
+
# Optional: de-wrap generic "experience/phenomenon/state" wrappers
|
| 348 |
+
# Leading patterns like "Experiential/Experience of ..."
|
| 349 |
+
x = re.sub(
|
| 350 |
+
r"^(Experiential(?:\s+Phenomenon)?|Experience|Experience of|Subjective Experience of|Phenomenon of)\s+",
|
| 351 |
+
"",
|
| 352 |
+
x,
|
| 353 |
+
flags=re.IGNORECASE,
|
| 354 |
+
)
|
| 355 |
+
# Trailing "experience/phenomenon/state"
|
| 356 |
+
x = re.sub(
|
| 357 |
+
r"\s+(experience|experiences|phenomenon|state|states)$",
|
| 358 |
+
"",
|
| 359 |
+
x,
|
| 360 |
+
flags=re.IGNORECASE,
|
| 361 |
+
)
|
| 362 |
+
|
| 363 |
+
x = x.strip()
|
| 364 |
return x or "Unlabelled"
|
| 365 |
|
| 366 |
|
|
|
|
| 372 |
config_hash: str,
|
| 373 |
model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
|
| 374 |
max_topics: int = 50,
|
| 375 |
+
max_docs_per_topic: int = 10,
|
| 376 |
doc_char_limit: int = 400,
|
| 377 |
temperature: float = 0.2, #deterministic, stable outputs.
|
| 378 |
force: bool = False) -> dict[int, str]:
|
|
|
|
| 1079 |
|
| 1080 |
with st.sidebar.expander("BERTopic"):
|
| 1081 |
nr_topics = st.text_input("nr_topics", value="auto")
|
| 1082 |
+
top_n_words = st.slider("top_n_words", 5, 25, 10, help="for a number N selected, BERTopic with fill the N most statistically significant words for that cluster")
|
| 1083 |
|
| 1084 |
current_config = {
|
| 1085 |
"embedding_model": selected_embedding_model,
|
|
|
|
| 1268 |
st.caption("No example prompt stored yet – run LLM labelling to populate this.")
|
| 1269 |
|
| 1270 |
cA, cB, cC = st.columns([1, 1, 2])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1271 |
|
| 1272 |
+
with cA:
|
| 1273 |
+
max_topics = st.slider("Max topics", 5, 120, 40, 5)
|
| 1274 |
+
# max_topics = cA.slider("Max topics", 5, 120, 40, 5)
|
| 1275 |
+
|
| 1276 |
+
with cB:
|
| 1277 |
+
max_docs_per_topic = st.slider(
|
| 1278 |
+
"Docs per topic",
|
| 1279 |
+
min_value=2,
|
| 1280 |
+
max_value=20,
|
| 1281 |
+
value=10,
|
| 1282 |
+
step=1,
|
| 1283 |
+
help="How many representative sentences per topic to show the LLM."
|
| 1284 |
+
)
|
| 1285 |
+
force = st.checkbox("Force regenerate", value=False)
|
| 1286 |
|
| 1287 |
force = cB.checkbox("Force regenerate", value=False)
|
| 1288 |
|
|
|
|
| 1295 |
config_hash=cfg_hash,
|
| 1296 |
model_id=model_id,
|
| 1297 |
max_topics=max_topics,
|
| 1298 |
+
max_docs_per_topic=max_docs_per_topic,
|
| 1299 |
force=force,
|
| 1300 |
)
|
| 1301 |
st.session_state.llm_names = llm_names
|
|
|
|
| 1322 |
api_map = st.session_state.get("llm_names", {}) or {}
|
| 1323 |
llm_names = {**default_map, **api_map}
|
| 1324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1325 |
|
| 1326 |
|
|
|
|
|
|
|
|
|
|
| 1327 |
# FIX: Force outliers (Topic -1) to be "Unlabelled" so we can hide them
|
| 1328 |
labs = []
|
| 1329 |
for t in tm.topics_:
|
|
|
|
| 1334 |
|
| 1335 |
# VISUALISATION
|
| 1336 |
st.subheader("Experiential Topics Visualisation")
|
| 1337 |
+
|
| 1338 |
+
# Build a nice title from the dataset name
|
| 1339 |
+
dataset_title = ds_input.strip() or DATASET_DIR
|
| 1340 |
+
plot_title = f"{dataset_title}: MOSAIC's Experiential Topic Map"
|
| 1341 |
|
| 1342 |
# We pass 'noise_label' and 'noise_color' to grey out the outliers
|
| 1343 |
fig, _ = datamapplot.create_plot(
|
|
|
|
| 1348 |
label_font_size=11, # Optional: Adjust font size
|
| 1349 |
arrowprops={"arrowstyle": "-", "color": "#333333"} # Optional: darker, simpler arrows
|
| 1350 |
)
|
| 1351 |
+
fig.suptitle(plot_title, fontsize=16, y=0.99)
|
| 1352 |
st.pyplot(fig)
|
| 1353 |
|
| 1354 |
+
# --- Download / save visualisation ---
|
| 1355 |
+
|
| 1356 |
+
# Prepare high-res PNG bytes
|
| 1357 |
+
buf = BytesIO()
|
| 1358 |
+
fig.savefig(buf, format="png", dpi=300, bbox_inches="tight")
|
| 1359 |
+
png_bytes = buf.getvalue()
|
| 1360 |
|
| 1361 |
+
# Reuse base / gran for a nice filename later (they’re defined below as well)
|
| 1362 |
+
base = os.path.splitext(os.path.basename(CSV_PATH))[0]
|
| 1363 |
+
gran = "sentences" if selected_granularity else "reports"
|
| 1364 |
+
png_name = f"topics_{base}_{gran}_plot.png"
|
| 1365 |
|
| 1366 |
+
dl_col, save_col = st.columns(2)
|
| 1367 |
+
|
| 1368 |
+
with dl_col:
|
| 1369 |
+
st.download_button(
|
| 1370 |
+
"Download visualisation as PNG",
|
| 1371 |
+
data=png_bytes,
|
| 1372 |
+
file_name=png_name,
|
| 1373 |
+
mime="image/png",
|
| 1374 |
+
use_container_width=True,
|
| 1375 |
+
)
|
| 1376 |
+
with save_col:
|
| 1377 |
+
if st.button("Save plot to eval/", use_container_width=True):
|
| 1378 |
+
try:
|
| 1379 |
+
plot_path = (EVAL_DIR / png_name).resolve()
|
| 1380 |
+
fig.savefig(plot_path, format="png", dpi=300, bbox_inches="tight")
|
| 1381 |
+
st.success(f"Saved plot → {plot_path}")
|
| 1382 |
+
except Exception as e:
|
| 1383 |
+
st.error(f"Failed to save plot: {e}")
|
| 1384 |
+
|
| 1385 |
+
|
| 1386 |
|
| 1387 |
|
| 1388 |
|
|
|
|
| 1391 |
|
| 1392 |
st.subheader("Export results (one row per topic)")
|
| 1393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1394 |
|
| 1395 |
model_docs = getattr(tm, "docs_", None)
|
| 1396 |
if model_docs is not None and len(docs) != len(model_docs):
|
|
|
|
| 1400 |
"so you may want to re-run topic modelling before exporting."
|
| 1401 |
)
|
| 1402 |
|
|
|
|
|
|
|
| 1403 |
doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
|
| 1404 |
|
| 1405 |
include_outliers = st.checkbox(
|