romybeaute commited on
Commit
f2698ac
·
verified ·
1 Parent(s): 37b2381

added download button, title plot, cleaned prompt to not include too much "experiential", interactive max sentences for LLM

Browse files
Files changed (1) hide show
  1. app.py +74 -52
app.py CHANGED
@@ -26,6 +26,9 @@ from huggingface_hub import InferenceClient # for the LLM API command
26
 
27
  from typing import Any
28
 
 
 
 
29
 
30
 
31
 
@@ -312,11 +315,13 @@ The title should be:
312
  3. DISTINCTIVE enough that it wouldn't apply equally well to other "phenomenological" clusters
313
  4. TECHNICALLY PRECISE, using domain-specific terminology where appropriate
314
  5. CONCEPTUALLY FOCUSED on the core specificities of this type of experience
 
315
 
316
 
317
  Constraints:
318
  - Output ONLY the label (no explanation).
319
  - 3–7 words.
 
320
  - No punctuation, no quotes, no extra text.
321
  - Do not explain your reasoning
322
  """
@@ -339,6 +344,23 @@ def _clean_label(x: str) -> str:
339
  x = re.sub(r"[.:\-–—]+$", "", x).strip() # remove trailing punctuation
340
  # enforce "no punctuation" lightly (optional):
341
  x = re.sub(r"[^\w\s]", "", x).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  return x or "Unlabelled"
343
 
344
 
@@ -350,7 +372,7 @@ def generate_labels_via_chat_completion(
350
  config_hash: str,
351
  model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
352
  max_topics: int = 50,
353
- max_docs_per_topic: int = 8,
354
  doc_char_limit: int = 400,
355
  temperature: float = 0.2, #deterministic, stable outputs.
356
  force: bool = False) -> dict[int, str]:
@@ -1057,7 +1079,7 @@ else:
1057
 
1058
  with st.sidebar.expander("BERTopic"):
1059
  nr_topics = st.text_input("nr_topics", value="auto")
1060
- top_n_words = st.slider("top_n_words", 5, 25, 10)
1061
 
1062
  current_config = {
1063
  "embedding_model": selected_embedding_model,
@@ -1246,12 +1268,21 @@ else:
1246
  st.caption("No example prompt stored yet – run LLM labelling to populate this.")
1247
 
1248
  cA, cB, cC = st.columns([1, 1, 2])
1249
- max_topics = cA.slider("Max topics", 5, 120, 40, 5)
1250
- # topic_info = tm.get_topic_info()
1251
- # n_topics_no_outliers = int((topic_info.Topic != -1).sum())
1252
- # max_topics = n_topics_no_outliers
1253
- # st.caption(f"Will label all topics (excluding outliers): {max_topics}")
1254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1255
 
1256
  force = cB.checkbox("Force regenerate", value=False)
1257
 
@@ -1264,6 +1295,7 @@ else:
1264
  config_hash=cfg_hash,
1265
  model_id=model_id,
1266
  max_topics=max_topics,
 
1267
  force=force,
1268
  )
1269
  st.session_state.llm_names = llm_names
@@ -1290,24 +1322,8 @@ else:
1290
  api_map = st.session_state.get("llm_names", {}) or {}
1291
  llm_names = {**default_map, **api_map}
1292
 
1293
- # #option to choose to include outliers or not
1294
- # include_outliers_plot = st.checkbox("Include outliers in plot (-1)", value=False)
1295
- # topics_arr = np.asarray(tm.topics_)
1296
- # labs_all = [final_name_map.get(int(t), "Unlabelled") for t in topics_arr]
1297
- # if include_outliers_plot:
1298
- # final_name_map[-1] = "Outliers"
1299
- # reduced_plot = reduced
1300
- # labs = labs_all
1301
- # else:
1302
- # mask = topics_arr != -1
1303
- # reduced_plot = reduced[mask]
1304
- # labs = list(np.asarray(labs_all)[mask])
1305
-
1306
 
1307
 
1308
-
1309
- #try remove -1 outliers topics
1310
-
1311
  # FIX: Force outliers (Topic -1) to be "Unlabelled" so we can hide them
1312
  labs = []
1313
  for t in tm.topics_:
@@ -1318,6 +1334,10 @@ else:
1318
 
1319
  # VISUALISATION
1320
  st.subheader("Experiential Topics Visualisation")
 
 
 
 
1321
 
1322
  # We pass 'noise_label' and 'noise_color' to grey out the outliers
1323
  fig, _ = datamapplot.create_plot(
@@ -1328,16 +1348,41 @@ else:
1328
  label_font_size=11, # Optional: Adjust font size
1329
  arrowprops={"arrowstyle": "-", "color": "#333333"} # Optional: darker, simpler arrows
1330
  )
 
1331
  st.pyplot(fig)
1332
 
 
 
 
 
 
 
1333
 
1334
- # labs = [llm_names.get(t, "Unlabelled") for t in tm.topics_]
1335
- # ##### ADDED FOR LLM (END)
 
 
1336
 
1337
- # # VISUALISATION
1338
- # st.subheader("Experiential Topics Visualisation")
1339
- # fig, _ = datamapplot.create_plot(reduced, labs)
1340
- # st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1341
 
1342
 
1343
 
@@ -1346,27 +1391,6 @@ else:
1346
 
1347
  st.subheader("Export results (one row per topic)")
1348
 
1349
- # full_reps = tm.get_topics(full=True)
1350
- # llm_reps = full_reps.get("LLM", {})
1351
-
1352
- # llm_names = {}
1353
- # for tid, vals in llm_reps.items():
1354
- # try:
1355
- # llm_names[tid] = (
1356
- # (vals[0][0] or "").strip().strip('"').strip(".")
1357
- # )
1358
- # except Exception:
1359
- # llm_names[tid] = "Unlabelled"
1360
-
1361
- # if not llm_names:
1362
- # st.caption("Note: Using default keyword-based topic names.")
1363
- # llm_names = (
1364
- # tm.get_topic_info().set_index("Topic")["Name"].to_dict()
1365
- # )
1366
-
1367
- # default_map = tm.get_topic_info().set_index("Topic")["Name"].to_dict()
1368
- # api_map = st.session_state.get("llm_names", {}) or {}
1369
- # llm_names = {**default_map, **api_map}
1370
 
1371
  model_docs = getattr(tm, "docs_", None)
1372
  if model_docs is not None and len(docs) != len(model_docs):
@@ -1376,8 +1400,6 @@ else:
1376
  "so you may want to re-run topic modelling before exporting."
1377
  )
1378
 
1379
- # doc_info = tm.get_document_info()[["Document", "Topic"]]
1380
-
1381
  doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
1382
 
1383
  include_outliers = st.checkbox(
 
26
 
27
  from typing import Any
28
 
29
+ from io import BytesIO #Download button for the clustering image
30
+
31
+
32
 
33
 
34
 
 
315
  3. DISTINCTIVE enough that it wouldn't apply equally well to other "phenomenological" clusters
316
  4. TECHNICALLY PRECISE, using domain-specific terminology where appropriate
317
  5. CONCEPTUALLY FOCUSED on the core specificities of this type of experience
318
+ 6. CONCISE and NOUN-PHRASE LIKE (e.g. "body boundary dissolution", not "Experience of body boundary dissolution").
319
 
320
 
321
  Constraints:
322
  - Output ONLY the label (no explanation).
323
  - 3–7 words.
324
+ - Avoid generic wrappers such as "experience of", "phenomenon of", "state of" unless they are absolutely necessary.
325
  - No punctuation, no quotes, no extra text.
326
  - Do not explain your reasoning
327
  """
 
344
  x = re.sub(r"[.:\-–—]+$", "", x).strip() # remove trailing punctuation
345
  # enforce "no punctuation" lightly (optional):
346
  x = re.sub(r"[^\w\s]", "", x).strip()
347
+ # Optional: de-wrap generic "experience/phenomenon/state" wrappers
348
+ # Leading patterns like "Experiential/Experience of ..."
349
+ x = re.sub(
350
+ r"^(Experiential(?:\s+Phenomenon)?|Experience|Experience of|Subjective Experience of|Phenomenon of)\s+",
351
+ "",
352
+ x,
353
+ flags=re.IGNORECASE,
354
+ )
355
+ # Trailing "experience/phenomenon/state"
356
+ x = re.sub(
357
+ r"\s+(experience|experiences|phenomenon|state|states)$",
358
+ "",
359
+ x,
360
+ flags=re.IGNORECASE,
361
+ )
362
+
363
+ x = x.strip()
364
  return x or "Unlabelled"
365
 
366
 
 
372
  config_hash: str,
373
  model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
374
  max_topics: int = 50,
375
+ max_docs_per_topic: int = 10,
376
  doc_char_limit: int = 400,
377
  temperature: float = 0.2, #deterministic, stable outputs.
378
  force: bool = False) -> dict[int, str]:
 
1079
 
1080
  with st.sidebar.expander("BERTopic"):
1081
  nr_topics = st.text_input("nr_topics", value="auto")
1082
+ top_n_words = st.slider("top_n_words", 5, 25, 10, help="for a number N selected, BERTopic with fill the N most statistically significant words for that cluster")
1083
 
1084
  current_config = {
1085
  "embedding_model": selected_embedding_model,
 
1268
  st.caption("No example prompt stored yet – run LLM labelling to populate this.")
1269
 
1270
  cA, cB, cC = st.columns([1, 1, 2])
 
 
 
 
 
1271
 
1272
+ with cA:
1273
+ max_topics = st.slider("Max topics", 5, 120, 40, 5)
1274
+ # max_topics = cA.slider("Max topics", 5, 120, 40, 5)
1275
+
1276
+ with cB:
1277
+ max_docs_per_topic = st.slider(
1278
+ "Docs per topic",
1279
+ min_value=2,
1280
+ max_value=20,
1281
+ value=10,
1282
+ step=1,
1283
+ help="How many representative sentences per topic to show the LLM."
1284
+ )
1285
+ force = st.checkbox("Force regenerate", value=False)
1286
 
1287
  force = cB.checkbox("Force regenerate", value=False)
1288
 
 
1295
  config_hash=cfg_hash,
1296
  model_id=model_id,
1297
  max_topics=max_topics,
1298
+ max_docs_per_topic=max_docs_per_topic,
1299
  force=force,
1300
  )
1301
  st.session_state.llm_names = llm_names
 
1322
  api_map = st.session_state.get("llm_names", {}) or {}
1323
  llm_names = {**default_map, **api_map}
1324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1325
 
1326
 
 
 
 
1327
  # FIX: Force outliers (Topic -1) to be "Unlabelled" so we can hide them
1328
  labs = []
1329
  for t in tm.topics_:
 
1334
 
1335
  # VISUALISATION
1336
  st.subheader("Experiential Topics Visualisation")
1337
+
1338
+ # Build a nice title from the dataset name
1339
+ dataset_title = ds_input.strip() or DATASET_DIR
1340
+ plot_title = f"{dataset_title}: MOSAIC's Experiential Topic Map"
1341
 
1342
  # We pass 'noise_label' and 'noise_color' to grey out the outliers
1343
  fig, _ = datamapplot.create_plot(
 
1348
  label_font_size=11, # Optional: Adjust font size
1349
  arrowprops={"arrowstyle": "-", "color": "#333333"} # Optional: darker, simpler arrows
1350
  )
1351
+ fig.suptitle(plot_title, fontsize=16, y=0.99)
1352
  st.pyplot(fig)
1353
 
1354
+ # --- Download / save visualisation ---
1355
+
1356
+ # Prepare high-res PNG bytes
1357
+ buf = BytesIO()
1358
+ fig.savefig(buf, format="png", dpi=300, bbox_inches="tight")
1359
+ png_bytes = buf.getvalue()
1360
 
1361
+ # Reuse base / gran for a nice filename later (they’re defined below as well)
1362
+ base = os.path.splitext(os.path.basename(CSV_PATH))[0]
1363
+ gran = "sentences" if selected_granularity else "reports"
1364
+ png_name = f"topics_{base}_{gran}_plot.png"
1365
 
1366
+ dl_col, save_col = st.columns(2)
1367
+
1368
+ with dl_col:
1369
+ st.download_button(
1370
+ "Download visualisation as PNG",
1371
+ data=png_bytes,
1372
+ file_name=png_name,
1373
+ mime="image/png",
1374
+ use_container_width=True,
1375
+ )
1376
+ with save_col:
1377
+ if st.button("Save plot to eval/", use_container_width=True):
1378
+ try:
1379
+ plot_path = (EVAL_DIR / png_name).resolve()
1380
+ fig.savefig(plot_path, format="png", dpi=300, bbox_inches="tight")
1381
+ st.success(f"Saved plot → {plot_path}")
1382
+ except Exception as e:
1383
+ st.error(f"Failed to save plot: {e}")
1384
+
1385
+
1386
 
1387
 
1388
 
 
1391
 
1392
  st.subheader("Export results (one row per topic)")
1393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1394
 
1395
  model_docs = getattr(tm, "docs_", None)
1396
  if model_docs is not None and len(docs) != len(model_docs):
 
1400
  "so you may want to re-run topic modelling before exporting."
1401
  )
1402
 
 
 
1403
  doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
1404
 
1405
  include_outliers = st.checkbox(