updated for sentences with < N words removal
app.py
CHANGED
@@ -550,6 +550,7 @@ def generate_and_save_embeddings(
     split_sentences,
     device,
     text_col=None,
+    min_words: int = 0,  # for removal of sentences with < N words
 ):
 
     # ---------------------
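Note on the new parameter: `min_words` defaults to 0, and the guard used later in the function treats 0 (or any falsy value) as "filtering disabled", so callers that omit it now get every unit back. This is not quite the old behavior, which always dropped short sentences. A standalone illustration of what the `min_words and min_words > 0` guard evaluates to (values are illustrative):

for min_words in (0, None, 2):
    enabled = bool(min_words and min_words > 0)
    print(min_words, enabled)
# 0 False    -> filter disabled (the new default)
# None False -> also disabled
# 2 True     -> drop units with fewer than 2 words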
@@ -578,15 +579,33 @@ def generate_and_save_embeddings(
     # ---------------------
     # Sentence / report granularity
     # ---------------------
+    # if split_sentences:
+    #     try:
+    #         sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
+    #         docs = [s for s in sentences if len(s.split()) > 2]
+    #     except LookupError as e:
+    #         st.error(f"NLTK tokenizer data not found: {e}")
+    #         st.stop()
+    # else:
+    #     docs = reports
+
+    # change to account for sentence removal when < N words
     if split_sentences:
         try:
             sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
-            docs = [s for s in sentences if len(s.split()) > 2]
         except LookupError as e:
             st.error(f"NLTK tokenizer data not found: {e}")
             st.stop()
+
+        if min_words and min_words > 0:
+            docs = [s for s in sentences if len(s.split()) >= min_words]
+        else:
+            docs = sentences
     else:
-        docs = reports
+        if min_words and min_words > 0:
+            docs = [r for r in reports if len(str(r).split()) >= min_words]
+        else:
+            docs = reports
 
     np.save(docs_file, np.array(docs, dtype=object))
     st.success(f"Prepared {len(docs)} documents")
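The rewritten block tokenizes first and applies the length filter afterwards, and extends the filter to whole reports as well as sentences. A standalone sketch of the same logic outside Streamlit (the `filter_docs` name and the sample reports are hypothetical):

import nltk

nltk.download("punkt", quiet=True)  # data used by sent_tokenize; newer NLTK releases may also need "punkt_tab"

def filter_docs(reports, split_sentences=True, min_words=0):
    # Split into sentences or keep whole reports, then drop units with
    # fewer than min_words words (0 disables the filter entirely).
    if split_sentences:
        units = [s for r in reports for s in nltk.sent_tokenize(r)]
    else:
        units = [str(r) for r in reports]
    if min_words and min_words > 0:
        units = [u for u in units if len(u.split()) >= min_words]
    return units

reports = ["Pump failed. Replaced the seal and restarted.", "OK"]
print(filter_docs(reports, min_words=3))
# ['Replaced the seal and restarted.']  ("Pump failed." has 2 words, "OK" has 1)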
@@ -770,6 +789,17 @@ selected_granularity = st.sidebar.checkbox(
 )
 granularity_label = "sentences" if selected_granularity else "reports"
 
+# preprocessing action: remove sentences with less than N words
+min_words = st.sidebar.slider(
+    f"Remove {granularity_label} shorter than N words",
+    min_value=1,
+    max_value=20,
+    value=2,  # default = 2 words
+    step=1,
+    help="Units (sentences or reports) with fewer words than this will be discarded "
+    "during preprocessing. After changing, click 'Prepare Data for This Configuration'.",
+)
+
 subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
 
 st.sidebar.markdown("---")
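Worth noting: the slider's default of 2 together with the new >= comparison keeps two-word sentences that the old hard-coded `len(s.split()) > 2` filter discarded; setting the slider to 3 reproduces the previous sentence filtering. A quick comparison of the two predicates (the sample sentence is made up):

s = "Pump failed."              # 2 words
old_keep = len(s.split()) > 2   # False: dropped by the old hard-coded rule
new_keep = len(s.split()) >= 2  # True: kept at the new default of 2
print(old_keep, new_keep)       # False True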
@@ -870,6 +900,7 @@ if not os.path.exists(EMBEDDINGS_FILE):
         selected_granularity,
         selected_device,
         text_col=selected_text_column,
+        min_words=min_words,
     )
 
 else:
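End to end, the slider value flows from the sidebar into the embedding-preparation call above; a condensed sketch of the wiring, using only names visible in the diff (the surrounding control flow is abbreviated to a comment):

import streamlit as st

min_words = st.sidebar.slider("Remove sentences shorter than N words", 1, 20, 2, 1)

# Inside the `if not os.path.exists(EMBEDDINGS_FILE):` branch the value is
# forwarded, so a changed slider only takes effect when data is re-prepared:
# generate_and_save_embeddings(..., text_col=selected_text_column, min_words=min_words)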