romybeaute committed on
Commit
74a069f
·
verified ·
1 Parent(s): 904fb3f

Updated to support removal of sentences with fewer than N words

Browse files
Files changed (1) hide show
  1. app.py +33 -2
app.py CHANGED
@@ -550,6 +550,7 @@ def generate_and_save_embeddings(
550
  split_sentences,
551
  device,
552
  text_col=None,
 
553
  ):
554
 
555
  # ---------------------
@@ -578,15 +579,33 @@ def generate_and_save_embeddings(
578
  # ---------------------
579
  # Sentence / report granularity
580
  # ---------------------
 
 
 
 
 
 
 
 
 
 
 
581
  if split_sentences:
582
  try:
583
  sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
584
- docs = [s for s in sentences if len(s.split()) > 2]
585
  except LookupError as e:
586
  st.error(f"NLTK tokenizer data not found: {e}")
587
  st.stop()
 
 
 
 
 
588
  else:
589
- docs = reports
 
 
 
590
 
591
  np.save(docs_file, np.array(docs, dtype=object))
592
  st.success(f"Prepared {len(docs)} documents")
@@ -770,6 +789,17 @@ selected_granularity = st.sidebar.checkbox(
770
  )
771
  granularity_label = "sentences" if selected_granularity else "reports"
772
 
 
 
 
 
 
 
 
 
 
 
 
773
  subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
774
 
775
  st.sidebar.markdown("---")
@@ -870,6 +900,7 @@ if not os.path.exists(EMBEDDINGS_FILE):
870
  selected_granularity,
871
  selected_device,
872
  text_col=selected_text_column,
 
873
  )
874
 
875
  else:
 
550
  split_sentences,
551
  device,
552
  text_col=None,
553
+ min_words: int = 0, #for removal of sentences with <N words
554
  ):
555
 
556
  # ---------------------
 
579
  # ---------------------
580
  # Sentence / report granularity
581
  # ---------------------
582
+ # if split_sentences:
583
+ # try:
584
+ # sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
585
+ # docs = [s for s in sentences if len(s.split()) > 2]
586
+ # except LookupError as e:
587
+ # st.error(f"NLTK tokenizer data not found: {e}")
588
+ # st.stop()
589
+ # else:
590
+ # docs = reports
591
+
592
+ #change to account for sentence removal when < N words
593
  if split_sentences:
594
  try:
595
  sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
 
596
  except LookupError as e:
597
  st.error(f"NLTK tokenizer data not found: {e}")
598
  st.stop()
599
+
600
+ if min_words and min_words > 0:
601
+ docs = [s for s in sentences if len(s.split()) >= min_words]
602
+ else:
603
+ docs = sentences
604
  else:
605
+ if min_words and min_words > 0:
606
+ docs = [r for r in reports if len(str(r).split()) >= min_words]
607
+ else:
608
+ docs = reports
609
 
610
  np.save(docs_file, np.array(docs, dtype=object))
611
  st.success(f"Prepared {len(docs)} documents")
 
789
  )
790
  granularity_label = "sentences" if selected_granularity else "reports"
791
 
792
+ #preprocessing action: remove sentences with less than N words
793
+ min_words = st.sidebar.slider(
794
+ f"Remove {granularity_label} shorter than N words",
795
+ min_value=1,
796
+ max_value=20,
797
+ value=2, # default = 2 words
798
+ step=1,
799
+ help="Units (sentences or reports) with fewer words than this will be discarded "
800
+ "during preprocessing. After changing, click 'Prepare Data for This Configuration'.",
801
+ )
802
+
803
  subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
804
 
805
  st.sidebar.markdown("---")
 
900
  selected_granularity,
901
  selected_device,
902
  text_col=selected_text_column,
903
+ min_words=min_words,
904
  )
905
 
906
  else: