wbrooks committed on
Commit
68fd999
·
1 Parent(s): 1b85d5f

switched to the new search functions using sentence-transformers

Browse files
Files changed (3) hide show
  1. app.py +24 -3
  2. src/embeddings_search.py +105 -42
  3. src/tfidf_search.py +84 -33
app.py CHANGED
@@ -1,12 +1,19 @@
1
  from fastapi import FastAPI, Request, Query
2
  from fastapi.responses import JSONResponse
3
- from src.do_pca_on_tfidf import query_docs
4
- from src.search_embeddings import sbert_query_docs
 
5
  import polars as pl
6
  #from jinja2 import Template
7
 
 
8
  path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"
9
 
 
 
 
 
 
10
  app = FastAPI()
11
 
12
 
@@ -37,7 +44,21 @@ def square(x: int):
37
 
38
  @app.get("/search", response_class=JSONResponse)
39
  def search(q: str = Query(..., description="Search query")):
40
- res_tfidf = query_docs(q)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  res_sbert = sbert_query_docs(q)
42
 
43
  joined = res_sbert.join(res_tfidf, on='file', how = 'inner')
 
1
  from fastapi import FastAPI, Request, Query
2
  from fastapi.responses import JSONResponse
3
+ from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
4
+ from src.tfidf_search import create_tfidf_search_function
5
+
6
  import polars as pl
7
  #from jinja2 import Template
8
 
9
+ # remove this prefix from the file paths:
10
  path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"
11
 
12
+ # data we will need for search:
13
+ block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
14
+ doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
15
+ tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
16
+
17
  app = FastAPI()
18
 
19
 
 
44
 
45
  @app.get("/search", response_class=JSONResponse)
46
  def search(q: str = Query(..., description="Search query")):
47
+ block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
48
+ doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
49
+ tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
50
+
51
+ sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
52
+ model_name = "sentence-transformers/all-MiniLM-L6-v2",
53
+ embeddings_df_path = block_embeddings_df_path,
54
+ device = "cpu")
55
+ tfidf_query_docs = create_tfidf_search_function(
56
+ dtm_df_path = doc_tfidf_df_path,
57
+ vectorizer_path = tfidf_vectorizer_path,
58
+ model_name = "facebook/fasttext-en-vectors")
59
+
60
+
61
+ res_tfidf = tfidf_query_docs(q)
62
  res_sbert = sbert_query_docs(q)
63
 
64
  joined = res_sbert.join(res_tfidf, on='file', how = 'inner')
src/embeddings_search.py CHANGED
@@ -1,73 +1,136 @@
 
 
 
 
1
  # import packages
2
  import numpy as np
3
  import polars as pl
4
-
5
  from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
6
 
7
- from transformers import AutoTokenizer, AutoModel
8
 
9
- import glob
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- import torch
 
 
 
 
 
12
 
13
- #
14
- def encode(sentences, tokenizer, model, device="mps"):
15
- inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device = device)
16
 
17
- with torch.no_grad():
18
- outputs = model(**inputs)
19
 
20
- # outputs.last_hidden_state = [batch, tokens, hidden_dim]
21
- # mean pooling
22
- embeddings = outputs.last_hidden_state.mean(dim=1)
23
 
24
- return(embeddings)
 
 
25
 
 
 
 
26
 
 
 
 
27
 
28
- # define the device where torch calculations take place
29
- my_device = "cpu"
 
 
 
 
 
 
30
 
31
- # Instantiate the sentence-transformer model:
32
- model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
33
- sentence_tokenizer = AutoTokenizer.from_pretrained(model_name)
34
- sentence_model = AutoModel.from_pretrained(model_name).to(device = my_device)
35
 
36
- block_embeddings_df = pl.read_parquet("outputs/block_embeddings_df.parquet.zstd")
 
 
37
 
38
- def sbert_query(query, corpus_embeddings_df):
39
- query_embeddings = encode(query, tokenizer = sentence_tokenizer, model = sentence_model, device=my_device).cpu().numpy()
40
-
41
- sbert_scores = cosine_similarity(query_embeddings, corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])))
42
 
43
- sorted_df = pl.DataFrame(
44
- {
45
- 'score': np.reshape(sbert_scores, shape=-1),
46
- 'file': corpus_embeddings_df['file'],
47
- 'doc_block_indx': corpus_embeddings_df['doc_block_indx']
48
- }).group_by("file").agg(pl.col("score").max())
49
 
50
- #top_df['file'][0]
51
- return(sorted_df.sort("score", descending = True).with_columns(pl.Series("rank-sbert", [i + 1 for i in range(sorted_df.shape[0])])))
52
 
53
- def sbert_query_factory(corpus_embeddings_df):
54
- def do_sbert_query(my_query):
55
- return sbert_query(my_query, corpus_embeddings_df)
56
-
57
- return do_sbert_query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
59
 
60
- # create a function to run the SBERT queries
61
- sbert_query_docs = sbert_query_factory(block_embeddings_df)
62
 
63
- query = "plans for raising grant revenue directed to the libraries"
64
- res_sbert = sbert_query_docs(query)
65
 
66
 
67
- #res.group_by("file").agg(pl.col("rank").min(), pl.col("score").max()).sort("rank")
 
 
68
 
 
 
 
 
69
 
 
 
70
 
 
 
 
71
 
 
 
72
 
 
 
73
 
 
1
+ # This script defines functions that search the corpus for blocks that are similar to the query.
2
+ # Loading the corpus embeddings had to be changed for deployment in production because
3
+ # the CSVs took too much space for the free tier of HuggingFace Spaces.
4
+
5
  # import packages
6
  import numpy as np
7
  import polars as pl
 
8
  from sklearn.metrics.pairwise import cosine_similarity
9
+ from sentence_transformers import SentenceTransformer
10
+ import glob
11
+ from collections.abc import Callable
12
+ import os
13
 
 
14
 
15
def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Rank the corpus files by the cosine similarity between the query and their best-matching block.

    Parameters:
        query (str): Text of the query to search for in the documents.
        corpus_embeddings_df (polars.DataFrame): One row per block of text. Every column other than
            `file` and `doc_block_indx` is treated as an embedding feature.
        model (sentence_transformers.SentenceTransformer): The model used to encode the query.

    Returns:
        polars.DataFrame: One row per file with the columns `file`, `score` (the highest block
            similarity found in that file) and `rank-sbert` (1 = best match), sorted by descending score.
    """
    # `ndarray.reshape` is used rather than `np.reshape(..., shape=...)` because the
    # `shape` keyword only exists in NumPy >= 2.1.
    query_embedding = np.asarray(model.encode(query)).reshape(1, -1)

    # everything except the two metadata columns is an embedding feature
    block_embeddings = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])).to_numpy()
    block_scores = cosine_similarity(query_embedding, block_embeddings).reshape(-1)

    # a file is scored by its best-matching block
    file_scores = (
        pl.DataFrame({'score': block_scores, 'file': corpus_embeddings_df['file']})
        .group_by("file")
        .agg(pl.col("score").max())
        .sort("score", descending = True)
    )

    # rows are already in score order, so the rank is the 1-based row position
    return file_scores.with_columns(pl.Series("rank-sbert", list(range(1, file_scores.height + 1))))
 
40
 
 
 
41
 
42
def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    Parameters:
        corpus_embeddings_df (polars.DataFrame): Embeddings of the corpus, passed unchanged to `sbert_query`.
        model (sentence_transformers.SentenceTransformer): The model used to encode each query.

    Returns:
        Callable[[str], pl.DataFrame]: Function that takes a query string and returns the result of
            `sbert_query` for that query against the captured corpus and model.
    """

    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Compare the query to the corpus captured by the enclosing factory.

        Parameters:
            query (str): The query with which to search the corpus.

        Returns:
            polars.DataFrame: Corpus documents ranked by their match to the query.
        """
        return sbert_query(query, corpus_embeddings_df, model)

    # Return the closure itself. Previously the factory fell off the end and
    # returned None, so callers could not invoke the search function.
    return do_sbert_query
66
 
 
 
 
 
67
 
68
def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the embeddings data frame by loading all the matching CSVs in a directory.

    Parameters:
        embeddings_dir (str): Directory to search. Every file in it whose name starts with
            `block-embeddings` is read as a CSV.

    Returns:
        polars.DataFrame: The vertical concatenation of all the CSVs that were read.

    Raises:
        ValueError: If no file in `embeddings_dir` matches the `block-embeddings*` pattern.
    """
    pattern = os.path.join(embeddings_dir, "block-embeddings") + "*"

    # glob returns files in arbitrary order; sorting keeps the row order of the
    # result the same from run to run
    files = sorted(glob.glob(pattern))
    if not files:
        # fail here with the offending pattern instead of inside pl.concat([])
        raise ValueError(f"No embeddings files match {pattern!r}")

    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))

    return pl.concat(block_embeddings_list, how = 'vertical')
88
+
89
+
90
+
91
def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Build an embeddings-based search function from a model name and a directory of embeddings CSVs.

    Parameters:
        model_name (str): Name of the sentence-transformer model used to calculate embeddings.
        embeddings_dir (str): Directory passed to `load_embeddings_dfs` to read the corpus embeddings.
        device (str): Device the model is moved to (default "cpu").

    Returns:
        Callable[[str], pl.DataFrame]: The search function produced by `sbert_query_factory` for the
            loaded corpus embeddings and model.
    """
    # load the sentence-transformer model and move it to the requested device
    encoder = SentenceTransformer(model_name).to(device = device)

    # read the corpus embeddings from the CSV files and bind them, with the model, into a search function
    return sbert_query_factory(
        corpus_embeddings_df = load_embeddings_dfs(embeddings_dir),
        model = encoder,
    )
112
 
 
 
113
 
114
 
115
def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Build an embeddings-based search function from a model name and a single parquet file of embeddings.

    Parameters:
        model_name (str): Name of the sentence-transformer model used to calculate embeddings.
        embeddings_df_path (str): Path to the parquet file holding the corpus embeddings.
        device (str): Device the model is moved to (default "cpu").

    Returns:
        Callable[[str], pl.DataFrame]: The search function produced by `sbert_query_factory` for the
            loaded corpus embeddings and model.
    """
    # load the sentence-transformer model and move it to the requested device
    encoder = SentenceTransformer(model_name).to(device = device)

    # read the corpus embeddings from the parquet file and bind them, with the model, into a search function
    return sbert_query_factory(
        corpus_embeddings_df = pl.read_parquet(embeddings_df_path),
        model = encoder,
    )
136
 
src/tfidf_search.py CHANGED
@@ -1,39 +1,35 @@
 
 
 
 
1
  import polars as pl
2
  from sklearn.decomposition import TruncatedSVD
3
  from sklearn.metrics.pairwise import cosine_similarity
4
  from huggingface_hub import hf_hub_download
5
  import numpy as np
 
6
  from joblib import load
7
  import scipy
8
  import fasttext
 
9
 
10
- # define the device where torch calculations take place
11
- my_device = "cpu"
12
-
13
- # load the fasttext model
14
- fasttext_model = fasttext.load_model(hf_hub_download("facebook/fasttext-en-vectors", "model.bin"))
15
-
16
- # load the TF-IDF and DTM
17
- my_vectorizer = load("outputs/tfidf_vectorizer_doc_text.joblib")
18
 
19
- # vocab embeddings:
20
- my_vocabulary = my_vectorizer.get_feature_names_out()
21
- vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])
22
- keep_terms = [any(vocab_embeddings[i,] != 0) for i in range(vocab_embeddings.shape[0])]
23
 
24
- # drop terms that have no embeddings in the fasttext model:
25
- vocab_embeddings = vocab_embeddings[keep_terms, :]
26
- my_vocabulary = my_vocabulary[keep_terms]
 
 
 
 
 
 
 
 
27
 
28
- # calculate length of each embedding vector
29
- vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
30
-
31
- # get the document-term matrix and project it to 300 pseudo-topics.
32
- dtm_svd = load("outputs/dtm_svd.joblib")
33
- X_svd = np.load("outputs/X_svd.npy", allow_pickle=True)
34
- my_files = load("outputs/my_files.joblib")
35
-
36
- def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
37
  # query embeddings:
38
  query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
39
 
@@ -42,30 +38,85 @@ def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
42
 
43
  # Compute cosine similarity matrix
44
  query_similarities = np.dot(query_norm, vocab_norm.T)
45
- query_tfidf = np.reshape(my_vectorizer.idf_[keep_terms], shape=(-1, vocab_norm.shape[0])) * scipy.special.softmax(query_similarities * concentration, axis = 1)
46
  query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
47
 
48
  # calculate the average TF-IDF score of the query over topics:
49
- #mean_query_score = np.sum(np.mean(query_weights, axis=0) * dtm_svd_mat, axis=1)
50
  mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, shape = (1, -1)), dtm_svd_mat), shape=-1)
51
 
52
  sorted_df = pl.DataFrame(
53
  {
54
  'score-tfidf': mean_query_score,
55
- 'file':my_files
56
  }).sort("score-tfidf", descending = True).with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
57
-
58
- #top_df['file'][0]
59
  return(sorted_df)
60
 
61
 
62
 
63
- def query_factory(dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10):
64
- def do_query(query):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  return query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration)
66
 
67
  return do_query
68
 
69
- query_docs = query_factory(dtm_svd = dtm_svd, dtm_svd_mat = X_svd, vocab_norm=vocab_norm, concentration = 30)
70
 
71
- #res_tfidf = query_docs(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This script defines functions that search the corpus for blocks that are similar to the query.
2
+ # Loading the TF-IDF document-term matrix had to be changed for deployment in production because
3
+ # the CSVs took too much space for the free tier of HuggingFace Spaces.
4
+
5
  import polars as pl
6
  from sklearn.decomposition import TruncatedSVD
7
  from sklearn.metrics.pairwise import cosine_similarity
8
  from huggingface_hub import hf_hub_download
9
  import numpy as np
10
+ from numpy.typing import NDArray
11
  from joblib import load
12
  import scipy
13
  import fasttext
14
+ from collections.abc import Callable
15
 
 
 
 
 
 
 
 
 
16
 
17
+ def query_worker(query: str, fasttext_model: fasttext.FastText._FastText, idf: NDArray[np.float64], dtm_svd: NDArray[np.float64], dtm_svd_mat: NDArray[np.float64], vocab_norm: NDArray[np.float64], concentration: float = 10 ) -> pl.DataFrame:
18
+ """
19
+ Calculate the cosine similarity of the query to each block of text from the corpus.
 
20
 
21
+ Parameters:
22
+ query (str): Search query
23
+ fasttext_model (fasttext.FastText._FastText):
24
+ idf (numpy.ndarray):
25
+ dtm_svd (numpy.ndarray):
26
+ dtm_svd_mat (numpy.ndarray):
27
+ vocab_norm (numpy.ndarray):
28
+ concentration (float):
29
+ Returns:
30
+ polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
31
+ """
32
 
 
 
 
 
 
 
 
 
 
33
  # query embeddings:
34
  query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
35
 
 
38
 
39
  # Compute cosine similarity matrix
40
  query_similarities = np.dot(query_norm, vocab_norm.T)
41
+ query_tfidf = idf * scipy.special.softmax(query_similarities * concentration, axis = 1)
42
  query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
43
 
44
  # calculate the average TF-IDF score of the query over topics:
 
45
  mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, shape = (1, -1)), dtm_svd_mat), shape=-1)
46
 
47
  sorted_df = pl.DataFrame(
48
  {
49
  'score-tfidf': mean_query_score,
50
+ 'file':my_df['file']
51
  }).sort("score-tfidf", descending = True).with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
52
+
53
+ #return the sorted results
54
  return(sorted_df)
55
 
56
 
57
 
58
def query_factory(dtm_svd: TruncatedSVD, dtm_svd_mat: NDArray[np.float64], vocab_norm: NDArray[np.float64], concentration: float = 10, idf: NDArray[np.float64] = None, fasttext_model: fasttext.FastText._FastText = None) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that will compare query text to the documents in the corpus.

    Parameters:
        dtm_svd (sklearn.decomposition.TruncatedSVD): Forwarded to `query_worker` as `dtm_svd`.
        dtm_svd_mat (numpy.ndarray): Forwarded to `query_worker` as `dtm_svd_mat`.
        vocab_norm (numpy.ndarray): Forwarded to `query_worker` as `vocab_norm`.
        concentration (float): Forwarded to `query_worker` as `concentration` (default 10).
        idf (numpy.ndarray): Forwarded to `query_worker` as `idf`. Required; the `None` default only
            keeps the existing positional parameters in place.
        fasttext_model (fasttext.FastText._FastText): Forwarded to `query_worker` as `fasttext_model`.
            Required, for the same reason as `idf`.

    Returns:
        Callable[[str], pl.DataFrame]: Function that runs `query_worker` for a query string.

    Raises:
        ValueError: If `idf` or `fasttext_model` is not supplied.
    """
    if idf is None or fasttext_model is None:
        raise ValueError("query_factory requires both `idf` and `fasttext_model`")

    def do_query(query: str) -> pl.DataFrame:
        """
        Call the worker that compares the query term distribution to the documents in the corpus.

        Parameters:
            query (str): Text to compare to the documents.

        Returns:
            polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
        """
        # keyword arguments keep this call aligned with query_worker's parameter names
        return query_worker(
            query,
            fasttext_model=fasttext_model,
            idf=idf,
            dtm_svd=dtm_svd,
            dtm_svd_mat=dtm_svd_mat,
            vocab_norm=vocab_norm,
            concentration=concentration,
        )

    return do_query


def create_tfidf_search_function(dtm_df_path: str, vectorizer_path: str, model_name: str = "facebook/fasttext-en-vectors", *, model_filename: str = "model.bin", n_components: int = 300, concentration: float = 30) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares the word distribution in a query to each document in the corpus.

    Parameters:
        dtm_df_path (str): Path to a TF-IDF document-term matrix (DTM) for the corpus in parquet format,
            with a `file` column plus the term columns.
        vectorizer_path (str): Path to the saved vectorizer that generated the DTM saved at `dtm_df_path`.
            We expect that the vectorizer was dumped to disk by `joblib`.
        model_name (str): Name of a model on HuggingFace that generates word embeddings
            (default is 'facebook/fasttext-en-vectors').
        model_filename (str): Name of the weights file to download from `model_name` (default 'model.bin').
        n_components (int): Number of components kept by the truncated SVD of the DTM (default 300).
        concentration (float): Value passed to `query_factory` as `concentration` (default 30).

    Returns:
        callable: Function that compares the query string to the corpus.
    """
    # load the fasttext model
    fasttext_model = fasttext.load_model(hf_hub_download(model_name, model_filename))

    # load the TF-IDF and DTM
    # NOTE: joblib.load unpickles the file, so `vectorizer_path` must point to a trusted file.
    my_df = pl.read_parquet(dtm_df_path)
    my_vectorizer = load(vectorizer_path)

    # vocab embeddings:
    my_vocabulary = my_vectorizer.get_feature_names_out()
    vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])

    # drop terms whose embedding is all zeros, since they cannot be normalised below
    keep_terms = np.any(vocab_embeddings != 0, axis=1)
    vocab_embeddings = vocab_embeddings[keep_terms, :]

    # IDF weights of the kept terms as a single row; `.reshape` is used because
    # the `shape=` keyword of `np.reshape` needs NumPy >= 2.1
    my_idf = my_vectorizer.idf_[keep_terms].reshape(1, -1)

    # scale each embedding vector to unit length
    vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)

    # get the document-term matrix and project it to `n_components` pseudo-topics.
    # assumes the non-`file` columns are in the vectorizer's vocabulary order — TODO confirm
    doc_term_mat = my_df.select(pl.exclude(["file"])).to_numpy()[:, keep_terms]
    dtm_svd = TruncatedSVD(n_components=n_components)
    X_svd = dtm_svd.fit_transform(doc_term_mat)

    # NOTE(review): query_worker labels its result rows with `my_df['file']`, but `my_df` is local
    # to this function; the file column still needs to be handed to the worker.
    return query_factory(
        dtm_svd=dtm_svd,
        dtm_svd_mat=X_svd,
        vocab_norm=vocab_norm,
        concentration=concentration,
        idf=my_idf,
        fasttext_model=fasttext_model,
    )