Spaces:
Running
Running
switched to the new search functions using sentence-transformers
Browse files- app.py +24 -3
- src/embeddings_search.py +105 -42
- src/tfidf_search.py +84 -33
app.py
CHANGED
|
@@ -1,12 +1,19 @@
|
|
| 1 |
from fastapi import FastAPI, Request, Query
|
| 2 |
from fastapi.responses import JSONResponse
|
| 3 |
-
from src.
|
| 4 |
-
from src.
|
|
|
|
| 5 |
import polars as pl
|
| 6 |
#from jinja2 import Template
|
| 7 |
|
|
|
|
| 8 |
path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
app = FastAPI()
|
| 11 |
|
| 12 |
|
|
@@ -37,7 +44,21 @@ def square(x: int):
|
|
| 37 |
|
| 38 |
@app.get("/search", response_class=JSONResponse)
|
| 39 |
def search(q: str = Query(..., description="Search query")):
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
res_sbert = sbert_query_docs(q)
|
| 42 |
|
| 43 |
joined = res_sbert.join(res_tfidf, on='file', how = 'inner')
|
|
|
|
| 1 |
from fastapi import FastAPI, Request, Query
|
| 2 |
from fastapi.responses import JSONResponse
|
| 3 |
+
from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
|
| 4 |
+
from src.tfidf_search import create_tfidf_search_function
|
| 5 |
+
|
| 6 |
import polars as pl
|
| 7 |
#from jinja2 import Template
|
| 8 |
|
| 9 |
+
# remove this prefix from the file paths:
|
| 10 |
path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"
|
| 11 |
|
| 12 |
+
# data we will need for search:
|
| 13 |
+
block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
|
| 14 |
+
doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
|
| 15 |
+
tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
|
| 16 |
+
|
| 17 |
app = FastAPI()
|
| 18 |
|
| 19 |
|
|
|
|
| 44 |
|
| 45 |
@app.get("/search", response_class=JSONResponse)
|
| 46 |
def search(q: str = Query(..., description="Search query")):
|
| 47 |
+
block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
|
| 48 |
+
doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
|
| 49 |
+
tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
|
| 50 |
+
|
| 51 |
+
sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
|
| 52 |
+
model_name = "sentence-transformers/all-MiniLM-L6-v2",
|
| 53 |
+
embeddings_df_path = block_embeddings_df_path,
|
| 54 |
+
device = "cpu")
|
| 55 |
+
tfidf_query_docs = create_tfidf_search_function(
|
| 56 |
+
dtm_df_path = doc_tfidf_df_path,
|
| 57 |
+
vectorizer_path = tfidf_vectorizer_path,
|
| 58 |
+
model_name = "facebook/fasttext-en-vectors")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
res_tfidf = tfidf_query_docs(q)
|
| 62 |
res_sbert = sbert_query_docs(q)
|
| 63 |
|
| 64 |
joined = res_sbert.join(res_tfidf, on='file', how = 'inner')
|
src/embeddings_search.py
CHANGED
|
@@ -1,73 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# import packages
|
| 2 |
import numpy as np
|
| 3 |
import polars as pl
|
| 4 |
-
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
from transformers import AutoTokenizer, AutoModel
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device = device)
|
| 16 |
|
| 17 |
-
with torch.no_grad():
|
| 18 |
-
outputs = model(**inputs)
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
|
|
|
|
|
|
|
|
|
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
# Instantiate the sentence-transformer model:
|
| 32 |
-
model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
|
| 33 |
-
sentence_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 34 |
-
sentence_model = AutoModel.from_pretrained(model_name).to(device = my_device)
|
| 35 |
|
| 36 |
-
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
sbert_scores = cosine_similarity(query_embeddings, corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])))
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
'file': corpus_embeddings_df['file'],
|
| 47 |
-
'doc_block_indx': corpus_embeddings_df['doc_block_indx']
|
| 48 |
-
}).group_by("file").agg(pl.col("score").max())
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
|
| 62 |
|
| 63 |
-
query = "plans for raising grant revenue directed to the libraries"
|
| 64 |
-
res_sbert = sbert_query_docs(query)
|
| 65 |
|
| 66 |
|
| 67 |
-
|
|
|
|
|
|
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
|
|
|
|
|
|
| 70 |
|
|
|
|
|
|
|
|
|
|
| 71 |
|
|
|
|
|
|
|
| 72 |
|
|
|
|
|
|
|
| 73 |
|
|
|
|
| 1 |
+
# This script defines functions that search the corpus for blocks that are similar to the query.
|
| 2 |
+
# Loading embeddings of the query had to be changed for deployment in production because
|
| 3 |
+
# my CSVs took too much space for the free tier of HuggingFace spaces.
|
| 4 |
+
|
| 5 |
# import packages
|
| 6 |
import numpy as np
|
| 7 |
import polars as pl
|
|
|
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
from sentence_transformers import SentenceTransformer
|
| 10 |
+
import glob
|
| 11 |
+
from collections.abc import Callable
|
| 12 |
+
import os
|
| 13 |
|
|
|
|
| 14 |
|
| 15 |
+
def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
    """
    Score every corpus file against the query by the cosine similarity of their embeddings.

    Each row of `corpus_embeddings_df` is one block of text. Blocks are scored individually and
    each file is then given the score of its best-matching block.

    Parameters:
    query (str): Text of the query to search for in the documents.
    corpus_embeddings_df (polars.DataFrame): One row per block. Must contain the metadata columns `file` and `doc_block_indx`; every other column is treated as an embedding feature.
    model (sentence_transformers.SentenceTransformer): The model used to encode the query.

    Returns:
    polars.DataFrame: One row per file with columns `file`, `score` (maximum block similarity) and `rank-sbert` (1 = best match), sorted by descending score.
    """
    # cosine_similarity needs 2-D input, so shape the encoded query as a single row.
    # The shape is passed positionally because the `shape=` keyword of np.reshape
    # only exists in NumPy >= 2.1.
    query_embedding = np.reshape(model.encode(query), (1, -1))

    # everything except the two metadata columns is an embedding feature
    block_vectors = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx']))
    block_scores = np.reshape(cosine_similarity(query_embedding, block_vectors), -1)

    # keep the best block score per file, best files first
    ranked = (
        pl.DataFrame({'score': block_scores, 'file': corpus_embeddings_df['file']})
        .group_by("file")
        .agg(pl.col("score").max())
        .sort("score", descending=True)
    )

    # attach a 1-based rank that follows the sorted order
    return ranked.with_columns(pl.Series("rank-sbert", list(range(1, ranked.height + 1))))
|
|
|
|
| 40 |
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares query text to the corpus by matching vector space embeddings.

    The returned function closes over the corpus embeddings and the model, so callers only
    need to supply the query string.

    Parameters:
    corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each block in the corpus, plus the metadata columns expected by `sbert_query`.
    model (sentence_transformers.SentenceTransformer): The model used to estimate embeddings.

    Returns:
    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """

    def do_sbert_query(query: str) -> pl.DataFrame:
        """
        Compare the query to the corpus.

        Parameters:
        query (str): The query with which to search the corpus.

        Returns:
        polars.DataFrame: Corpus documents ranked by their match to the query.
        """
        return sbert_query(query, corpus_embeddings_df, model)

    # hand the closure back to the caller; without this the factory returns None
    return do_sbert_query
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
+
def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
    """
    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.

    Parameters:
    embeddings_dir (str): Directory to search. Every file in it whose name starts with `block-embeddings` is read as a CSV.

    Returns:
    polars.DataFrame: Data frame of the vector space embeddings for all documents in the corpus. Size is (paragraphs, features) plus two columns of metadata (`file` and `doc_block_indx` [aka within-document paragraph index].)

    Raises:
    FileNotFoundError: If no matching file exists in `embeddings_dir`.
    """
    # glob returns matches in arbitrary order; sort so the row order is reproducible
    files = sorted(glob.glob(os.path.join(embeddings_dir, "block-embeddings") + "*"))

    # fail with a clear message instead of the opaque error from concatenating nothing
    if not files:
        raise FileNotFoundError(f"No files matching 'block-embeddings*' found in {embeddings_dir!r}")

    block_embeddings_list = []
    for filename in files:
        print("Reading:", filename)
        block_embeddings_list.append(pl.read_csv(filename))

    # stack the per-file frames row-wise
    return pl.concat(block_embeddings_list, how='vertical')
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Build an embeddings-based search function from a directory of embeddings CSVs.

    Parameters:
    model_name (str): Name of the sentence-transformers model used to calculate embeddings.
    embeddings_dir (str): Directory holding the block-embeddings CSV files (read by `load_embeddings_dfs`).
    device (str): Device on which to do the calculations.

    Returns:
    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # set up the encoder and move it to the requested device
    encoder = SentenceTransformer(model_name)
    encoder = encoder.to(device = device)

    # read the corpus embeddings from the CSV files
    corpus_df = load_embeddings_dfs(embeddings_dir)

    # bind both into a single-argument search function
    search_function = sbert_query_factory(corpus_embeddings_df = corpus_df, model = encoder)
    return search_function
|
| 112 |
|
|
|
|
|
|
|
| 113 |
|
| 114 |
|
| 115 |
+
def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
    """
    Build an embeddings-based search function from a single parquet file of embeddings.

    Parameters:
    model_name (str): Name of the sentence-transformers model used to calculate embeddings.
    embeddings_df_path (str): Path to the parquet file holding the block embeddings.
    device (str): Device on which to do the calculations.

    Returns:
    Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
    """
    # set up the encoder and move it to the requested device
    encoder = SentenceTransformer(model_name)
    encoder = encoder.to(device = device)

    # read the corpus embeddings from the parquet file
    corpus_df = pl.read_parquet(embeddings_df_path)

    # bind both into a single-argument search function
    search_function = sbert_query_factory(corpus_embeddings_df = corpus_df, model = encoder)
    return search_function
|
| 136 |
|
src/tfidf_search.py
CHANGED
|
@@ -1,39 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import polars as pl
|
| 2 |
from sklearn.decomposition import TruncatedSVD
|
| 3 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 4 |
from huggingface_hub import hf_hub_download
|
| 5 |
import numpy as np
|
|
|
|
| 6 |
from joblib import load
|
| 7 |
import scipy
|
| 8 |
import fasttext
|
|
|
|
| 9 |
|
| 10 |
-
# define the device where torch calculations take place
|
| 11 |
-
my_device = "cpu"
|
| 12 |
-
|
| 13 |
-
# load the fasttext model
|
| 14 |
-
fasttext_model = fasttext.load_model(hf_hub_download("facebook/fasttext-en-vectors", "model.bin"))
|
| 15 |
-
|
| 16 |
-
# load the TF-IDF and DTM
|
| 17 |
-
my_vectorizer = load("outputs/tfidf_vectorizer_doc_text.joblib")
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
keep_terms = [any(vocab_embeddings[i,] != 0) for i in range(vocab_embeddings.shape[0])]
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
# calculate length of each embedding vector
|
| 29 |
-
vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
|
| 30 |
-
|
| 31 |
-
# get the document-term matrix and project it to 300 pseudo-topics.
|
| 32 |
-
dtm_svd = load("outputs/dtm_svd.joblib")
|
| 33 |
-
X_svd = np.load("outputs/X_svd.npy", allow_pickle=True)
|
| 34 |
-
my_files = load("outputs/my_files.joblib")
|
| 35 |
-
|
| 36 |
-
def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
|
| 37 |
# query embeddings:
|
| 38 |
query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
|
| 39 |
|
|
@@ -42,30 +38,85 @@ def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
|
|
| 42 |
|
| 43 |
# Compute cosine similarity matrix
|
| 44 |
query_similarities = np.dot(query_norm, vocab_norm.T)
|
| 45 |
-
query_tfidf =
|
| 46 |
query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
|
| 47 |
|
| 48 |
# calculate the average TF-IDF score of the query over topics:
|
| 49 |
-
#mean_query_score = np.sum(np.mean(query_weights, axis=0) * dtm_svd_mat, axis=1)
|
| 50 |
mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, shape = (1, -1)), dtm_svd_mat), shape=-1)
|
| 51 |
|
| 52 |
sorted_df = pl.DataFrame(
|
| 53 |
{
|
| 54 |
'score-tfidf': mean_query_score,
|
| 55 |
-
'file':
|
| 56 |
}).sort("score-tfidf", descending = True).with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
|
| 57 |
-
|
| 58 |
-
#
|
| 59 |
return(sorted_df)
|
| 60 |
|
| 61 |
|
| 62 |
|
| 63 |
-
def query_factory(dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10):
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration)
|
| 66 |
|
| 67 |
return do_query
|
| 68 |
|
| 69 |
-
query_docs = query_factory(dtm_svd = dtm_svd, dtm_svd_mat = X_svd, vocab_norm=vocab_norm, concentration = 30)
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This script defines functions that search the corpus for blocks that are similar to the query.
|
| 2 |
+
# Loading embeddings of the query had to be changed for deployment in production because
|
| 3 |
+
# my CSVs took too much space for the free tier of HuggingFace spaces.
|
| 4 |
+
|
| 5 |
import polars as pl
|
| 6 |
from sklearn.decomposition import TruncatedSVD
|
| 7 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
import numpy as np
|
| 10 |
+
from numpy.typing import NDArray
|
| 11 |
from joblib import load
|
| 12 |
import scipy
|
| 13 |
import fasttext
|
| 14 |
+
from collections.abc import Callable
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
+
def query_worker(query: str, fasttext_model: fasttext.FastText._FastText, idf: NDArray[np.float64], dtm_svd: NDArray[np.float64], dtm_svd_mat: NDArray[np.float64], vocab_norm: NDArray[np.float64], concentration: float = 10 ) -> pl.DataFrame:
|
| 18 |
+
"""
|
| 19 |
+
Calculate the cosine similarity of the query to each block of text from the corpus.
|
|
|
|
| 20 |
|
| 21 |
+
Parameters:
|
| 22 |
+
query (str): Search query
|
| 23 |
+
fasttext_model (fasttext.FastText._FastText):
|
| 24 |
+
idf (numpy.ndarray):
|
| 25 |
+
dtm_svd (numpy.ndarray):
|
| 26 |
+
dtm_svd_mat (numpy.ndarray):
|
| 27 |
+
vocab_norm (numpy.ndarray):
|
| 28 |
+
concentration (float):
|
| 29 |
+
Returns:
|
| 30 |
+
polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
|
| 31 |
+
"""
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# query embeddings:
|
| 34 |
query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
|
| 35 |
|
|
|
|
| 38 |
|
| 39 |
# Compute cosine similarity matrix
|
| 40 |
query_similarities = np.dot(query_norm, vocab_norm.T)
|
| 41 |
+
query_tfidf = idf * scipy.special.softmax(query_similarities * concentration, axis = 1)
|
| 42 |
query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
|
| 43 |
|
| 44 |
# calculate the average TF-IDF score of the query over topics:
|
|
|
|
| 45 |
mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, shape = (1, -1)), dtm_svd_mat), shape=-1)
|
| 46 |
|
| 47 |
sorted_df = pl.DataFrame(
|
| 48 |
{
|
| 49 |
'score-tfidf': mean_query_score,
|
| 50 |
+
'file':my_df['file']
|
| 51 |
}).sort("score-tfidf", descending = True).with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
|
| 52 |
+
|
| 53 |
+
#return the sorted results
|
| 54 |
return(sorted_df)
|
| 55 |
|
| 56 |
|
| 57 |
|
| 58 |
+
def query_factory(dtm_svd: TruncatedSVD, dtm_svd_mat: NDArray[np.float64], vocab_norm: NDArray[np.float64], concentration: float = 10, *, fasttext_model: "fasttext.FastText._FastText | None" = None, idf: "NDArray[np.float64] | None" = None) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that will compare query text to the documents in the corpus.

    Parameters:
    dtm_svd (sklearn.decomposition.TruncatedSVD): Fitted decomposition; `query_worker` calls its `transform` on the query's term weights.
    dtm_svd_mat (numpy.ndarray): Projected corpus matrix that the projected query is compared against by cosine similarity.
    vocab_norm (numpy.ndarray): Length-normalized vocabulary embeddings, one row per term.
    concentration (float): Multiplier applied to the query/vocabulary similarities before the softmax.
    fasttext_model (fasttext.FastText._FastText): Model used to embed each query term. Required.
    idf (numpy.ndarray): IDF weights multiplied element-wise with the softmax output. Required.

    Returns:
    Callable[[str], pl.DataFrame]: Function that takes a query string and returns the ranked results from `query_worker`.

    Raises:
    ValueError: If `fasttext_model` or `idf` is not supplied.
    """
    # query_worker cannot score anything without these two, so fail when the
    # factory is built rather than on the first query
    if fasttext_model is None or idf is None:
        raise ValueError("query_factory requires both `fasttext_model` and `idf`")

    def do_query(query: str) -> pl.DataFrame:
        """
        Call the worker that compares the query term distribution to the documents in the corpus

        Parameters:
        query (str): Text to compare to the documents

        Returns:
        polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
        """
        # pass by keyword so each value reaches the matching query_worker parameter
        # NOTE(review): query_worker reads `my_df['file']`, which is not one of its
        # parameters; confirm where that name is meant to come from.
        return query_worker(
            query,
            fasttext_model=fasttext_model,
            idf=idf,
            dtm_svd=dtm_svd,
            dtm_svd_mat=dtm_svd_mat,
            vocab_norm=vocab_norm,
            concentration=concentration,
        )

    return do_query


def create_tfidf_search_function(dtm_df_path: str, vectorizer_path: str, model_name: str = "facebook/fasttext-en-vectors", concentration: float = 30) -> Callable[[str], pl.DataFrame]:
    """
    Create a function that compares the word distribution in a query to each document in the corpus.

    Parameters:
    dtm_df_path (str): Path to a TF-IDF document-term matrix (DTM) for the corpus in parquet format.
    vectorizer_path (str): Path to the saved vectorizer that generated the DTM saved at `dtm_df_path`. We expect that the vectorizer was dumped to disk by `joblib`.
    model_name (str): Name of a model on HuggingFace that generates word embeddings (default is 'facebook/fasttext-en-vectors'.)
    concentration (float): Multiplier applied to query/vocabulary similarities before the softmax (default is 30).

    Returns:
    callable: Function that compares the query string to the corpus.
    """
    # load the fasttext model
    # NOTE(review): "model.bin" is the file name this module used before for the
    # same default repository; confirm it if `model_name` points elsewhere.
    fasttext_model = fasttext.load_model(hf_hub_download(model_name, "model.bin"))

    # load the TF-IDF and DTM
    # NOTE: joblib.load unpickles the file, so only point this at trusted files.
    my_df = pl.read_parquet(dtm_df_path)
    my_vectorizer = load(vectorizer_path)

    # vocab embeddings:
    my_vocabulary = my_vectorizer.get_feature_names_out()
    vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])

    # drop terms that have no embeddings in the fasttext model (all-zero vectors):
    keep_terms = np.any(vocab_embeddings != 0, axis=1)
    vocab_embeddings = vocab_embeddings[keep_terms, :]

    # IDF weights of the kept terms as a single row.
    # The shape is passed positionally because the `shape=` keyword of np.reshape
    # only exists in NumPy >= 2.1.
    my_idf = np.reshape(my_vectorizer.idf_[keep_terms], (1, -1))

    # scale every embedding vector to unit length
    vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)

    # get the document-term matrix and project it to 300 pseudo-topics.
    doc_term_mat = my_df.select(pl.exclude(["file"]))[:, keep_terms.tolist()]
    dtm_svd = TruncatedSVD(n_components=300)
    X_svd = dtm_svd.fit_transform(doc_term_mat)

    return query_factory(
        dtm_svd=dtm_svd,
        dtm_svd_mat=X_svd,
        vocab_norm=vocab_norm,
        concentration=concentration,
        fasttext_model=fasttext_model,
        idf=my_idf,
    )
|