Spaces:

wbrooks
/

CoUL-document-search

Running

App Files Files Community

wbrooks commited on 1 day ago

Commit

68fd999

1 Parent(s): 1b85d5f

switched to the new search functions using sentence-transformers

Browse files

Files changed (3) hide show

app.py +24 -3
src/embeddings_search.py +105 -42
src/tfidf_search.py +84 -33

app.py CHANGED Viewed

@@ -1,12 +1,19 @@
 from fastapi import FastAPI, Request, Query
 from fastapi.responses import JSONResponse
-from src.do_pca_on_tfidf import query_docs
-from src.search_embeddings import sbert_query_docs
 import polars as pl
 #from jinja2 import Template
 path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"
 app = FastAPI()
@@ -37,7 +44,21 @@ def square(x: int):
 @app.get("/search", response_class=JSONResponse)
 def search(q: str = Query(..., description="Search query")):
-    res_tfidf = query_docs(q)
     res_sbert = sbert_query_docs(q)
     joined = res_sbert.join(res_tfidf, on='file', how = 'inner')

 from fastapi import FastAPI, Request, Query
 from fastapi.responses import JSONResponse
+from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
+from src.tfidf_search import create_tfidf_search_function
 import polars as pl
 #from jinja2 import Template
+# remove this prefix from the file paths:
 path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"
+# data we will need for search:
+block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
+doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
+tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
 app = FastAPI()
 @app.get("/search", response_class=JSONResponse)
 def search(q: str = Query(..., description="Search query")):
+    block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
+    doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
+    tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
+    sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
+        model_name = "sentence-transformers/all-MiniLM-L6-v2",
+        embeddings_df_path = block_embeddings_df_path,
+        device = "cpu")
+    tfidf_query_docs = create_tfidf_search_function(
+        dtm_df_path = doc_tfidf_df_path,
+        vectorizer_path = tfidf_vectorizer_path,
+        model_name = "facebook/fasttext-en-vectors")
+    res_tfidf = tfidf_query_docs(q)
     res_sbert = sbert_query_docs(q)
     joined = res_sbert.join(res_tfidf, on='file', how = 'inner')

src/embeddings_search.py CHANGED Viewed

@@ -1,73 +1,136 @@
 # import packages
 import numpy as np
 import polars as pl
 from sklearn.metrics.pairwise import cosine_similarity
-from transformers import AutoTokenizer, AutoModel
-import glob
-import torch
-#
-def encode(sentences, tokenizer, model, device="mps"):
-    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(device = device)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    # outputs.last_hidden_state = [batch, tokens, hidden_dim]
-    # mean pooling
-    embeddings = outputs.last_hidden_state.mean(dim=1)
-    return(embeddings)
-# define the device where torch calculations take place
-my_device = "cpu"
-# Instantiate the sentence-transformer model:
-model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
-sentence_tokenizer = AutoTokenizer.from_pretrained(model_name)
-sentence_model = AutoModel.from_pretrained(model_name).to(device = my_device)
-block_embeddings_df = pl.read_parquet("outputs/block_embeddings_df.parquet.zstd")
-def sbert_query(query, corpus_embeddings_df):
-    query_embeddings = encode(query, tokenizer = sentence_tokenizer, model = sentence_model, device=my_device).cpu().numpy()
-    sbert_scores = cosine_similarity(query_embeddings, corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])))
-    sorted_df = pl.DataFrame(
-        {
-            'score': np.reshape(sbert_scores, shape=-1),
-            'file': corpus_embeddings_df['file'],
-            'doc_block_indx': corpus_embeddings_df['doc_block_indx']
-        }).group_by("file").agg(pl.col("score").max())
-    #top_df['file'][0]
-    return(sorted_df.sort("score", descending = True).with_columns(pl.Series("rank-sbert", [i + 1 for i in range(sorted_df.shape[0])])))
-def sbert_query_factory(corpus_embeddings_df):
-    def do_sbert_query(my_query):
-        return sbert_query(my_query, corpus_embeddings_df)
-    return do_sbert_query
-# create a function to run the SBERT queries
-sbert_query_docs = sbert_query_factory(block_embeddings_df)
-query = "plans for raising grant revenue directed to the libraries"
-res_sbert = sbert_query_docs(query)
-#res.group_by("file").agg(pl.col("rank").min(), pl.col("score").max()).sort("rank")

+# This script defines functions that search the corpus for blocks that are similar to the query.
+# Loading the corpus embeddings had to be changed for deployment in production because
+# my CSVs took too much space for the free tier of HuggingFace spaces.
 # import packages
 import numpy as np
 import polars as pl
 from sklearn.metrics.pairwise import cosine_similarity
+from sentence_transformers import SentenceTransformer
+import glob
+from collections.abc import Callable
+import os
+def sbert_query(query: str, corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> pl.DataFrame:
+    """
+    Rank the files in the corpus by the cosine similarity of their best-matching block to the query.
+    Parameters:
+        query (str): Text of the query to search for in the documents.
+        corpus_embeddings_df (polars.DataFrame): One row per block of text. Must contain the columns `file` and `doc_block_indx`; every other column is used as an embedding feature.
+        model (sentence_transformers.SentenceTransformer): The model used to encode the query.
+    Returns:
+        polars.DataFrame: One row per file with the columns `file`, `score` (the highest block similarity within that file) and `rank-sbert` (1 is the best match), sorted by descending `score`.
+    """
+    # Encode the query as a single row. A positional reshape is used because the
+    # `shape=` keyword of np.reshape only exists in numpy >= 2.1.
+    query_embeddings = np.asarray(model.encode(query)).reshape(1, -1)
+    # hand scikit-learn a plain array rather than relying on implicit DataFrame conversion
+    block_embeddings = corpus_embeddings_df.select(pl.exclude(['file', 'doc_block_indx'])).to_numpy()
+    sbert_scores = cosine_similarity(query_embeddings, block_embeddings)
+    # keep only the best-scoring block of each file
+    file_scores = pl.DataFrame(
+        {
+            'score': sbert_scores.reshape(-1),
+            'file': corpus_embeddings_df['file'],
+            'doc_block_indx': corpus_embeddings_df['doc_block_indx']
+        }).group_by("file").agg(pl.col("score").max())
+    # sort the results, attach 1-based ranks, and return
+    ranks = pl.Series("rank-sbert", np.arange(1, file_scores.height + 1))
+    return file_scores.sort("score", descending = True).with_columns(ranks)
+def sbert_query_factory(corpus_embeddings_df: pl.DataFrame, model: SentenceTransformer) -> Callable[[str], pl.DataFrame]:
+    """
+    Create a function that compares query text to the corpus by matching vector space embeddings.
+    Parameters:
+        corpus_embeddings_df (polars.DataFrame): DataFrame containing the embeddings of each block of the corpus, plus the metadata columns `file` and `doc_block_indx`.
+        model (sentence_transformers.SentenceTransformer): The model used to estimate embeddings.
+    Returns:
+        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
+    """
+    def do_sbert_query(query: str) -> pl.DataFrame:
+        """
+        Compare the query to the corpus.
+        Parameters:
+            query (str): The query with which to search the corpus.
+        Returns:
+            polars.DataFrame: Corpus documents ranked by their match to the query.
+        """
+        return sbert_query(query, corpus_embeddings_df, model)
+    # hand the closure back to the caller; without this the factory returns None
+    return do_sbert_query
+def load_embeddings_dfs(embeddings_dir: str = "block-embeddings") -> pl.DataFrame:
+    """
+    Create the paragraph-feature embeddings data frame by loading all the CSVs in a directory.
+    Parameters:
+        embeddings_dir (str): Directory to search. Every file in it whose name starts with "block-embeddings" is read as a CSV.
+    Returns:
+        polars.DataFrame: The rows of all matching CSVs stacked vertically, in sorted filename order. Expected size is (paragraphs, features) plus two columns of metadata (`file` and `doc_block_indx` [aka within-document paragraph index].)
+    Raises:
+        ValueError: If no file in `embeddings_dir` matches the pattern.
+    """
+    # glob returns matches in arbitrary order; sort so the row order is reproducible
+    files = sorted(glob.glob(os.path.join(embeddings_dir, "block-embeddings") + "*"))
+    if not files:
+        raise ValueError(f"No files matching 'block-embeddings*' found in {embeddings_dir!r}")
+    # import the block embeddings
+    block_embeddings_list = []
+    for filename in files:
+        print("Reading:", filename)
+        block_embeddings_list.append(pl.read_csv(filename))
+    return pl.concat(block_embeddings_list, how = 'vertical')
+def create_embeddings_search_function(model_name: str, embeddings_dir: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
+    """
+    Create a function that compares query text to the corpus by matching vector space embeddings.
+    Parameters:
+        model_name (str): Name of model used to calculate embeddings.
+        embeddings_dir (str): Directory holding the block-embeddings CSVs; it is passed to `load_embeddings_dfs`.
+        device (str): Device on which to do the calculations.
+    Returns:
+        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
+    """
+    # Instantiate the sentence-transformer model directly on the requested device
+    # instead of loading it on the default device and moving it afterwards:
+    sentence_model = SentenceTransformer(model_name, device = device)
+    # import the embeddings CSVs
+    block_embeddings_df = load_embeddings_dfs(embeddings_dir)
+    # call the factory to make the search function and return it
+    return sbert_query_factory(corpus_embeddings_df = block_embeddings_df, model = sentence_model)
+def create_embeddings_search_function_from_embeddings_df(model_name: str, embeddings_df_path: str, device : str = "cpu" ) -> Callable[[str], pl.DataFrame]:
+    """
+    Create a function that compares query text to the corpus by matching vector space embeddings that are stored in a single parquet file.
+    Parameters:
+        model_name (str): Name of model used to calculate embeddings.
+        embeddings_df_path (str): Path to the parquet file holding the block embeddings of the corpus.
+        device (str): Device on which to do the calculations.
+    Returns:
+        Callable[[str], pl.DataFrame]: Function to compare the query string to the corpus and return results sorted by the cosine similarity.
+    """
+    # Instantiate the sentence-transformer model directly on the requested device
+    # instead of loading it on the default device and moving it afterwards:
+    sentence_model = SentenceTransformer(model_name, device = device)
+    # import the embeddings from the parquet file
+    block_embeddings_df = pl.read_parquet(embeddings_df_path)
+    # call the factory to make the search function and return it
+    return sbert_query_factory(corpus_embeddings_df = block_embeddings_df, model = sentence_model)

src/tfidf_search.py CHANGED Viewed

@@ -1,39 +1,35 @@
 import polars as pl
 from sklearn.decomposition import TruncatedSVD
 from sklearn.metrics.pairwise import cosine_similarity
 from huggingface_hub import hf_hub_download
 import numpy as np
 from joblib import load
 import scipy
 import fasttext
-# define the device where torch calculations take place
-my_device = "cpu"
-# load the fasttext model
-fasttext_model = fasttext.load_model(hf_hub_download("facebook/fasttext-en-vectors", "model.bin"))
-# load the TF-IDF and DTM
-my_vectorizer = load("outputs/tfidf_vectorizer_doc_text.joblib")
-# vocab embeddings:
-my_vocabulary = my_vectorizer.get_feature_names_out()
-vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])
-keep_terms = [any(vocab_embeddings[i,] != 0) for i in range(vocab_embeddings.shape[0])]
-# drop terms that have no embeddings in the fasttext model:
-vocab_embeddings = vocab_embeddings[keep_terms, :]
-my_vocabulary = my_vocabulary[keep_terms]
-# calculate length of each embedding vector
-vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
-# get the document-term matrix and project it to 300 pseudo-topics.
-dtm_svd = load("outputs/dtm_svd.joblib")
-X_svd = np.load("outputs/X_svd.npy", allow_pickle=True)
-my_files = load("outputs/my_files.joblib")
-def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
     # query embeddings:
     query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
@@ -42,30 +38,85 @@ def query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10 ):
     # Compute cosine similarity matrix
     query_similarities = np.dot(query_norm, vocab_norm.T)
-    query_tfidf = np.reshape(my_vectorizer.idf_[keep_terms], shape=(-1, vocab_norm.shape[0])) * scipy.special.softmax(query_similarities * concentration, axis = 1)
     query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
     # calculate the average TF-IDF score of the query over topics:
-    #mean_query_score = np.sum(np.mean(query_weights, axis=0) * dtm_svd_mat, axis=1)
     mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, shape = (1, -1)), dtm_svd_mat), shape=-1)
     sorted_df = pl.DataFrame(
         {
             'score-tfidf': mean_query_score,
-            'file':my_files
         }).sort("score-tfidf", descending = True).with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
-    #top_df['file'][0]
     return(sorted_df)
-def query_factory(dtm_svd, dtm_svd_mat, vocab_norm, concentration = 10):
-    def do_query(query):
         return query_worker(query, dtm_svd, dtm_svd_mat, vocab_norm, concentration)
     return do_query
-query_docs = query_factory(dtm_svd = dtm_svd, dtm_svd_mat = X_svd, vocab_norm=vocab_norm, concentration = 30)
-#res_tfidf = query_docs(query)

+# This script defines functions that search the corpus for blocks that are similar to the query.
+# Loading embeddings of the query had to be changed for deployment in production because
+# my CSVs took too much space for the free tier of HuggingFace spaces.
 import polars as pl
 from sklearn.decomposition import TruncatedSVD
 from sklearn.metrics.pairwise import cosine_similarity
 from huggingface_hub import hf_hub_download
 import numpy as np
+from numpy.typing import NDArray
 from joblib import load
 import scipy
 import fasttext
+from collections.abc import Callable
+def query_worker(query: str, fasttext_model: fasttext.FastText._FastText, idf: NDArray[np.float64], dtm_svd: NDArray[np.float64], dtm_svd_mat: NDArray[np.float64], vocab_norm: NDArray[np.float64], concentration: float = 10 ) -> pl.DataFrame:
+    """
+    Calculate the cosine similarity of the query to each row of `dtm_svd_mat` and rank the results.
+    Parameters:
+        query (str): Search query. It is split on whitespace and each term is embedded separately.
+        fasttext_model (fasttext.FastText._FastText): Model whose `get_word_vector` method embeds each query term.
+        idf (numpy.ndarray): Weights multiplied element-wise with the softmax of the query-to-vocabulary similarities.
+        dtm_svd (numpy.ndarray): Object whose `transform` method projects the weighted query into the reduced space. NOTE(review): `.transform` is called on it, so this looks like a fitted decomposition rather than an array - confirm the annotation.
+        dtm_svd_mat (numpy.ndarray): Matrix that the projected query is compared against with cosine similarity.
+        vocab_norm (numpy.ndarray): Vocabulary embeddings; the query-to-vocabulary similarities are a dot product with its transpose.
+        concentration (float): Factor applied to the similarities before the softmax.
+    Returns:
+        polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
+    """
     # query embeddings:
     query_embeddings = np.array([fasttext_model.get_word_vector(term) for term in query.split()])
+    # NOTE(review): `query_norm` is not assigned in the lines shown here; presumably it is
+    # the normalized `query_embeddings`, computed in lines elided by the diff - confirm.
     # Compute cosine similarity matrix
     query_similarities = np.dot(query_norm, vocab_norm.T)
+    query_tfidf = idf * scipy.special.softmax(query_similarities * concentration, axis = 1)
     query_weights = np.mean(dtm_svd.transform(query_tfidf), axis=0)
     # calculate the average TF-IDF score of the query over topics:
     mean_query_score = np.reshape(cosine_similarity(np.reshape(query_weights, shape = (1, -1)), dtm_svd_mat), shape=-1)
     sorted_df = pl.DataFrame(
         {
             'score-tfidf': mean_query_score,
+            # NOTE(review): `my_df` is not a parameter of this function - verify where it is expected to come from.
+            'file':my_df['file']
         }).sort("score-tfidf", descending = True).with_columns(pl.Series("rank-tfidf", [i + 1 for i in range(len(mean_query_score))]))
+    #return the sorted results
     return(sorted_df)
+def query_factory(dtm_svd: TruncatedSVD, dtm_svd_mat: NDArray[np.float64], vocab_norm: NDArray[np.float64], concentration: float = 10, fasttext_model: "fasttext.FastText._FastText | None" = None, idf: "NDArray[np.float64] | None" = None) -> Callable[[str], pl.DataFrame]:
+    """
+    Create a function that will compare query text to the documents in the corpus.
+    Parameters:
+        dtm_svd (sklearn.decomposition.TruncatedSVD): Fitted decomposition; the worker calls its `transform` method on the weighted query.
+        dtm_svd_mat (np.ndarray): The corpus after projection by `dtm_svd`.
+        vocab_norm (np.ndarray): Unit-length embeddings of the vocabulary terms.
+        concentration (float): Factor applied to the query-to-vocabulary similarities before the softmax.
+        fasttext_model (fasttext.FastText._FastText): Model used to embed the query terms. Required; the default of None only keeps older positional calls importable.
+        idf (np.ndarray): Inverse document frequency weights of the vocabulary terms. Required, like `fasttext_model`.
+    Returns:
+        Callable[[str], pl.DataFrame]: Function that compares the query string to the corpus.
+    Raises:
+        ValueError: If `fasttext_model` or `idf` is not supplied.
+    """
+    # query_worker needs both of these, so fail here rather than on the first query
+    if fasttext_model is None or idf is None:
+        raise ValueError("query_factory requires both `fasttext_model` and `idf`")
+    def do_query(query: str) -> pl.DataFrame:
+        """
+        Call the worker that compares the query term distribution to the documents in the corpus
+        Parameters:
+            query (str): Text to compare to the documents
+        Returns:
+            polars.DataFrame: Results sorted so that the best matches (according to column `score-tfidf`) are listed first.
+        """
+        # argument order follows query_worker's signature
+        return query_worker(query, fasttext_model, idf, dtm_svd, dtm_svd_mat, vocab_norm, concentration)
+    return do_query
+def create_tfidf_search_function(dtm_df_path: str, vectorizer_path: str, model_name: str = "facebook/fasttext-en-vectors") -> Callable[[str], pl.DataFrame]:
+    """
+    Create a function that compares the word distribution in a query to each document in the corpus.
+    Parameters:
+        dtm_df_path (str): Path to a TF-IDF document-term matrix (DTM) for the corpus in parquet format. It must have a `file` column; every other column is treated as a vocabulary term.
+        vectorizer_path (str): Path to the saved vectorizer that generated the DTM saved at `dtm_df_path`. We expect that the vectorizer was dumped to disk by `joblib`.
+        model_name (str): Name of a model on HuggingFace that generates word embeddings (default is 'facebook/fasttext-en-vectors'.)
+    Returns:
+        callable: Function that compares the query string to the corpus.
+    """
+    # load the fasttext model; the weights file in the default repository is named "model.bin"
+    fasttext_model = fasttext.load_model(hf_hub_download(model_name, "model.bin"))
+    # load the TF-IDF and DTM
+    my_df = pl.read_parquet(dtm_df_path)
+    my_vectorizer = load(vectorizer_path)
+    # vocab embeddings:
+    my_vocabulary = my_vectorizer.get_feature_names_out()
+    vocab_embeddings = np.array([fasttext_model.get_word_vector(term) for term in my_vocabulary])
+    # drop terms that have no embeddings in the fasttext model (all-zero vectors):
+    keep_terms = np.any(vocab_embeddings != 0, axis=1)
+    vocab_embeddings = vocab_embeddings[keep_terms, :]
+    # IDF weights of the kept terms as a single row; positional reshape works on every numpy version
+    my_idf = np.asarray(my_vectorizer.idf_)[keep_terms].reshape(1, -1)
+    # scale each embedding vector to unit length
+    vocab_norm = vocab_embeddings / np.linalg.norm(vocab_embeddings, axis=1, keepdims=True)
+    # get the document-term matrix and project it to 300 pseudo-topics.
+    # assumes the DTM columns are in the same order as the vectorizer's vocabulary - TODO confirm
+    doc_term_mat = my_df.select(pl.exclude(["file"])).to_numpy()[:, keep_terms]
+    dtm_svd = TruncatedSVD(n_components=300)
+    X_svd = dtm_svd.fit_transform(doc_term_mat)
+    # NOTE(review): query_worker builds its `file` column from a name `my_df` that is not
+    # among its parameters, so the file names loaded here do not reach it through this
+    # factory - confirm how the worker is meant to receive them.
+    return query_factory(dtm_svd = dtm_svd, dtm_svd_mat = X_svd, vocab_norm=vocab_norm, concentration = 30, fasttext_model = fasttext_model, idf = my_idf)