Spaces:
Running
Running
File size: 1,986 Bytes
2fe266e 6f54f14 68fd999 e2ee208 6f54f14 68fd999 547533f 8363418 68fd999 2fe266e 8363418 6f54f14 a5caccb 6f54f14 547533f 68fd999 6f54f14 e2ee208 547533f 861d14f 9fbd1cf 861d14f e2ee208 928dc40 106e459 7e2a479 e2ee208 f6d14bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
from src.tfidf_search import create_tfidf_search_function
import polars as pl
# Prefix removed from the `file` column of /search results before they are returned.
path_prefix: str = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"

# Locations of the precomputed artifacts passed to the search-function factories.
block_embeddings_df_path: str = "block_embeddings/block-embeddings.parquet"
doc_tfidf_df_path: str = "block_tfidf/TF-IDF-doc-text.parquet"
tfidf_vectorizer_path: str = "block_tfidf/tfidf_vectorizer_doc_text.joblib"
# Query function built from the block-embeddings parquet, using the MiniLM
# sentence-transformers model on CPU.
sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    embeddings_df_path=block_embeddings_df_path,
    device="cpu",
)

# Query function built from the TF-IDF parquet and the saved vectorizer.
# NOTE(review): model_name names a fastText model here — confirm this is what
# create_tfidf_search_function expects.
tfidf_query_docs = create_tfidf_search_function(
    dtm_df_path=doc_tfidf_df_path,
    vectorizer_path=tfidf_vectorizer_path,
    model_name="facebook/fasttext-en-vectors",
)

# ASGI application object that the route decorators register handlers on.
app = FastAPI()
@app.get("/")
def default():
    # Root route: returns a fixed status/version payload.
    status_payload = {"status": "ok", "version": 0.1}
    return status_payload
@app.get("/search", response_class=JSONResponse)
def search(q: str = Query(..., description="Search query")):
    # Run the required query string `q` through both search functions, blend
    # their ranks, and return one {"file", "confidence"} record per file.
    # Presumably both search functions return polars DataFrames with a `file`
    # column plus `rank-tfidf` / `rank-sbert` — verify against src/.
    res_tfidf = tfidf_query_docs(q)
    res_sbert = sbert_query_docs(q)

    # Column expressions, named up front so the pipeline below reads linearly.
    blended_rank = 0.7 * pl.col("rank-sbert") + 0.3 * pl.col("rank-tfidf")
    trimmed_file = pl.col("file").str.strip_prefix(path_prefix)
    # NOTE(review): a combined rank of 0 would make this infinite — confirm
    # that the rank columns are 1-based.
    confidence_score = (20.0 / pl.col("rank-combined")).round(2)

    # Inner join: only files returned by BOTH searches survive.
    ranked = (
        res_sbert.join(res_tfidf, on="file", how="inner")
        .with_columns(
            blended_rank.alias("rank-combined"),
            trimmed_file.alias("file"),
        )
        .sort("rank-combined")  # lowest combined rank first
    )

    scored = ranked.with_columns(confidence_score.alias("confidence"))
    return scored.select(["file", "confidence"]).to_dicts()
@app.get("/test")
def echo(query: str):
    # Hand the `query` parameter straight back under the "echo" key.
    return dict(echo=query)
|