File size: 1,986 Bytes
2fe266e
6f54f14
68fd999
 
 
e2ee208
6f54f14
 
68fd999
547533f
8363418
68fd999
 
 
 
2fe266e
 
 
 
 
 
 
 
 
 
8363418
 
6f54f14
a5caccb
 
 
 
 
6f54f14
547533f
68fd999
6f54f14
e2ee208
 
 
547533f
 
861d14f
 
9fbd1cf
861d14f
e2ee208
928dc40
106e459
7e2a479
e2ee208
 
f6d14bf
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
from src.tfidf_search import create_tfidf_search_function

import polars as pl
#from jinja2 import Template

# Absolute prefix stripped from the "file" column before results are returned.
path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"

# Precomputed search artifacts, given as relative paths.
block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"

# Both query functions are built once at import time and reused by every
# request handled by the /search endpoint.
sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    embeddings_df_path=block_embeddings_df_path,
    device="cpu",
)
tfidf_query_docs = create_tfidf_search_function(
    dtm_df_path=doc_tfidf_df_path,
    vectorizer_path=tfidf_vectorizer_path,
    model_name="facebook/fasttext-en-vectors",
)

app = FastAPI()


@app.get("/")
def default():
    """Return a fixed status/version payload for the root route."""
    payload = {"status": "ok", "version": 0.1}
    return payload


@app.get("/search", response_class=JSONResponse)
def search(q: str = Query(..., description="Search query")):
    """Run both searches for *q* and return the blended ranking.

    The TF-IDF and SBERT results are inner-joined on ``file``, so only
    files returned by both searches survive. Their ranks are blended as
    ``0.7 * rank-sbert + 0.3 * rank-tfidf``, rows are sorted ascending by
    that value, and ``confidence`` is ``20.0 / rank-combined`` rounded to
    two decimals.

    Returns:
        A list of dicts with the keys ``file`` (with ``path_prefix``
        stripped) and ``confidence``.
    """
    tfidf_hits = tfidf_query_docs(q)
    sbert_hits = sbert_query_docs(q)

    # Inner join: drop files that only one of the two searches returned.
    merged = sbert_hits.join(tfidf_hits, on="file", how="inner")

    blended_rank = 0.7 * pl.col("rank-sbert") + 0.3 * pl.col("rank-tfidf")
    relative_file = pl.col("file").str.strip_prefix(path_prefix)
    ranked = merged.with_columns(
        blended_rank.alias("rank-combined"),
        relative_file.alias("file"),
    ).sort("rank-combined")

    # NOTE(review): a combined rank of 0 would produce an infinite
    # confidence here -- confirm that both rank columns start at 1.
    confidence = (20.0 / pl.col("rank-combined")).round(2)
    scored = ranked.with_columns(confidence.alias("confidence"))

    return scored.select(["file", "confidence"]).to_dicts()




@app.get("/test")
def echo(query: str):
    """Return the ``query`` parameter unchanged under the ``echo`` key."""
    response = {"echo": query}
    return response