Spaces:

wbrooks
/

CoUL-document-search

Running

App Files Files Community

CoUL-document-search / app.py

wbrooks

removed debugging messages now that search is working

b6127b6 about 22 hours ago

raw

history blame contribute delete

1.99 kB

	from fastapi import FastAPI, Query
	from fastapi.responses import JSONResponse
	from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
	from src.tfidf_search import create_tfidf_search_function

	import polars as pl
	#from jinja2 import Template

	# Absolute directory prefix that is stripped from the "file" values before
	# search results are returned to the client.
	path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"

	# Locations of the precomputed artifacts handed to the search factories.
	block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
	doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
	tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"

	# Both query callables are created once, at import time, and reused by every
	# request to the /search endpoint.
	sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
	    model_name="sentence-transformers/all-MiniLM-L6-v2",
	    embeddings_df_path=block_embeddings_df_path,
	    device="cpu",
	)
	tfidf_query_docs = create_tfidf_search_function(
	    dtm_df_path=doc_tfidf_df_path,
	    vectorizer_path=tfidf_vectorizer_path,
	    model_name="facebook/fasttext-en-vectors",
	)

	# The ASGI application object that the route decorators below attach to.
	app = FastAPI()


	@app.get("/")
	def default():
	    """Return a fixed status payload for the root route."""
	    health = {"status": "ok", "version": 0.1}
	    return health


	@app.get("/search", response_class=JSONResponse)
	def search(q: str = Query(..., description="Search query")):
	    """Run both search back ends on ``q`` and return one blended ranking.

	    The TF-IDF and SBERT results are inner-joined on ``file``, so only files
	    present in both result sets are kept. The blended rank is
	    ``0.7 * rank-sbert + 0.3 * rank-tfidf``; rows are sorted by that value and
	    each row gets ``confidence = round(20.0 / rank-combined, 2)``.

	    Args:
	        q: The search query text.

	    Returns:
	        A list of dicts with the keys ``file`` (with ``path_prefix`` stripped)
	        and ``confidence``.
	    """
	    tfidf_hits = tfidf_query_docs(q)
	    sbert_hits = sbert_query_docs(q)

	    # Inner join: a file must appear in both result sets to be returned.
	    merged = sbert_hits.join(tfidf_hits, on="file", how="inner")

	    # Column expressions, named separately to keep the pipeline readable.
	    blended_rank = 0.7 * pl.col("rank-sbert") + 0.3 * pl.col("rank-tfidf")
	    relative_file = pl.col("file").str.strip_prefix(path_prefix)
	    # NOTE(review): a combined rank of 0 would make this division infinite;
	    # confirm that the ranks produced by the search functions start at 1.
	    confidence = (20.0 / pl.col("rank-combined")).round(2)

	    ranked = merged.with_columns(
	        blended_rank.alias("rank-combined"),
	        relative_file.alias("file"),
	    ).sort("rank-combined")
	    scored = ranked.with_columns(confidence.alias("confidence"))

	    return scored.select(["file", "confidence"]).to_dicts()




	@app.get("/test")
	def echo(query: str):
	    """Return the received ``query`` value under the key ``"echo"``."""
	    reply = {"echo": query}
	    return reply