import os
import json
import pickle
from typing import List, Dict, Tuple, Optional
from pathlib import Path

import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from rank_bm25 import BM25Okapi
import chromadb


class QueryExpander:
    """Generates alternative phrasings of a query with an LLM."""

    def __init__(self, openai_client):
        self.client = openai_client

    def expand_query(self, query: str, num_variations: int = 3) -> List[str]:
        prompt = f"""Given this user query, generate {num_variations} alternative phrasings that capture the same intent but use different words.

Original query: {query}

Return ONLY a JSON array of alternative queries, nothing else.
Example: ["query1", "query2", "query3"]"""
        try:
            response = self.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
            )
            content = response.choices[0].message.content.strip()
            # The model may wrap the array in a Markdown code fence; strip it
            # before parsing.
            if content.startswith("```"):
                content = content.strip("`").removeprefix("json").strip()
            variations = json.loads(content)
            if not isinstance(variations, list):
                raise ValueError("Expected a JSON array of strings")
            return [query] + variations
        except Exception as e:
            print(f"Query expansion failed: {e}")
            return [query]  # Fall back to the original query only
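

# Minimal QueryExpander usage sketch (assumes an OpenAI client built from a
# valid OPENAI_API_KEY; the sample query and output are illustrative only):
#
#   from openai import OpenAI
#   expander = QueryExpander(OpenAI())
#   print(expander.expand_query("how do I reset my password?"))
#   # -> ['how do I reset my password?', 'steps to reset a password', ...]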


class HybridRetriever:
    """Combines BM25 keyword search with dense retrieval and cross-encoder reranking."""

    def __init__(
        self,
        embedding_model: str = "all-MiniLM-L6-v2",
        reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2",
        data_dir: str = "data",
    ):
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(exist_ok=True)
        print("Loading embedding model...")
        self.embedder = SentenceTransformer(embedding_model)
        print("Loading reranker model...")
        self.reranker = CrossEncoder(reranker_model)
        self.chroma_client = chromadb.PersistentClient(path=str(self.data_dir / "vector_store"))
        self.documents: List[Dict] = []
        self.doc_by_id: Dict[str, Dict] = {}  # id -> chunk, for O(1) lookups
        self.bm25: Optional[BM25Okapi] = None
        self.collection = None

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into word-based chunks of `chunk_size` words, with
        `overlap` words shared between neighbouring chunks."""
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")
        words = text.split()
        chunks = []
        for i in range(0, len(words), chunk_size - overlap):
            chunk = ' '.join(words[i:i + chunk_size])
            if chunk:
                chunks.append(chunk)
        return chunks

    def index_documents(self, documents: Dict[str, str], chunk_size: int = 500, overlap: int = 50, collection_name: str = "knowledge_base"):
        print(f"Indexing documents with chunk_size={chunk_size}, overlap={overlap}")
        all_chunks = []
        for doc_id, content in documents.items():
            chunks = self.chunk_text(content, chunk_size, overlap)
            for idx, chunk in enumerate(chunks):
                all_chunks.append({
                    "id": f"{doc_id}_{idx}",
                    "text": chunk,
                    "source": doc_id,
                    "chunk_idx": idx,
                })
        if not all_chunks:
            raise ValueError("No text chunks created from documents. Please check your document content.")
        self.documents = all_chunks
        self.doc_by_id = {doc["id"]: doc for doc in all_chunks}

        print("Building BM25 index...")
        tokenized_docs = [doc["text"].lower().split() for doc in all_chunks]
        self.bm25 = BM25Okapi(tokenized_docs)

        print("Building semantic index...")
        try:
            self.chroma_client.delete_collection(collection_name)
        except Exception:
            pass  # Collection did not exist yet
        self.collection = self.chroma_client.create_collection(name=collection_name, metadata={"hnsw:space": "cosine"})
        batch_size = 100
        for i in range(0, len(all_chunks), batch_size):
            batch = all_chunks[i:i + batch_size]
            texts = [doc["text"] for doc in batch]
            self.collection.add(
                documents=texts,
                # Embed explicitly with self.embedder so the same model is used
                # at index and query time (otherwise Chroma falls back to its
                # own default embedding function).
                embeddings=self.embedder.encode(texts).tolist(),
                ids=[doc["id"] for doc in batch],
                metadatas=[{"source": doc["source"], "chunk_idx": doc["chunk_idx"]} for doc in batch],
            )
        print(f"Indexed {len(all_chunks)} chunks from {len(documents)} documents")
        # Persist the sparse index and chunk list so they can be reloaded later.
        with open(self.data_dir / "bm25_index.pkl", "wb") as f:
            pickle.dump((self.bm25, self.documents), f)

    def retrieve_bm25(self, query: str, top_k: int = 10) -> List[Tuple[Dict, float]]:
        """Keyword retrieval: score every chunk against the query with BM25."""
        if self.bm25 is None:
            return []
        tokenized_query = query.lower().split()
        scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(scores)[::-1][:top_k]
        results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Drop chunks with no term overlap at all
                results.append((self.documents[idx], float(scores[idx])))
        return results

    def retrieve_semantic(self, query: str, top_k: int = 10) -> List[Tuple[Dict, float]]:
        """Dense retrieval: nearest-neighbour search in the Chroma collection."""
        if self.collection is None:
            return []
        results = self.collection.query(
            # Embed the query with the same model used at indexing time.
            query_embeddings=[self.embedder.encode(query).tolist()],
            n_results=top_k,
        )
        retrieved = []
        for i, doc_id in enumerate(results["ids"][0]):
            doc = self.doc_by_id.get(doc_id)
            if doc:
                distance = results["distances"][0][i]
                # Map cosine distance (0 = identical) to a similarity in (0, 1].
                similarity = 1 / (1 + distance)
                retrieved.append((doc, similarity))
        return retrieved

    def retrieve_hybrid(self, query: str, top_k: int = 10, bm25_weight: float = 0.5, semantic_weight: float = 0.5) -> List[Tuple[Dict, float]]:
        """Fuse BM25 and semantic results with a weighted sum of normalized scores."""
        bm25_results = self.retrieve_bm25(query, top_k * 2)
        semantic_results = self.retrieve_semantic(query, top_k * 2)

        def normalize_scores(results):
            # Min-max normalize so BM25 and cosine scores are comparable.
            if not results:
                return {}
            scores = [score for _, score in results]
            max_score, min_score = max(scores), min(scores)
            range_score = max_score - min_score if max_score != min_score else 1.0
            return {doc["id"]: (score - min_score) / range_score for doc, score in results}

        bm25_scores = normalize_scores(bm25_results)
        semantic_scores = normalize_scores(semantic_results)
        all_doc_ids = set(bm25_scores) | set(semantic_scores)
        combined_scores = {
            doc_id: bm25_weight * bm25_scores.get(doc_id, 0.0) + semantic_weight * semantic_scores.get(doc_id, 0.0)
            for doc_id in all_doc_ids
        }
        sorted_ids = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [(self.doc_by_id[doc_id], score) for doc_id, score in sorted_ids if doc_id in self.doc_by_id]

    def rerank(self, query: str, documents: List[Tuple[Dict, float]], top_k: int = 5) -> List[Tuple[Dict, float]]:
        """Re-score candidates with the cross-encoder, which reads each
        (query, passage) pair jointly and is more accurate than bi-encoder scores."""
        if not documents:
            return []
        pairs = [[query, doc["text"]] for doc, _ in documents]
        rerank_scores = self.reranker.predict(pairs)
        reranked = [(doc, float(score)) for (doc, _), score in zip(documents, rerank_scores)]
        reranked.sort(key=lambda x: x[1], reverse=True)
        return reranked[:top_k]

    def retrieve(self, query: str, method: str = "hybrid_rerank", top_k: int = 5, expand_query: bool = False, query_expander: Optional['QueryExpander'] = None, **kwargs) -> List[Dict]:
        queries = [query]
        if expand_query and query_expander:
            queries = query_expander.expand_query(query)
            print(f"Expanded to {len(queries)} queries")
        all_results: Dict[str, Tuple[Dict, float]] = {}
        for q in queries:
            if method == "bm25":
                results = self.retrieve_bm25(q, top_k * 2)
            elif method == "semantic":
                results = self.retrieve_semantic(q, top_k * 2)
            elif method in ("hybrid", "hybrid_rerank"):
                results = self.retrieve_hybrid(q, top_k * 2, kwargs.get("bm25_weight", 0.5), kwargs.get("semantic_weight", 0.5))
            else:
                raise ValueError(f"Unknown method: {method}")
            # Sum scores across query variations, so chunks matched by several
            # phrasings rise to the top.
            for doc, score in results:
                prev = all_results.get(doc["id"], (doc, 0.0))[1]
                all_results[doc["id"]] = (doc, prev + score)
        aggregated = sorted(all_results.values(), key=lambda x: x[1], reverse=True)
        if "rerank" in method:
            print(f"Reranking {len(aggregated)} results...")
            aggregated = self.rerank(query, aggregated[:top_k * 3], top_k)
        else:
            aggregated = aggregated[:top_k]
        return [{"retrieval_score": score, **doc} for doc, score in aggregated]
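
# Retriever-only usage sketch (no LLM required; the corpus is illustrative):
#
#   retriever = HybridRetriever(data_dir="data")
#   retriever.index_documents({"it_faq": "To reset your password, open Settings > Security..."})
#   for hit in retriever.retrieve("password reset", method="hybrid_rerank", top_k=3):
#       print(hit["retrieval_score"], hit["source"], hit["text"][:80])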


class RAGSystem:
    """Ties retrieval and generation together."""

    def __init__(self, openai_client, data_dir: str = "data"):
        self.client = openai_client
        self.retriever = HybridRetriever(data_dir=data_dir)
        self.query_expander = QueryExpander(openai_client)
        self.data_dir = Path(data_dir)

    def load_knowledge_base(self, documents: Dict[str, str], chunk_size: int = 500, overlap: int = 50):
        self.retriever.index_documents(documents, chunk_size, overlap)

    def generate_answer(self, query: str, context: List[Dict], system_prompt: str) -> str:
        context_str = "\n\n".join(
            f"[Source: {doc['source']}, Chunk {doc['chunk_idx']}]\n{doc['text']}" for doc in context
        )
        augmented_prompt = f"""{system_prompt}

## Retrieved Context:
{context_str}

## User Query:
{query}

Please answer the query based on the context provided above."""
        messages = [{"role": "user", "content": augmented_prompt}]
        response = self.client.chat.completions.create(model="gpt-4o-mini", messages=messages, temperature=0.7)
        return response.choices[0].message.content

    def query(self, query: str, system_prompt: str, method: str = "hybrid_rerank", top_k: int = 5, expand_query: bool = False, **kwargs) -> Dict:
        context = self.retriever.retrieve(
            query,
            method=method,
            top_k=top_k,
            expand_query=expand_query,
            query_expander=self.query_expander if expand_query else None,
            **kwargs,
        )
        answer = self.generate_answer(query, context, system_prompt)
        return {"answer": answer, "context": context, "method": method, "query": query}