# app.py
from __future__ import annotations
import os
import csv
import json
import itertools
import subprocess
import tempfile
from typing import Optional, Tuple, Literal
import gradio as gr
import markdown_pdf
from pydantic import BaseModel, Field, conint
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
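# rag_scraper supplies the building blocks used below: Scraper fetches HTML,
# Converter turns it into Markdown, and LinkExtractor/URLUtils discover and
# filter internal links.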
# -----------------------------
# Environment (HF cache dir)
# -----------------------------
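# Spaces containers are generally only writable under /tmp, so point the
# Hugging Face cache there before anything tries to download.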
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
"""Return True if `repomix` is available on PATH."""
try:
r = subprocess.run(
["repomix", "--version"],
capture_output=True,
text=True,
check=False,
)
return r.returncode == 0
except Exception:
return False
def run_repomix(
repo_url_or_id: str,
progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
"""Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
progress(0, desc="Starting Repomix…")
try:
with tempfile.TemporaryDirectory() as td:
out_path = os.path.join(td, "repomix-output.md")
repo_url = (
f"https://github.com/{repo_url_or_id}"
if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
else repo_url_or_id
)
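            # Repomix flags used here: --remote processes a remote repo,
            # --style markdown emits Markdown, and --compress shrinks the
            # dump to keep token counts down.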
cmd = [
"repomix",
"--remote",
repo_url,
"--output",
out_path,
"--style",
"markdown",
"--compress",
]
p = subprocess.run(
cmd, capture_output=True, text=True, check=False, encoding="utf-8"
)
progress(0.8, desc="Repomix done.")
if p.returncode != 0:
err = (
f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
)
return f"Error running Repomix:\n{err}", None
if os.path.exists(out_path):
with open(out_path, "r", encoding="utf-8") as f:
return f.read(), out_path
return "Error: Repomix did not produce an output file.", None
except Exception as e:
progress(1, desc="Error")
return f"Error processing GitHub repository: {e}", None
def scrape_and_convert_website(
url: str,
depth: int,
progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
"""Recursively scrape a website and convert visited pages to Markdown."""
progress(0, desc=f"Scraping {url}…")
visited = set()
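    # rec(u, d, n, i): u = current URL, d = remaining depth, n = number of
    # sibling links at this level, i = this link's index (drives the progress bar).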
def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
if u in visited or d < 0:
return ""
visited.add(u)
try:
progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
html = Scraper.fetch_html(u)
except Exception as e:
return f"Error fetching {u}: {e}\n"
md = (
f"## Extracted from: {u}\n\n"
+ Converter.html_to_markdown(
html=html, base_url=u, parser_features="html.parser", ignore_links=True
)
+ "\n\n"
)
if d > 0:
try:
links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
                valid = [
                    link
                    for link in links
                    if URLUtils.is_internal(link, u) and link not in visited
                ]
for j, nxt in enumerate(valid):
md += rec(nxt, d - 1, len(valid), j)
except Exception as e:
md += f"Error extracting links from {u}: {e}\n"
return md
all_md = rec(url, depth)
with tempfile.NamedTemporaryFile(
mode="w+", delete=False, suffix=".md", encoding="utf-8"
) as tmp:
tmp.write(all_md)
return all_md, tmp.name
def convert_to_json(markdown_content: str, source: str) -> str:
    """Wrap Markdown in a small JSON object: {"source": ..., "content": ...}."""
    return json.dumps({"source": source, "content": markdown_content}, indent=2)
def convert_to_csv(markdown_content: str, source: str) -> str:
"""Write a simple 2-column CSV and return its path."""
f = tempfile.NamedTemporaryFile(
mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
)
w = csv.writer(f)
w.writerow(["source", "content"])
w.writerow([source, markdown_content])
f.close()
return f.name
def save_output_to_file(content: str, fmt: str, source: str) -> str:
"""Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
if fmt == "JSON":
data = convert_to_json(content, source)
suffix = ".json"
elif fmt == "CSV":
return convert_to_csv(content, source)
elif fmt == "Text":
data, suffix = content, ".txt"
elif fmt == "PDF":
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
path = tmp_pdf.name
markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
return path
except Exception as e:
print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
data, suffix = content, ".pdf.md"
else:
data, suffix = content, ".md"
with tempfile.NamedTemporaryFile(
mode="w+", delete=False, suffix=suffix, encoding="utf-8"
) as tmp:
tmp.write(data)
return tmp.name
# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
url_or_id: str,
source_type: Literal["Webpage", "GitHub Repository"],
depth: int,
output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
"""
UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
then export as Markdown/JSON/CSV/Text/PDF.
"""
progress(0, desc="Initializing…")
out_path: Optional[str] = None
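    # The helpers above signal failure by returning a string that starts with
    # "Error", so we branch on that prefix rather than raising exceptions.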
if source_type == "GitHub Repository":
if not check_repomix_installed():
return "Repomix is not installed or not accessible.", "", None
raw, _ = run_repomix(url_or_id, progress=progress)
if raw.startswith("Error"):
return raw, "", None
elif source_type == "Webpage":
raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
if raw.startswith("Error"):
return raw, "", None
else:
return "Invalid source type selected.", "", None
try:
progress(0.9, desc=f"Converting to {output_format_selection}…")
out_path = save_output_to_file(raw, output_format_selection, url_or_id)
preview = raw
if output_format_selection == "JSON":
preview = convert_to_json(raw, url_or_id)
elif output_format_selection == "CSV":
try:
with open(out_path, "r", encoding="utf-8") as f:
first_lines = [next(f) for _ in range(5)]
preview = "".join(first_lines) or "[CSV content is empty or very short]"
except StopIteration:
with open(out_path, "r", encoding="utf-8") as f:
preview = f.read() or "[CSV content is empty]"
except Exception as e:
preview = f"[Error reading CSV for preview: {e}]"
elif output_format_selection == "PDF":
from os.path import basename
preview = (
f"[PDF generated. Download to view: "
f"{basename(out_path) if out_path else 'file.pdf'}]"
)
progress(1, desc="Done.")
return f"Successfully processed: {url_or_id}", preview, out_path
except Exception as e:
return f"Error during conversion: {e}", "", None
# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
url_or_id: str = Field(
...,
description=(
"For webpages, a full URL (e.g., https://example.com). "
"For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
),
)
source_type: Literal["Webpage", "GitHub Repository"] = Field(
...,
description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
)
depth: conint(ge=0, le=3) = Field(
...,
description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
)
output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
...,
description="Desired output format for the processed content.",
)
class ProcessResult(BaseModel):
status: str = Field(..., description="Human-readable status line.")
preview: str = Field(
...,
description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
)
file_path: Optional[str] = Field(
None, description="Temp file path for the artifact, or null if not created."
)
def process_input_mcp(args: ProcessArgs) -> ProcessResult:
"""
MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
"""
status, preview, path = process_input_updated(
args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
)
return ProcessResult(status=status, preview=preview, file_path=path)
# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
gr.Markdown("# RAG-Ready Content Scraper")
gr.Markdown(
"Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
)
with gr.Row():
with gr.Column(scale=2):
url_input = gr.Textbox(
label="Enter URL or GitHub Repository ID",
placeholder="https://example.com or owner/repo",
)
source_type_input = gr.Radio(
choices=["Webpage", "GitHub Repository"],
value="Webpage",
label="Select Source Type",
)
depth_input = gr.Slider(
minimum=0,
maximum=3,
step=1,
value=0,
label="Scraping Depth (for Webpages)",
info="0 = only main page. Ignored for GitHub.",
)
output_format_input = gr.Dropdown(
choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
value="Markdown",
label="Select Output Format",
)
submit_button = gr.Button("Process Content", variant="primary")
with gr.Column(scale=3):
status_output = gr.Textbox(label="Status", interactive=False)
preview_output = gr.Code(
label="Preview Content", language="markdown", interactive=False
)
file_download_output = gr.File(
label="Download Processed File", interactive=False
)
gr.Examples(
examples=[
["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
["gradio-app/gradio", "GitHub Repository", 0, "Text"],
[
"https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
"Webpage",
0,
"JSON",
],
],
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output],
fn=process_input_updated,
cache_examples=False,
)
submit_button.click(
fn=process_input_updated,
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output],
)
# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
fn=process_input_mcp,
# Components are placeholders; MCP ignores them and reads the Python types.
# Keep them simple so the tab is usable if someone clicks it.
inputs=gr.JSON(label="ProcessArgs (JSON)"),
outputs=gr.JSON(label="ProcessResult (JSON)"),
title="MCP Tool: process_input_mcp",
description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    flagging_mode="never",
)
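# Example ProcessArgs payload for the MCP tool:
#   {"url_or_id": "gradio-app/gradio", "source_type": "GitHub Repository",
#    "depth": 0, "output_format_selection": "Markdown"}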
# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
if __name__ == "__main__":
# IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
app.queue().launch(share=True, mcp_server=True)