# app.py
from __future__ import annotations
import os
import csv
import json
import itertools
import subprocess
import tempfile
from typing import Optional, Tuple, Literal
import gradio as gr
import markdown_pdf
from pydantic import BaseModel, Field, conint
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
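# rag_scraper supplies the building blocks used below: Scraper fetches HTML,
# Converter turns it into Markdown, and LinkExtractor/URLUtils discover and
# filter internal links.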
# -----------------------------
# Environment (HF cache dir)
# -----------------------------
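# Spaces containers are generally only writable under /tmp, so point the
# Hugging Face cache there before anything tries to download.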
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
"""Return True if `repomix` is available on PATH."""
try:
r = subprocess.run(
["repomix", "--version"],
capture_output=True,
text=True,
check=False,
)
return r.returncode == 0
except Exception:
return False
def run_repomix(
repo_url_or_id: str,
progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
"""Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
progress(0, desc="Starting Repomix…")
try:
with tempfile.TemporaryDirectory() as td:
out_path = os.path.join(td, "repomix-output.md")
repo_url = (
f"https://github.com/{repo_url_or_id}"
if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
else repo_url_or_id
)
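            # Repomix flags used here: --remote processes a remote repo,
            # --style markdown emits Markdown, and --compress shrinks the
            # dump to keep token counts down.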
cmd = [
"repomix",
"--remote",
repo_url,
"--output",
out_path,
"--style",
"markdown",
"--compress",
]
p = subprocess.run(
cmd, capture_output=True, text=True, check=False, encoding="utf-8"
)
progress(0.8, desc="Repomix done.")
if p.returncode != 0:
err = (
f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
)
return f"Error running Repomix:\n{err}", None
if os.path.exists(out_path):
with open(out_path, "r", encoding="utf-8") as f:
return f.read(), out_path
return "Error: Repomix did not produce an output file.", None
except Exception as e:
progress(1, desc="Error")
return f"Error processing GitHub repository: {e}", None
def scrape_and_convert_website(
url: str,
depth: int,
progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
"""Recursively scrape a website and convert visited pages to Markdown."""
progress(0, desc=f"Scraping {url}…")
visited = set()
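    # rec(u, d, n, i): u = current URL, d = remaining depth, n = number of
    # sibling links at this level, i = this link's index (drives the progress bar).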
def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
if u in visited or d < 0:
return ""
visited.add(u)
try:
progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
html = Scraper.fetch_html(u)
except Exception as e:
return f"Error fetching {u}: {e}\n"
md = (
f"## Extracted from: {u}\n\n"
+ Converter.html_to_markdown(
html=html, base_url=u, parser_features="html.parser", ignore_links=True
)
+ "\n\n"
)
if d > 0:
try:
links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
                valid = [
                    link
                    for link in links
                    if URLUtils.is_internal(link, u) and link not in visited
                ]
for j, nxt in enumerate(valid):
md += rec(nxt, d - 1, len(valid), j)
except Exception as e:
md += f"Error extracting links from {u}: {e}\n"
return md
all_md = rec(url, depth)
with tempfile.NamedTemporaryFile(
mode="w+", delete=False, suffix=".md", encoding="utf-8"
) as tmp:
tmp.write(all_md)
return all_md, tmp.name
def convert_to_json(markdown_content: str, source: str) -> str:
    """Wrap Markdown in a small JSON object: {"source": ..., "content": ...}."""
    return json.dumps({"source": source, "content": markdown_content}, indent=2)
def convert_to_csv(markdown_content: str, source: str) -> str:
"""Write a simple 2-column CSV and return its path."""
f = tempfile.NamedTemporaryFile(
mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
)
w = csv.writer(f)
w.writerow(["source", "content"])
w.writerow([source, markdown_content])
f.close()
return f.name
def save_output_to_file(content: str, fmt: str, source: str) -> str:
"""Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
if fmt == "JSON":
data = convert_to_json(content, source)
suffix = ".json"
elif fmt == "CSV":
return convert_to_csv(content, source)
elif fmt == "Text":
data, suffix = content, ".txt"
elif fmt == "PDF":
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
path = tmp_pdf.name
markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
return path
except Exception as e:
print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
data, suffix = content, ".pdf.md"
else:
data, suffix = content, ".md"
with tempfile.NamedTemporaryFile(
mode="w+", delete=False, suffix=suffix, encoding="utf-8"
) as tmp:
tmp.write(data)
return tmp.name
# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
url_or_id: str,
source_type: Literal["Webpage", "GitHub Repository"],
depth: int,
output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
"""
UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
then export as Markdown/JSON/CSV/Text/PDF.
"""
progress(0, desc="Initializing…")
out_path: Optional[str] = None
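    # The helpers above signal failure by returning a string that starts with
    # "Error", so we branch on that prefix rather than raising exceptions.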
if source_type == "GitHub Repository":
if not check_repomix_installed():
return "Repomix is not installed or not accessible.", "", None
raw, _ = run_repomix(url_or_id, progress=progress)
if raw.startswith("Error"):
return raw, "", None
elif source_type == "Webpage":
raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
if raw.startswith("Error"):
return raw, "", None
else:
return "Invalid source type selected.", "", None
try:
progress(0.9, desc=f"Converting to {output_format_selection}…")
out_path = save_output_to_file(raw, output_format_selection, url_or_id)
preview = raw
if output_format_selection == "JSON":
preview = convert_to_json(raw, url_or_id)
elif output_format_selection == "CSV":
try:
with open(out_path, "r", encoding="utf-8") as f:
first_lines = [next(f) for _ in range(5)]
preview = "".join(first_lines) or "[CSV content is empty or very short]"
except StopIteration:
with open(out_path, "r", encoding="utf-8") as f:
preview = f.read() or "[CSV content is empty]"
except Exception as e:
preview = f"[Error reading CSV for preview: {e}]"
elif output_format_selection == "PDF":
from os.path import basename
preview = (
f"[PDF generated. Download to view: "
f"{basename(out_path) if out_path else 'file.pdf'}]"
)
progress(1, desc="Done.")
return f"Successfully processed: {url_or_id}", preview, out_path
except Exception as e:
return f"Error during conversion: {e}", "", None
# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
url_or_id: str = Field(
...,
description=(
"For webpages, a full URL (e.g., https://example.com). "
"For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
),
)
source_type: Literal["Webpage", "GitHub Repository"] = Field(
...,
description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
)
depth: conint(ge=0, le=3) = Field(
...,
description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
)
output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
...,
description="Desired output format for the processed content.",
)
class ProcessResult(BaseModel):
status: str = Field(..., description="Human-readable status line.")
preview: str = Field(
...,
description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
)
file_path: Optional[str] = Field(
None, description="Temp file path for the artifact, or null if not created."
)
def process_input_mcp(args: ProcessArgs) -> ProcessResult:
"""
MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
"""
status, preview, path = process_input_updated(
args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
)
return ProcessResult(status=status, preview=preview, file_path=path)
# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
gr.Markdown("# RAG-Ready Content Scraper")
gr.Markdown(
"Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
)
with gr.Row():
with gr.Column(scale=2):
url_input = gr.Textbox(
label="Enter URL or GitHub Repository ID",
placeholder="https://example.com or owner/repo",
)
source_type_input = gr.Radio(
choices=["Webpage", "GitHub Repository"],
value="Webpage",
label="Select Source Type",
)
depth_input = gr.Slider(
minimum=0,
maximum=3,
step=1,
value=0,
label="Scraping Depth (for Webpages)",
info="0 = only main page. Ignored for GitHub.",
)
output_format_input = gr.Dropdown(
choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
value="Markdown",
label="Select Output Format",
)
submit_button = gr.Button("Process Content", variant="primary")
with gr.Column(scale=3):
status_output = gr.Textbox(label="Status", interactive=False)
preview_output = gr.Code(
label="Preview Content", language="markdown", interactive=False
)
file_download_output = gr.File(
label="Download Processed File", interactive=False
)
gr.Examples(
examples=[
["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
["gradio-app/gradio", "GitHub Repository", 0, "Text"],
[
"https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
"Webpage",
0,
"JSON",
],
],
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output],
fn=process_input_updated,
cache_examples=False,
)
submit_button.click(
fn=process_input_updated,
inputs=[url_input, source_type_input, depth_input, output_format_input],
outputs=[status_output, preview_output, file_download_output],
)
# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
fn=process_input_mcp,
# Components are placeholders; MCP ignores them and reads the Python types.
# Keep them simple so the tab is usable if someone clicks it.
inputs=gr.JSON(label="ProcessArgs (JSON)"),
outputs=gr.JSON(label="ProcessResult (JSON)"),
title="MCP Tool: process_input_mcp",
description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    flagging_mode="never",
)
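# Example ProcessArgs payload for the MCP tool:
#   {"url_or_id": "gradio-app/gradio", "source_type": "GitHub Repository",
#    "depth": 0, "output_format_selection": "Markdown"}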
# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
if __name__ == "__main__":
# IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
app.queue().launch(share=True, mcp_server=True)