# app.py
from __future__ import annotations

import csv
import json
import os
import subprocess
import tempfile
from typing import Literal, Optional, Tuple

import gradio as gr
import markdown_pdf
from pydantic import BaseModel, Field, conint

from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.scraper import Scraper
from rag_scraper.utils import URLUtils

# -----------------------------
# Environment (HF cache dir)
# -----------------------------
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)


# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
    """Return True if `repomix` is available on PATH."""
    try:
        r = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
        return r.returncode == 0
    except Exception:
        return False


def run_repomix(
    repo_url_or_id: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
    """Run Repomix on a GitHub repo and return combined Markdown (or an error string)."""
    progress(0, desc="Starting Repomix…")
    try:
        with tempfile.TemporaryDirectory() as td:
            out_path = os.path.join(td, "repomix-output.md")
            # Accept either "owner/repo" or a full URL.
            repo_url = (
                f"https://github.com/{repo_url_or_id}"
                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
                else repo_url_or_id
            )
            cmd = [
                "repomix",
                "--remote", repo_url,
                "--output", out_path,
                "--style", "markdown",
                "--compress",
            ]
            p = subprocess.run(
                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
            )
            progress(0.8, desc="Repomix done.")
            if p.returncode != 0:
                err = f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                return f"Error running Repomix:\n{err}", None
            if os.path.exists(out_path):
                with open(out_path, "r", encoding="utf-8") as f:
                    # NOTE: out_path lives inside the TemporaryDirectory and is
                    # deleted when this block exits, so callers should rely on
                    # the returned text rather than the path.
                    return f.read(), out_path
            return "Error: Repomix did not produce an output file.", None
    except Exception as e:
        progress(1, desc="Error")
        return f"Error processing GitHub repository: {e}", None


def scrape_and_convert_website(
    url: str,
    depth: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
    """Recursively scrape a website and convert visited pages to Markdown."""
    progress(0, desc=f"Scraping {url}…")
    visited = set()

    def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
        if u in visited or d < 0:
            return ""
        visited.add(u)
        try:
            progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
            html = Scraper.fetch_html(u)
        except Exception as e:
            return f"Error fetching {u}: {e}\n"
        md = (
            f"## Extracted from: {u}\n\n"
            + Converter.html_to_markdown(
                html=html, base_url=u, parser_features="html.parser", ignore_links=True
            )
            + "\n\n"
        )
        if d > 0:
            try:
                links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
                valid = [
                    link
                    for link in links
                    if URLUtils.is_internal(link, u) and link not in visited
                ]
                for j, nxt in enumerate(valid):
                    md += rec(nxt, d - 1, len(valid), j)
            except Exception as e:
                md += f"Error extracting links from {u}: {e}\n"
        return md

    all_md = rec(url, depth)
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(all_md)
        return all_md, tmp.name


def convert_to_json(markdown_content: str, source: str) -> str:
    """Wrap Markdown in a tiny JSON envelope."""
    return json.dumps({"source": source, "content": markdown_content}, indent=2)
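
# For reference, convert_to_json produces an envelope like the following
# (illustrative values):
#
#   convert_to_json("# Title\n\nBody text.", "https://example.com")
#   ->
#   {
#     "source": "https://example.com",
#     "content": "# Title\n\nBody text."
#   }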

def convert_to_csv(markdown_content: str, source: str) -> str:
    """Write a simple 2-column CSV and return its path."""
    f = tempfile.NamedTemporaryFile(
        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
    )
    w = csv.writer(f)
    w.writerow(["source", "content"])
    w.writerow([source, markdown_content])
    f.close()
    return f.name


def save_output_to_file(content: str, fmt: str, source: str) -> str:
    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return the file path."""
    if fmt == "JSON":
        data = convert_to_json(content, source)
        suffix = ".json"
    elif fmt == "CSV":
        return convert_to_csv(content, source)
    elif fmt == "Text":
        data, suffix = content, ".txt"
    elif fmt == "PDF":
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                path = tmp_pdf.name
            # markdown_pdf's documented flow: add Markdown sections, then save.
            pdf = markdown_pdf.MarkdownPdf(toc_level=2)
            pdf.add_section(markdown_pdf.Section(content))
            pdf.save(path)
            return path
        except Exception as e:
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            data, suffix = content, ".pdf.md"
    else:  # Markdown (default)
        data, suffix = content, ".md"
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
    ) as tmp:
        tmp.write(data)
        return tmp.name


# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
    url_or_id: str,
    source_type: Literal["Webpage", "GitHub Repository"],
    depth: int,
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.
    """
    progress(0, desc="Initializing…")
    out_path: Optional[str] = None

    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            try:
                with open(out_path, "r", encoding="utf-8") as f:
                    first_lines = [next(f) for _ in range(5)]
                preview = "".join(first_lines) or "[CSV content is empty or very short]"
            except StopIteration:
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = f.read() or "[CSV content is empty]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            preview = (
                f"[PDF generated. Download to view: "
                f"{os.path.basename(out_path) if out_path else 'file.pdf'}]"
            )
        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path
    except Exception as e:
        return f"Error during conversion: {e}", "", None
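
# A quick local smoke test of the pipeline (illustrative; the URL and the
# printed paths are placeholders):
#
#   status, preview, path = process_input_updated(
#       "https://example.com", "Webpage", 0, "Markdown"
#   )
#   print(status)  # "Successfully processed: https://example.com"
#   print(path)    # e.g. "/tmp/tmpab12cd34.md"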

# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
    url_or_id: str = Field(
        ...,
        description=(
            "For webpages, a full URL (e.g., https://example.com). "
            "For GitHub, either owner/repo or a full GitHub URL "
            "(https://github.com/owner/repo)."
        ),
    )
    source_type: Literal["Webpage", "GitHub Repository"] = Field(
        ...,
        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
    )
    depth: conint(ge=0, le=3) = Field(
        ...,
        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
    )
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
        ...,
        description="Desired output format for the processed content.",
    )


class ProcessResult(BaseModel):
    status: str = Field(..., description="Human-readable status line.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created."
    )


def process_input_mcp(args: ProcessArgs) -> ProcessResult:
    """
    MCP-friendly tool that accepts/returns Pydantic models
    (the schema carries the field descriptions).
    """
    if isinstance(args, dict):
        # The fallback JSON tab passes a plain dict; validate it into the model.
        args = ProcessArgs(**args)
    status, preview, path = process_input_updated(
        args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
    )
    return ProcessResult(status=status, preview=preview, file_path=path)


# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )
    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="https://example.com or owner/repo",
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type",
            )
            depth_input = gr.Slider(
                minimum=0,
                maximum=3,
                step=1,
                value=0,
                label="Scraping Depth (for Webpages)",
                info="0 = only main page. Ignored for GitHub.",
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format",
            )
            submit_button = gr.Button("Process Content", variant="primary")
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(
                label="Preview Content", language="markdown", interactive=False
            )
            file_download_output = gr.File(
                label="Download Processed File", interactive=False
            )

    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            [
                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
                "Webpage",
                0,
                "JSON",
            ],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False,
    )

    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )
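
# The MCP tab below accepts a JSON payload matching the ProcessArgs schema.
# An illustrative tool call (values are examples):
#
#   {
#     "url_or_id": "gradio-app/gradio",
#     "source_type": "GitHub Repository",
#     "depth": 0,
#     "output_format_selection": "Markdown"
#   }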

# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
    fn=process_input_mcp,
    # Components are placeholders; MCP ignores them and reads the Python types.
    # Keep them simple so the tab is usable if someone clicks it.
    inputs=gr.JSON(label="ProcessArgs (JSON)"),
    outputs=gr.JSON(label="ProcessResult (JSON)"),
    title="MCP Tool: process_input_mcp",
    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    allow_flagging="never",
)

# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])

if __name__ == "__main__":
    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
    app.queue().launch(share=True, mcp_server=True)
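
# Sketch of a client-side MCP configuration pointing at this server
# (hypothetical host; replace <user>/<space> with your Space, or use the
# local URL printed at launch):
#
#   {
#     "mcpServers": {
#       "rag-scraper": {
#         "url": "https://<user>-<space>.hf.space/gradio_api/mcp/sse"
#       }
#     }
#   }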