# app.py

from __future__ import annotations

import os
import csv
import json
import subprocess
import tempfile
from typing import Optional, Tuple, Literal

import gradio as gr
import markdown_pdf
from pydantic import BaseModel, Field, conint

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

# -----------------------------
# Environment (HF cache dir)
# -----------------------------
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)


# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
    """Return True if `repomix` is available on PATH."""
    try:
        r = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
        return r.returncode == 0
    except Exception:
        return False


def run_repomix(
    repo_url_or_id: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
    """Run Repomix on a GitHub repo and return combined Markdown (or an error string)."""
    progress(0, desc="Starting Repomix…")
    try:
        with tempfile.TemporaryDirectory() as td:
            out_path = os.path.join(td, "repomix-output.md")
            # Accept either "owner/repo" or a full GitHub URL.
            repo_url = (
                f"https://github.com/{repo_url_or_id}"
                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
                else repo_url_or_id
            )
            cmd = [
                "repomix",
                "--remote",
                repo_url,
                "--output",
                out_path,
                "--style",
                "markdown",
                "--compress",
            ]
            p = subprocess.run(
                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
            )
            progress(0.8, desc="Repomix done.")
            if p.returncode != 0:
                err = (
                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                )
                return f"Error running Repomix:\n{err}", None
            if os.path.exists(out_path):
                # Read before the TemporaryDirectory is cleaned up; the returned
                # path is only valid while this `with` block is alive.
                with open(out_path, "r", encoding="utf-8") as f:
                    return f.read(), out_path
            return "Error: Repomix did not produce an output file.", None
    except Exception as e:
        progress(1, desc="Error")
        return f"Error processing GitHub repository: {e}", None


def scrape_and_convert_website(
    url: str,
    depth: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
    """Recursively scrape a website and convert visited pages to Markdown."""
    progress(0, desc=f"Scraping {url}…")
    visited = set()

    def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
        if u in visited or d < 0:
            return ""
        visited.add(u)
        try:
            progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
            html = Scraper.fetch_html(u)
        except Exception as e:
            return f"Error fetching {u}: {e}\n"
        md = (
            f"## Extracted from: {u}\n\n"
            + Converter.html_to_markdown(
                html=html, base_url=u, parser_features="html.parser", ignore_links=True
            )
            + "\n\n"
        )
        if d > 0:
            try:
                links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
                valid = [
                    l
                    for l in links
                    if URLUtils.is_internal(l, u) and l not in visited
                ]
                for j, nxt in enumerate(valid):
                    md += rec(nxt, d - 1, len(valid), j)
            except Exception as e:
                md += f"Error extracting links from {u}: {e}\n"
        return md

    all_md = rec(url, depth)
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(all_md)
    return all_md, tmp.name
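

# Illustrative usage (placeholder URL, not from the source): depth=0 converts
# only the given page; depth=1 also follows each internal link found on it.
#   md_text, md_path = scrape_and_convert_website("https://example.com", depth=1)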


def convert_to_json(markdown_content: str, source: str) -> str:
    """Wrap Markdown in a tiny JSON envelope."""
    return json.dumps({"source": source, "content": markdown_content}, indent=2)
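

# Example output shape (illustrative values):
#   {
#     "source": "https://example.com",
#     "content": "## Extracted from: https://example.com\n\n..."
#   }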


def convert_to_csv(markdown_content: str, source: str) -> str:
    """Write a simple 2-column CSV and return its path."""
    f = tempfile.NamedTemporaryFile(
        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
    )
    w = csv.writer(f)
    w.writerow(["source", "content"])
    w.writerow([source, markdown_content])
    f.close()
    return f.name


def save_output_to_file(content: str, fmt: str, source: str) -> str:
    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return the file path."""
    if fmt == "JSON":
        data = convert_to_json(content, source)
        suffix = ".json"
    elif fmt == "CSV":
        return convert_to_csv(content, source)
    elif fmt == "Text":
        data, suffix = content, ".txt"
    elif fmt == "PDF":
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                path = tmp_pdf.name
            # markdown-pdf builds the document from Section objects, then saves
            # to a path (there is no convert-from-string one-liner).
            pdf = markdown_pdf.MarkdownPdf(toc_level=2)
            pdf.add_section(markdown_pdf.Section(content))
            pdf.save(path)
            return path
        except Exception as e:
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            data, suffix = content, ".pdf.md"
    else:
        data, suffix = content, ".md"
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
    ) as tmp:
        tmp.write(data)
        return tmp.name


# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
    url_or_id: str,
    source_type: Literal["Webpage", "GitHub Repository"],
    depth: int,
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.
    """
    progress(0, desc="Initializing…")
    out_path: Optional[str] = None

    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            try:
                with open(out_path, "r", encoding="utf-8") as f:
                    first_lines = [next(f) for _ in range(5)]
                preview = "".join(first_lines) or "[CSV content is empty or very short]"
            except StopIteration:
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = f.read() or "[CSV content is empty]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            from os.path import basename

            preview = (
                f"[PDF generated. Download to view: "
                f"{basename(out_path) if out_path else 'file.pdf'}]"
            )
        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path
    except Exception as e:
        return f"Error during conversion: {e}", "", None


# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
    url_or_id: str = Field(
        ...,
        description=(
            "For webpages, a full URL (e.g., https://example.com). "
            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
        ),
    )
    source_type: Literal["Webpage", "GitHub Repository"] = Field(
        ...,
        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
    )
    depth: conint(ge=0, le=3) = Field(
        ...,
        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
    )
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
        ...,
        description="Desired output format for the processed content.",
    )


class ProcessResult(BaseModel):
    status: str = Field(..., description="Human-readable status line.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created."
    )


def process_input_mcp(args: ProcessArgs) -> ProcessResult:
    """
    MCP-friendly tool that accepts/returns Pydantic models (the schema carries field descriptions).
    """
    status, preview, path = process_input_updated(
        args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
    )
    return ProcessResult(status=status, preview=preview, file_path=path)
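

# Illustrative local call (placeholder values): this is the same code path the
# MCP layer ultimately invokes when a client sends a tool request.
#   result = process_input_mcp(
#       ProcessArgs(
#           url_or_id="https://example.com",
#           source_type="Webpage",
#           depth=0,
#           output_format_selection="Markdown",
#       )
#   )
#   print(result.model_dump_json(indent=2))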


# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )
    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="https://example.com or owner/repo",
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type",
            )
            depth_input = gr.Slider(
                minimum=0,
                maximum=3,
                step=1,
                value=0,
                label="Scraping Depth (for Webpages)",
                info="0 = only main page. Ignored for GitHub.",
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format",
            )
            submit_button = gr.Button("Process Content", variant="primary")
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(
                label="Preview Content", language="markdown", interactive=False
            )
            file_download_output = gr.File(
                label="Download Processed File", interactive=False
            )
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            [
                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
                "Webpage",
                0,
                "JSON",
            ],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False,
    )
    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )


# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
    fn=process_input_mcp,
    # Components are placeholders; MCP ignores them and reads the Python types.
    # Keep them simple so the tab is usable if someone clicks it.
    inputs=gr.JSON(label="ProcessArgs (JSON)"),
    outputs=gr.JSON(label="ProcessResult (JSON)"),
    title="MCP Tool: process_input_mcp",
    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    allow_flagging="never",
)

# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])

if __name__ == "__main__":
    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
    app.queue().launch(share=True, mcp_server=True)