# RAG-Scraper / app.py (Hugging Face Space by CultriX)
import gradio as gr
import subprocess
import os
import re
import tempfile
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils


def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    # Check for a full GitHub URL
    if "github.com" in url_or_id:
        return True
    # Check for shorthand notation (username/repo)
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    return False
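
# Illustrative examples (assumed behavior, not part of the original file):
#   is_github_repo("yamadashy/repomix")             -> True  (shorthand)
#   is_github_repo("https://github.com/user/repo")  -> True  (full URL)
#   is_github_repo("https://example.com/page")      -> False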


def extract_repo_info(url_or_id):
    """Extract repository owner and name from URL or ID."""
    # Handle full GitHub URLs
    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
    match = re.search(github_url_pattern, url_or_id)
    if match:
        return match.group(1), match.group(2)
    # Handle shorthand notation (username/repo)
    if '/' in url_or_id and not url_or_id.startswith('http'):
        parts = url_or_id.split('/')
        if len(parts) == 2:
            return parts[0], parts[1]
    return None, None
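
# Illustrative examples (assumed behavior, not part of the original file):
#   extract_repo_info("https://github.com/yamadashy/repomix") -> ("yamadashy", "repomix")
#   extract_repo_info("yamadashy/repomix")                    -> ("yamadashy", "repomix")
#   extract_repo_info("https://example.com")                  -> (None, None)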


def is_running_on_huggingface():
    """Check if the app is running on HuggingFace Spaces."""
    # Spaces sets the SPACE_ID environment variable for every running Space.
    return os.environ.get('SPACE_ID') is not None


def check_repomix_installed():
    """Check if Repomix is installed."""
    # If running on HuggingFace Spaces, Repomix is likely not available
    if is_running_on_huggingface():
        return False
    try:
        result = subprocess.run(["npx", "repomix", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False


def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix on the GitHub repository and return the content."""
    try:
        # Create a temporary directory for the output
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
            # Expand shorthand notation (username/repo) into a full URL
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            # Run Repomix
            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]
            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"
            # Read the output file
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                return "Error: Repomix did not generate an output file."
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"


def process_input(url_or_id, depth, input_type="auto"):
    """Process the input based on its type."""
    try:
        # Determine if this is a GitHub repository
        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
        if is_github:
            # Check if running on HuggingFace Spaces
            if is_running_on_huggingface():
                return (
                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                    "This feature requires Node.js and the ability to run npm/npx commands, "
                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
                    "You can still use the web scraping functionality for regular websites, "
                    "or run this application locally to use the Repomix feature."
                )
            # Check if Repomix is installed
            if not check_repomix_installed():
                return (
                    "Repomix is not installed or not accessible. "
                    "Please install it using: npm install -g repomix\n"
                    "Or you can run it without installation using: npx repomix"
                )
            # Process GitHub repository with Repomix
            return run_repomix(url_or_id, output_format="markdown")
        else:
            # Process regular URL with web scraping
            return scrape_and_convert(url_or_id, depth)
    except Exception as e:
        return f"Error: {str(e)}"


def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to the given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(url)
            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"
            # Convert to Markdown
            markdown_content = f"## Extracted from: {url}\n\n"
            markdown_content += Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )
            # If depth > 0, extract internal links and process them recursively
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    if link not in visited_urls:
                        markdown_content += f"\n\n### Extracted from: {link}\n"
                        markdown_content += recursive_scrape(link, current_depth - 1)
            return markdown_content

        # Start the recursive scraping process
        return recursive_scrape(url, depth)
    except Exception as e:
        return f"Error: {str(e)}"


# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URL or GitHub Repository",
                   placeholder="https://example.com or username/repo"),
        gr.Slider(minimum=0, maximum=3, step=1, value=0,
                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)


# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()