import gradio as gr
import subprocess
import os
import re
import tempfile
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils


def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    # Check for GitHub URL
    if "github.com" in url_or_id:
        return True
    # Check for shorthand notation (username/repo)
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    return False
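
# Illustrative behaviour of the heuristic above (traced by hand, not executed);
# note that any "name/name" string matches the shorthand pattern, so plain
# paths can be misread as repositories:
#   is_github_repo("https://github.com/yamadashy/repomix")  -> True
#   is_github_repo("yamadashy/repomix")                     -> True
#   is_github_repo("https://example.com")                   -> False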


def extract_repo_info(url_or_id):
    """Extract repository owner and name from a URL or shorthand ID."""
    # Handle GitHub URLs
    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
    match = re.search(github_url_pattern, url_or_id)
    if match:
        return match.group(1), match.group(2)
    # Handle shorthand notation (username/repo)
    if '/' in url_or_id and not url_or_id.startswith('http'):
        parts = url_or_id.split('/')
        if len(parts) == 2:
            return parts[0], parts[1]
    return None, None
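
# Illustrative examples (traced, not executed):
#   extract_repo_info("https://github.com/yamadashy/repomix")  -> ("yamadashy", "repomix")
#   extract_repo_info("yamadashy/repomix")                     -> ("yamadashy", "repomix")
#   extract_repo_info("not a repo")                            -> (None, None)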


def is_running_on_huggingface():
    """Check if the app is running on HuggingFace Spaces."""
    # HuggingFace Spaces sets the SPACE_ID environment variable
    return os.environ.get('SPACE_ID') is not None


def check_repomix_installed():
    """Check if Repomix is installed."""
    # If running on HuggingFace Spaces, Repomix is likely not available
    if is_running_on_huggingface():
        return False
    try:
        result = subprocess.run(["npx", "repomix", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False
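
# Note: "npx repomix --version" may download the package on first use, so the
# first probe can be slow; later calls hit the local npx cache.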


def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix on the GitHub repository and return the content."""
    try:
        # Create a temporary directory for the output
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
            # Expand shorthand notation into a full GitHub URL
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            # Run Repomix against the remote repository
            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]
            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"
            # Read the output file before the temporary directory is removed
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                return "Error: Repomix did not generate an output file."
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"


def process_input(url_or_id, depth, input_type="auto"):
    """Process the input based on its type."""
    try:
        # Determine if this is a GitHub repository
        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
        if is_github:
            # Check if running on HuggingFace Spaces
            if is_running_on_huggingface():
                return (
                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                    "This feature requires Node.js and the ability to run npm/npx commands, "
                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
                    "You can still use the web scraping functionality for regular websites, "
                    "or run this application locally to use the Repomix feature."
                )
            # Check if Repomix is installed
            if not check_repomix_installed():
                return (
                    "Repomix is not installed or not accessible. "
                    "Please install it using: npm install -g repomix\n"
                    "Or you can run it without installation using: npx repomix"
                )
            # Process GitHub repository with Repomix
            return run_repomix(url_or_id, output_format="markdown")
        else:
            # Process regular URL with web scraping
            return scrape_and_convert(url_or_id, depth)
    except Exception as e:
        return f"Error: {str(e)}"


def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to the given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(url)
            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"
            # Convert to Markdown
            markdown_content = f"## Extracted from: {url}\n\n"
            markdown_content += Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )
            # If depth remains, extract internal links and process them
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    if link not in visited_urls:
                        markdown_content += f"\n\n### Extracted from: {link}\n"
                        markdown_content += recursive_scrape(link, current_depth - 1)
            return markdown_content

        # Start the recursive scraping process
        return recursive_scrape(url, depth)
    except Exception as e:
        return f"Error: {str(e)}"


# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URL or GitHub Repository",
                   placeholder="https://example.com or username/repo"),
        gr.Slider(minimum=0, maximum=3, step=1, value=0,
                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()