# RAG-Scraper / app.py (Hugging Face Space by CultriX)
import gradio as gr
import subprocess
import os
import re
import tempfile
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils


def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    # Check for a full GitHub URL
    if "github.com" in url_or_id:
        return True
    # Check for shorthand notation (username/repo)
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    return False
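
# Illustrative examples (assumed behavior, not part of the original file):
#   is_github_repo("yamadashy/repomix")             -> True  (shorthand)
#   is_github_repo("https://github.com/user/repo")  -> True  (full URL)
#   is_github_repo("https://example.com/page")      -> False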


def extract_repo_info(url_or_id):
    """Extract repository owner and name from URL or ID."""
    # Handle full GitHub URLs
    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
    match = re.search(github_url_pattern, url_or_id)
    if match:
        return match.group(1), match.group(2)
    # Handle shorthand notation (username/repo)
    if '/' in url_or_id and not url_or_id.startswith('http'):
        parts = url_or_id.split('/')
        if len(parts) == 2:
            return parts[0], parts[1]
    return None, None
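
# Illustrative examples (assumed behavior, not part of the original file):
#   extract_repo_info("https://github.com/yamadashy/repomix") -> ("yamadashy", "repomix")
#   extract_repo_info("yamadashy/repomix")                    -> ("yamadashy", "repomix")
#   extract_repo_info("https://example.com")                  -> (None, None)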


def is_running_on_huggingface():
    """Check if the app is running on HuggingFace Spaces."""
    # Spaces sets the SPACE_ID environment variable for every running Space.
    return os.environ.get('SPACE_ID') is not None


def check_repomix_installed():
    """Check if Repomix is installed."""
    # If running on HuggingFace Spaces, Repomix is likely not available
    if is_running_on_huggingface():
        return False
    try:
        result = subprocess.run(["npx", "repomix", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False


def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix on the GitHub repository and return the content."""
    try:
        # Create a temporary directory for the output
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
            # Expand shorthand notation (username/repo) into a full URL
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            # Run Repomix
            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]
            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"
            # Read the output file
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                return "Error: Repomix did not generate an output file."
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"


def process_input(url_or_id, depth, input_type="auto"):
    """Process the input based on its type."""
    try:
        # Determine if this is a GitHub repository
        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
        if is_github:
            # Check if running on HuggingFace Spaces
            if is_running_on_huggingface():
                return (
                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                    "This feature requires Node.js and the ability to run npm/npx commands, "
                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
                    "You can still use the web scraping functionality for regular websites, "
                    "or run this application locally to use the Repomix feature."
                )
            # Check if Repomix is installed
            if not check_repomix_installed():
                return (
                    "Repomix is not installed or not accessible. "
                    "Please install it using: npm install -g repomix\n"
                    "Or you can run it without installation using: npx repomix"
                )
            # Process GitHub repository with Repomix
            return run_repomix(url_or_id, output_format="markdown")
        else:
            # Process regular URL with web scraping
            return scrape_and_convert(url_or_id, depth)
    except Exception as e:
        return f"Error: {str(e)}"


def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to the given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(url)
            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"
            # Convert to Markdown
            markdown_content = f"## Extracted from: {url}\n\n"
            markdown_content += Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )
            # If depth > 0, extract internal links and process them recursively
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    if link not in visited_urls:
                        markdown_content += f"\n\n### Extracted from: {link}\n"
                        markdown_content += recursive_scrape(link, current_depth - 1)
            return markdown_content

        # Start the recursive scraping process
        return recursive_scrape(url, depth)
    except Exception as e:
        return f"Error: {str(e)}"


# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URL or GitHub Repository",
                   placeholder="https://example.com or username/repo"),
        gr.Slider(minimum=0, maximum=3, step=1, value=0,
                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)


# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()