import gradio as gr
import subprocess
import os
import re
import tempfile

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils


def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    # Check for GitHub URL
    if "github.com" in url_or_id:
        return True
    # Check for shorthand notation (username/repo)
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    return False
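
# Illustrative checks, matching the two accepted forms above:
#   is_github_repo("yamadashy/repomix")       -> True   (shorthand)
#   is_github_repo("https://github.com/a/b")  -> True   (full URL)
#   is_github_repo("https://example.com")     -> False  (plain website)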


def extract_repo_info(url_or_id):
    """Extract repository owner and name from URL or ID."""
    # Handle GitHub URLs
    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
    match = re.search(github_url_pattern, url_or_id)
    if match:
        return match.group(1), match.group(2)
    # Handle shorthand notation (username/repo)
    if '/' in url_or_id and not url_or_id.startswith('http'):
        parts = url_or_id.split('/')
        if len(parts) == 2:
            return parts[0], parts[1]
    return None, None
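
# Expected results, derived from the two patterns above:
#   extract_repo_info("https://github.com/yamadashy/repomix") -> ("yamadashy", "repomix")
#   extract_repo_info("yamadashy/repomix")                    -> ("yamadashy", "repomix")
#   extract_repo_info("plain text")                           -> (None, None)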


def is_running_on_huggingface():
    """Check if the app is running on HuggingFace Spaces."""
    return os.environ.get('SPACE_ID') is not None
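
# Note: HuggingFace Spaces sets a SPACE_ID environment variable inside each
# Space's container, so its presence is a reasonable (though not guaranteed)
# signal that we are running on Spaces rather than locally.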


def check_repomix_installed():
    """Check if Repomix is installed."""
    # If running on HuggingFace Spaces, Repomix is likely not available
    if is_running_on_huggingface():
        return False
    try:
        result = subprocess.run(["npx", "repomix", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False


def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix on the GitHub repository and return the content."""
    try:
        # Create a temporary directory for the output
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")

            # Expand shorthand notation (username/repo) into a full URL
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id

            # Run Repomix against the remote repository
            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]
            process = subprocess.run(cmd, capture_output=True, text=True, check=False)

            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"

            # Read the output file
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                return "Error: Repomix did not generate an output file."
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"


def process_input(url_or_id, depth, input_type="auto"):
    """Process the input based on its type."""
    try:
        # Determine if this is a GitHub repository
        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")

        if is_github:
            # Check if running on HuggingFace Spaces
            if is_running_on_huggingface():
                return (
                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                    "This feature requires Node.js and the ability to run npm/npx commands, "
                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
                    "You can still use the web scraping functionality for regular websites, "
                    "or run this application locally to use the Repomix feature."
                )

            # Check if Repomix is installed
            if not check_repomix_installed():
                return (
                    "Repomix is not installed or not accessible. "
                    "Please install it using: npm install -g repomix\n"
                    "Or you can run it without installation using: npx repomix"
                )

            # Process GitHub repository with Repomix
            return run_repomix(url_or_id, output_format="markdown")
        else:
            # Process regular URL with web scraping
            return scrape_and_convert(url_or_id, depth)
    except Exception as e:
        return f"Error: {str(e)}"


def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to the given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(url)

            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"

            # Convert to Markdown
            markdown_content = f"## Extracted from: {url}\n\n"
            markdown_content += Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )

            # If depth remains, extract internal links and process them;
            # recursive_scrape already emits its own heading for each page.
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    if link not in visited_urls:
                        markdown_content += "\n\n"
                        markdown_content += recursive_scrape(link, current_depth - 1)
            return markdown_content

        # Start the recursive scraping process
        return recursive_scrape(url, depth)
    except Exception as e:
        return f"Error: {str(e)}"


# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(
            label="Enter URL or GitHub Repository",
            placeholder="https://example.com or username/repo"
        ),
        gr.Slider(
            minimum=0, maximum=3, step=1, value=0,
            label="Search Depth (0 = Only main page, ignored for GitHub repos)"
        ),
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
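
# When run locally (python app.py), the Gradio UI is served at Gradio's default
# address, typically http://127.0.0.1:7860, unless overridden via launch() arguments.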