import gradio as gr
import subprocess
import os
import re
import tempfile
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils


def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    # Check for GitHub URL
    if "github.com" in url_or_id:
        return True
    # Check for shorthand notation (username/repo)
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    return False
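
# Illustrative behaviour of the heuristic above (traced by hand, not executed);
# note that any "name/name" string matches the shorthand pattern, so plain
# paths can be misread as repositories:
#   is_github_repo("https://github.com/yamadashy/repomix")  -> True
#   is_github_repo("yamadashy/repomix")                     -> True
#   is_github_repo("https://example.com")                   -> False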


def extract_repo_info(url_or_id):
    """Extract repository owner and name from a URL or shorthand ID."""
    # Handle GitHub URLs
    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
    match = re.search(github_url_pattern, url_or_id)
    if match:
        return match.group(1), match.group(2)
    # Handle shorthand notation (username/repo)
    if '/' in url_or_id and not url_or_id.startswith('http'):
        parts = url_or_id.split('/')
        if len(parts) == 2:
            return parts[0], parts[1]
    return None, None
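
# Illustrative examples (traced, not executed):
#   extract_repo_info("https://github.com/yamadashy/repomix")  -> ("yamadashy", "repomix")
#   extract_repo_info("yamadashy/repomix")                     -> ("yamadashy", "repomix")
#   extract_repo_info("not a repo")                            -> (None, None)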


def is_running_on_huggingface():
    """Check if the app is running on HuggingFace Spaces."""
    # HuggingFace Spaces sets the SPACE_ID environment variable
    return os.environ.get('SPACE_ID') is not None


def check_repomix_installed():
    """Check if Repomix is installed."""
    # If running on HuggingFace Spaces, Repomix is likely not available
    if is_running_on_huggingface():
        return False
    try:
        result = subprocess.run(["npx", "repomix", "--version"],
                                capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False
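
# Note: "npx repomix --version" may download the package on first use, so the
# first probe can be slow; later calls hit the local npx cache.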


def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix on the GitHub repository and return the content."""
    try:
        # Create a temporary directory for the output
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
            # Expand shorthand notation into a full GitHub URL
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            # Run Repomix against the remote repository
            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]
            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"
            # Read the output file before the temporary directory is removed
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                return "Error: Repomix did not generate an output file."
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"


def process_input(url_or_id, depth, input_type="auto"):
    """Process the input based on its type."""
    try:
        # Determine if this is a GitHub repository
        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
        if is_github:
            # Check if running on HuggingFace Spaces
            if is_running_on_huggingface():
                return (
                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                    "This feature requires Node.js and the ability to run npm/npx commands, "
                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
                    "You can still use the web scraping functionality for regular websites, "
                    "or run this application locally to use the Repomix feature."
                )
            # Check if Repomix is installed
            if not check_repomix_installed():
                return (
                    "Repomix is not installed or not accessible. "
                    "Please install it using: npm install -g repomix\n"
                    "Or you can run it without installation using: npx repomix"
                )
            # Process GitHub repository with Repomix
            return run_repomix(url_or_id, output_format="markdown")
        else:
            # Process regular URL with web scraping
            return scrape_and_convert(url_or_id, depth)
    except Exception as e:
        return f"Error: {str(e)}"


def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to the given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(url)
            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"
            # Convert to Markdown
            markdown_content = f"## Extracted from: {url}\n\n"
            markdown_content += Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )
            # If depth remains, extract internal links and process them
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    if link not in visited_urls:
                        markdown_content += f"\n\n### Extracted from: {link}\n"
                        markdown_content += recursive_scrape(link, current_depth - 1)
            return markdown_content

        # Start the recursive scraping process
        return recursive_scrape(url, depth)
    except Exception as e:
        return f"Error: {str(e)}"


# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Enter URL or GitHub Repository",
                   placeholder="https://example.com or username/repo"),
        gr.Slider(minimum=0, maximum=3, step=1, value=0,
                  label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()