import gradio as gr
import subprocess
import os
import re
import tempfile
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

def is_github_repo(url_or_id):
    """Check if the input is a GitHub repository URL or ID."""
    # Check for GitHub URL
    if "github.com" in url_or_id:
        return True
    
    # Check for shorthand notation (username/repo)
    if re.match(r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$', url_or_id):
        return True
    
    return False
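
# Illustrative behavior (hypothetical inputs, shown as a sketch):
#   is_github_repo("https://github.com/yamadashy/repomix")  -> True
#   is_github_repo("yamadashy/repomix")                      -> True   (shorthand)
#   is_github_repo("https://example.com")                    -> False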

def extract_repo_info(url_or_id):
    """Extract repository owner and name from URL or ID."""
    # Handle GitHub URLs
    github_url_pattern = r'github\.com/([a-zA-Z0-9_.-]+)/([a-zA-Z0-9_.-]+)'
    match = re.search(github_url_pattern, url_or_id)
    if match:
        return match.group(1), match.group(2)
    
    # Handle shorthand notation (username/repo)
    if '/' in url_or_id and not url_or_id.startswith('http'):
        parts = url_or_id.split('/')
        if len(parts) == 2:
            return parts[0], parts[1]
    
    return None, None
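
# Illustrative behavior (hypothetical inputs, shown as a sketch):
#   extract_repo_info("https://github.com/yamadashy/repomix") -> ("yamadashy", "repomix")
#   extract_repo_info("yamadashy/repomix")                     -> ("yamadashy", "repomix")
#   extract_repo_info("not a repo")                            -> (None, None)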

def is_running_on_huggingface():
    """Check if the app is running on HuggingFace Spaces."""
    return os.environ.get('SPACE_ID') is not None
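
# HuggingFace Spaces injects SPACE_ID into the environment (for example
# SPACE_ID="owner/space-name"); it is normally unset elsewhere.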

def check_repomix_installed():
    """Check if Repomix is installed."""
    # If running on HuggingFace Spaces, Repomix is likely not available
    if is_running_on_huggingface():
        return False
        
    try:
        result = subprocess.run(["npx", "repomix", "--version"], 
                               capture_output=True, text=True, check=False)
        return result.returncode == 0
    except Exception:
        return False
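
# Rough shell equivalent of the check above (assumes Node.js/npx is on PATH):
#   npx repomix --version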

def run_repomix(repo_url_or_id, output_format="markdown"):
    """Run Repomix on the GitHub repository and return the content."""
    try:
        # Create a temporary directory for the output
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = os.path.join(temp_dir, f"repomix-output.{output_format}")
            
            # Prepare the command
            if '/' in repo_url_or_id and not repo_url_or_id.startswith('http'):
                # Handle shorthand notation
                repo_url = f"https://github.com/{repo_url_or_id}"
            else:
                repo_url = repo_url_or_id
            
            # Run Repomix
            cmd = [
                "npx", "repomix",
                "--remote", repo_url,
                "--output", output_file,
                "--style", output_format,
                "--compress"  # Use compression for better token efficiency
            ]
            
            process = subprocess.run(cmd, capture_output=True, text=True, check=False)
            
            if process.returncode != 0:
                return f"Error running Repomix: {process.stderr}"
            
            # Read the output file
            if os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    return f.read()
            else:
                return f"Error: Repomix did not generate an output file."
    
    except Exception as e:
        return f"Error processing GitHub repository: {str(e)}"

def process_input(url_or_id, depth, input_type="auto"):
    """Process the input based on its type."""
    try:
        # Determine if this is a GitHub repository
        is_github = is_github_repo(url_or_id) if input_type == "auto" else (input_type == "github")
        
        if is_github:
            # Check if running on HuggingFace Spaces
            if is_running_on_huggingface():
                return (
                    "GitHub repository processing with Repomix is not available on HuggingFace Spaces. "
                    "This feature requires Node.js and the ability to run npm/npx commands, "
                    "which are typically not available in the HuggingFace Spaces environment.\n\n"
                    "You can still use the web scraping functionality for regular websites, "
                    "or run this application locally to use the Repomix feature."
                )
            
            # Check if Repomix is installed
            if not check_repomix_installed():
                return (
                    "Repomix is not installed or not accessible. "
                    "Please install it using: npm install -g repomix\n"
                    "Or you can run it without installation using: npx repomix"
                )
            
            # Process GitHub repository with Repomix
            return run_repomix(url_or_id, output_format="markdown")
        else:
            # Process regular URL with web scraping
            return scrape_and_convert(url_or_id, depth)
    
    except Exception as e:
        return f"Error: {str(e)}"

def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""
            
            visited_urls.add(url)

            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"

            # Convert to Markdown
            markdown_content = f"## Extracted from: {url}\n\n"
            markdown_content += Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )

            # If depth > 0, extract links and process them
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)

                for link in links:
                    if link not in visited_urls:
                        # recursive_scrape prepends its own "Extracted from:" heading,
                        # so adding another one here would duplicate it.
                        markdown_content += "\n\n" + recursive_scrape(link, current_depth - 1)

            return markdown_content

        # Start the recursive scraping process
        result = recursive_scrape(url, depth)
        return result

    except Exception as e:
        return f"Error: {str(e)}"

# Define Gradio interface
iface = gr.Interface(
    fn=process_input, 
    inputs=[
        gr.Textbox(label="Enter URL or GitHub Repository", 
                  placeholder="https://example.com or username/repo"),
        gr.Slider(minimum=0, maximum=3, step=1, value=0, 
                 label="Search Depth (0 = Only main page, ignored for GitHub repos)"),
        gr.Radio(
            choices=["auto", "website", "github"],
            value="auto",
            label="Input Type",
            info="Auto will detect GitHub repos automatically"
        )
    ],
    outputs=gr.Code(label="Output", language="markdown"),
    title="RAGScraper with GitHub Repository Support",
    description=(
        "Enter a URL to scrape a website, or a GitHub repository URL/ID (e.g., 'username/repo') "
        "to use Repomix for repository processing. "
        "For websites, you can specify the search depth for recursive scraping."
    ),
    examples=[
        ["https://example.com", 0, "auto"],
        ["yamadashy/repomix", 0, "auto"],
        ["https://github.com/yamadashy/repomix", 0, "auto"]
    ]
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
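
# To run locally (a sketch; the filename and package names are assumptions):
#   pip install gradio rag_scraper   # plus Node.js if you want Repomix support
#   python app.py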