11import requests
22import base64
33import os
4+ import tempfile
5+ import git
46import time
57import fnmatch
68from typing import Union , Set , List , Dict , Tuple , Any
@@ -16,18 +18,21 @@ def crawl_github_files(
1618):
1719 """
1820 Crawl files from a specific path in a GitHub repository at a specific commit.
19-
21+
2022 Args:
2123 repo_url (str): URL of the GitHub repository with specific path and commit
2224 (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
23- token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
25+ token (str, optional): **GitHub personal access token.**
26+ - **Required for private repositories.**
27+ - **Recommended for public repos to avoid rate limits.**
28+ - Can be passed explicitly or set via the `GITHUB_TOKEN` environment variable.
2429 max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
2530 use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
2631 include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
2732 If None, all files are included.
2833 exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
2934 If None, no files are excluded.
30-
35+
3136 Returns:
3237 dict: Dictionary with files and statistics
3338 """
@@ -36,7 +41,89 @@ def crawl_github_files(
3641 include_patterns = {include_patterns }
3742 if exclude_patterns and isinstance (exclude_patterns , str ):
3843 exclude_patterns = {exclude_patterns }
39-
44+
def should_include_file(file_path: str, file_name: str) -> bool:
    """Decide whether a file passes the include/exclude filters.

    Include patterns are matched against the bare file name, while exclude
    patterns are matched against the file path. A missing or empty pattern
    collection disables the corresponding filter.

    Args:
        file_path: Path of the file, tested against the exclude patterns.
        file_name: Base name of the file, tested against the include patterns.

    Returns:
        True when the file should be kept, False otherwise.
    """
    # Include filter: when patterns exist, at least one must match the name.
    if include_patterns:
        name_matches = False
        for include_glob in include_patterns:
            if fnmatch.fnmatch(file_name, include_glob):
                name_matches = True
                break
        if not name_matches:
            return False

    # Exclude filter: nothing to reject when no patterns were supplied.
    if not exclude_patterns:
        return True

    # Any exclude pattern matching the path vetoes the file.
    for exclude_glob in exclude_patterns:
        if fnmatch.fnmatch(file_path, exclude_glob):
            return False
    return True
61+
62+ # Detect SSH URL (git@ or .git suffix)
63+ is_ssh_url = repo_url .startswith ("git@" ) or repo_url .endswith (".git" )
64+
65+ if is_ssh_url :
66+ # Clone repo via SSH to temp dir
67+ with tempfile .TemporaryDirectory () as tmpdirname :
68+ print (f"Cloning SSH repo { repo_url } to temp dir { tmpdirname } ..." )
69+ try :
70+ repo = git .Repo .clone_from (repo_url , tmpdirname )
71+ except Exception as e :
72+ print (f"Error cloning repo: { e } " )
73+ return {"files" : {}, "stats" : {"error" : str (e )}}
74+
75+ # NOTE: no commit/branch checkout is performed for SSH clones.
76+ # SSH URLs do not embed a ref or subdirectory, so there is nothing to parse from the URL.
77+ # The clone therefore stays on the repository's default branch; the user can check out another ref manually later.
78+ # A future API could accept an explicit ref argument to check out here.
79+
80+ # Walk directory
81+ files = {}
82+ skipped_files = []
83+
84+ for root , dirs , filenames in os .walk (tmpdirname ):
85+ for filename in filenames :
86+ abs_path = os .path .join (root , filename )
87+ rel_path = os .path .relpath (abs_path , tmpdirname )
88+
89+ # Check file size
90+ try :
91+ file_size = os .path .getsize (abs_path )
92+ except OSError :
93+ continue
94+
95+ if file_size > max_file_size :
96+ skipped_files .append ((rel_path , file_size ))
97+ print (f"Skipping { rel_path } : size { file_size } exceeds limit { max_file_size } " )
98+ continue
99+
100+ # Check include/exclude patterns
101+ if not should_include_file (rel_path , filename ):
102+ print (f"Skipping { rel_path } : does not match include/exclude patterns" )
103+ continue
104+
105+ # Read content
106+ try :
107+ with open (abs_path , "r" , encoding = "utf-8" ) as f :
108+ content = f .read ()
109+ files [rel_path ] = content
110+ print (f"Added { rel_path } ({ file_size } bytes)" )
111+ except Exception as e :
112+ print (f"Failed to read { rel_path } : { e } " )
113+
114+ return {
115+ "files" : files ,
116+ "stats" : {
117+ "downloaded_count" : len (files ),
118+ "skipped_count" : len (skipped_files ),
119+ "skipped_files" : skipped_files ,
120+ "base_path" : None ,
121+ "include_patterns" : include_patterns ,
122+ "exclude_patterns" : exclude_patterns ,
123+ "source" : "ssh_clone"
124+ }
125+ }
126+
40127 # Parse GitHub URL to extract owner, repo, commit/branch, and path
41128 parsed_url = urlparse (repo_url )
42129 path_parts = parsed_url .path .strip ('/' ).split ('/' )
@@ -101,9 +188,11 @@ def fetch_contents(path):
101188
102189 if response .status_code == 404 :
103190 if not token :
104- print (f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token." )
191+ print (f"Error 404: Repository not found or is private.\n "
192+ f"If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable." )
105193 else :
106- print (f"Error 404: Path '{ path } ' not found in repository or insufficient permissions." )
194+ print (f"Error 404: Path '{ path } ' not found in repository or insufficient permissions with the provided token.\n "
195+ f"Please verify the token has access to this repository and the path exists." )
107196 return
108197
109198 if response .status_code != 200 :
@@ -201,8 +290,12 @@ def fetch_contents(path):
201290
202291# Example usage
203292if __name__ == "__main__" :
204- # Get token from environment variable (more secure than hardcoding )
293+ # Get token from environment variable (recommended for private repos )
205294 github_token = os .environ .get ("GITHUB_TOKEN" )
295+ if not github_token :
296+ print ("Warning: No GitHub token found in environment variable 'GITHUB_TOKEN'.\n "
297+ "Private repositories will not be accessible without a token.\n "
298+ "To access private repos, set the environment variable or pass the token explicitly." )
206299
207300 repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"
208301
0 commit comments