Skip to content

Commit ea5e896

Browse files
committed
support downloading GitHub repo with ssh
1 parent 6c574db commit ea5e896

File tree

2 files changed

+101
-7
lines changed

2 files changed

+101
-7
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
pocketflow>=0.0.1
22
pyyaml>=6.0
33
requests>=2.28.0
4+
gitpython>=3.1.0
45
google-cloud-aiplatform>=1.25.0
56
google-genai>=1.9.0

utils/crawl_github_files.py

Lines changed: 100 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import requests
22
import base64
33
import os
4+
import tempfile
5+
import git
46
import time
57
import fnmatch
68
from typing import Union, Set, List, Dict, Tuple, Any
@@ -16,18 +18,21 @@ def crawl_github_files(
1618
):
1719
"""
1820
Crawl files from a specific path in a GitHub repository at a specific commit.
19-
21+
2022
Args:
2123
repo_url (str): URL of the GitHub repository with specific path and commit
2224
(e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
23-
token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
25+
token (str, optional): **GitHub personal access token.**
26+
- **Required for private repositories.**
27+
- **Recommended for public repos to avoid rate limits.**
28+
- Can be passed explicitly or set via the `GITHUB_TOKEN` environment variable.
2429
max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
2530
use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
2631
include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
2732
If None, all files are included.
2833
exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
2934
If None, no files are excluded.
30-
35+
3136
Returns:
3237
dict: Dictionary with files and statistics
3338
"""
@@ -36,7 +41,89 @@ def crawl_github_files(
3641
include_patterns = {include_patterns}
3742
if exclude_patterns and isinstance(exclude_patterns, str):
3843
exclude_patterns = {exclude_patterns}
39-
44+
45+
def should_include_file(file_path: str, file_name: str) -> bool:
46+
"""Determine if a file should be included based on patterns"""
47+
# No include patterns (None or empty) means every file is a candidate
48+
if not include_patterns:
49+
include_file = True
50+
else:
51+
# Include patterns are tested against the bare file_name only
52+
include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns)
53+
54+
# Exclusion is only evaluated for files that passed the include check
55+
if exclude_patterns and include_file:
56+
# Exclude patterns are tested against file_path, not file_name
57+
exclude_file = any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns)
58+
return not exclude_file
59+
60+
return include_file
61+
62+
# Detect a clone-style URL: starts with git@ (SSH) or ends with .git (note: the .git suffix check also matches HTTPS clone URLs)
63+
is_ssh_url = repo_url.startswith("git@") or repo_url.endswith(".git")
64+
65+
if is_ssh_url:
66+
# Clone repo via SSH to temp dir
67+
with tempfile.TemporaryDirectory() as tmpdirname:
68+
print(f"Cloning SSH repo {repo_url} to temp dir {tmpdirname} ...")
69+
try:
70+
repo = git.Repo.clone_from(repo_url, tmpdirname)
71+
except Exception as e:
72+
print(f"Error cloning repo: {e}")
73+
return {"files": {}, "stats": {"error": str(e)}}
74+
75+
# No specific commit/branch is checked out here; the clone stays on whatever branch clone_from leaves it on
76+
# Parse ref and subdir from SSH URL? SSH URLs don't have branch info embedded
77+
# So rely on default branch, or user can checkout manually later
78+
# Optionally, user can pass ref explicitly in future API
79+
80+
# Walk directory
81+
files = {}
82+
skipped_files = []
83+
84+
for root, dirs, filenames in os.walk(tmpdirname):
85+
for filename in filenames:
86+
abs_path = os.path.join(root, filename)
87+
rel_path = os.path.relpath(abs_path, tmpdirname)
88+
89+
# Check file size
90+
try:
91+
file_size = os.path.getsize(abs_path)
92+
except OSError:
93+
continue
94+
95+
if file_size > max_file_size:
96+
skipped_files.append((rel_path, file_size))
97+
print(f"Skipping {rel_path}: size {file_size} exceeds limit {max_file_size}")
98+
continue
99+
100+
# Check include/exclude patterns
101+
if not should_include_file(rel_path, filename):
102+
print(f"Skipping {rel_path}: does not match include/exclude patterns")
103+
continue
104+
105+
# Read content
106+
try:
107+
with open(abs_path, "r", encoding="utf-8") as f:
108+
content = f.read()
109+
files[rel_path] = content
110+
print(f"Added {rel_path} ({file_size} bytes)")
111+
except Exception as e:
112+
print(f"Failed to read {rel_path}: {e}")
113+
114+
return {
115+
"files": files,
116+
"stats": {
117+
"downloaded_count": len(files),
118+
"skipped_count": len(skipped_files),
119+
"skipped_files": skipped_files,
120+
"base_path": None,
121+
"include_patterns": include_patterns,
122+
"exclude_patterns": exclude_patterns,
123+
"source": "ssh_clone"
124+
}
125+
}
126+
40127
# Parse GitHub URL to extract owner, repo, commit/branch, and path
41128
parsed_url = urlparse(repo_url)
42129
path_parts = parsed_url.path.strip('/').split('/')
@@ -101,9 +188,11 @@ def fetch_contents(path):
101188

102189
if response.status_code == 404:
103190
if not token:
104-
print(f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.")
191+
print(f"Error 404: Repository not found or is private.\n"
192+
f"If this is a private repository, please provide a valid GitHub token via the 'token' argument or set the GITHUB_TOKEN environment variable.")
105193
else:
106-
print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.")
194+
print(f"Error 404: Path '{path}' not found in repository or insufficient permissions with the provided token.\n"
195+
f"Please verify the token has access to this repository and the path exists.")
107196
return
108197

109198
if response.status_code != 200:
@@ -201,8 +290,12 @@ def fetch_contents(path):
201290

202291
# Example usage
203292
if __name__ == "__main__":
204-
# Get token from environment variable (more secure than hardcoding)
293+
# Get token from environment variable (recommended for private repos)
205294
github_token = os.environ.get("GITHUB_TOKEN")
295+
if not github_token:
296+
print("Warning: No GitHub token found in environment variable 'GITHUB_TOKEN'.\n"
297+
"Private repositories will not be accessible without a token.\n"
298+
"To access private repos, set the environment variable or pass the token explicitly.")
206299

207300
repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"
208301

0 commit comments

Comments
 (0)