
Commit fd3a2de

Merge pull request #6 from mathiasesn/feat/create-repo-structure
feat: Add repo structure to output markdown
2 parents 3efd649 + 61b8212 commit fd3a2de

12 files changed: 513 additions, 89 deletions
.pre-commit-config.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: end-of-file-fixer
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.6.2
+    hooks:
+      # Run the linter.
+      - id: ruff
+        args: [ --fix ]
+      # Run the formatter.
+      - id: ruff-format

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
[project]
name = "repo-context"
-version = "0.2.0"
+version = "0.3.0"
description = "Convert Git repositories into LLM-friendly context format"
authors = [{ name = "Mathias Nielsen", email = "[email protected]" }]
maintainers = [{ name = "Mathias Nielsen", email = "[email protected]" }]
@@ -26,4 +26,4 @@ repo-context = "repo_context.cli:main"

[build-system]
requires = ["hatchling"]
-build-backend = "hatchling.build"
+build-backend = "hatchling.build"

repo_context/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -14,6 +14,7 @@
    handlers=[RichHandler(console=console, rich_tracebacks=True)],
)

-from repo_context.repo_converter import RepoConverter  # noqa: E402
+from repo_context.converter import RepoConverter  # noqa: E402
+from repo_context.structure import RepoStructure  # noqa: E402

-__all__ = ["RepoConverter"]
+__all__ = ["RepoConverter", "RepoStructure"]
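Usage note (not part of the diff): with the updated __all__, both classes can be imported straight from the package. A minimal sketch; the "*.log" pattern is a hypothetical extra:

from repo_context import RepoConverter, RepoStructure

converter = RepoConverter()                            # picks up the default ignore lists
structure = RepoStructure(ignore_patterns=["*.log"])   # hypothetical extra pattern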

repo_context/cli.py

Lines changed: 14 additions & 4 deletions
@@ -3,7 +3,7 @@
from pathlib import Path
from urllib.parse import urlparse

-from repo_context.repo_converter import RepoConverter
+from repo_context.converter import RepoConverter

logger = logging.getLogger("repo_context.cli")

@@ -60,18 +60,28 @@ def main():
    converter = RepoConverter(ignore_patterns=ignore_patterns)

    try:
+        # Clone or use local repository
        if urlparse(args.source).scheme:
            logger.info(f"Cloning repository from {args.source}")
            repo_path, _ = converter.clone_repo(args.source)
+            fname = Path(urlparse(args.source).path).stem
        else:
            repo_path = Path(args.source)
+            fname = repo_path.stem

+        # Convert repository to context
        context = converter.convert(repo_path, max_file_lines=args.max_file_lines)

-        for i, c in enumerate(context):
-            output_path = Path(f"{args.output}/context_{i}.md")
-            output_path.write_text(c)
+        # Write context to files
+        if len(context) == 1:
+            output_path = Path(f"{args.output}/{fname}.md")
+            output_path.write_text(context[0])
            logger.info(f"Context written to {output_path}")
+        else:
+            for i, c in enumerate(context):
+                output_path = Path(f"{args.output}/{fname}_{i}.md")
+                output_path.write_text(c)
+                logger.info(f"Context written to {output_path}")

    except Exception as e:
        logger.error(f"Error: {e}")
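Side note on the new naming (illustrative, not part of the commit): output files are now named after the repository via Path(...).stem instead of the generic context_<i> prefix. A quick sketch of the rule, using a hypothetical URL and local path:

from pathlib import Path
from urllib.parse import urlparse

# Remote source: last path component of the URL, without its suffix.
url = "https://github.com/mathiasesn/repo-context"   # hypothetical example URL
fname = Path(urlparse(url).path).stem                # -> "repo-context"

# Local source: the directory's own name.
fname = Path("/home/user/repo-context").stem         # -> "repo-context"

# One chunk   -> <output>/<fname>.md
# Many chunks -> <output>/<fname>_<i>.md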

repo_context/repo_converter.py renamed to repo_context/converter.py

Lines changed: 27 additions & 55 deletions
@@ -1,6 +1,5 @@
import logging
import tempfile
-from fnmatch import fnmatch
from multiprocessing import Pool, cpu_count
from pathlib import Path

@@ -9,6 +8,8 @@
from tqdm.contrib.logging import logging_redirect_tqdm

from repo_context.ignore import EXTENSIONS, FILES, PATTERNS
+from repo_context.utils import should_ignore
+from repo_context.structure import RepoStructure

logger = logging.getLogger("repo_context.repo_converter")

@@ -20,11 +21,25 @@ def __init__(
        max_file_size: int = 1_000_000,
        max_workers: int | None = None,
    ) -> None:
+        """
+        Initialize the converter with specified parameters.
+
+        Args:
+            ignore_patterns (list[str] | None, optional): A list of patterns to ignore. Defaults to None.
+            max_file_size (int, optional): The maximum file size to process in bytes. Defaults to 1,000,000.
+            max_workers (int | None, optional): The maximum number of worker threads to use. Defaults to the number of CPU cores.
+
+        Attributes:
+            ignore_patterns (list[str]): The list of patterns to ignore.
+            max_file_size (int): The maximum file size to process in bytes.
+            max_workers (int): The maximum number of worker threads to use.
+            structure (RepoStructure): The repository structure initialized with the ignore patterns.
+        """
        self.ignore_patterns = ignore_patterns or []
        self.max_file_size = max_file_size
        self.max_workers = max_workers or cpu_count()
-
        self.ignore_patterns += FILES + EXTENSIONS + PATTERNS
+        self.structure = RepoStructure(ignore_patterns=self.ignore_patterns)

    def clone_repo(self, url: str) -> Path:
        """Clone a repository from URL to temporary directory.
@@ -68,57 +83,6 @@ def progress_callback(op_code, cur_count, max_count=None, message=""):
            logger.error(f"Failed to clone repository: {e}")
            raise

-    def should_ignore(self, path: Path) -> bool:
-        """Check if path matches ignore patterns.
-
-        Args:
-            path: Path to check against ignore patterns
-
-        Returns:
-            True if path should be ignored
-        """
-        fname = path.name
-        path_str = str(path)
-        relative_path = self._get_relative_path(path)
-
-        for pattern in self.ignore_patterns:
-            if pattern in FILES and fname == pattern:
-                return True
-
-            if pattern in EXTENSIONS and fnmatch(fname, pattern):
-                return True
-
-            if pattern in PATTERNS:
-                if pattern in path_str:
-                    return True
-
-                normalized_path = relative_path.replace("\\", "/")
-                normalized_pattern = pattern.replace("\\", "/")
-                if fnmatch(normalized_path, normalized_pattern):
-                    return True
-
-            if fnmatch(path_str, pattern):
-                return True
-
-        return False
-
-    @staticmethod
-    def _get_relative_path(path: Path) -> str:
-        """
-        Get the relative path of the given Path object with respect to the current working directory.
-
-        Args:
-            path (Path): The Path object to be converted to a relative path.
-
-        Returns:
-            str: The relative path as a string if the given path is within the current working directory,
-            otherwise the absolute path as a string.
-        """
-        try:
-            return str(path.resolve().relative_to(Path.cwd()))
-        except ValueError:
-            return str(path)
-
    def _process_file_wrapper(self, args: tuple[str, str]) -> str | None:
        """
        Wrapper method to process a file with given file path and repository path.
@@ -149,14 +113,22 @@ def convert(self, repo_path: Path, max_file_lines: int | None = None) -> list[st
        if not repo_path.exists():
            raise FileNotFoundError(f"Repository path {repo_path} does not exist")

+        context = []
+
+        # Get structure of the repository
+        tree_structure = self.structure.create_tree_structure(repo_path)
+        if tree_structure:
+            context.append(tree_structure)
+
+        # Get all files in the repository
        with logging_redirect_tqdm():
            file_paths = [
                (str(p), str(repo_path))
                for p in tqdm(repo_path.rglob("*"), ncols=120)
                if self._is_valid_file(p)
            ]

-        context = []
+        # Process files in parallel
        with Pool(self.max_workers) as pool:
            with logging_redirect_tqdm():
                with tqdm(
@@ -182,7 +154,7 @@ def _is_valid_file(self, path: Path) -> bool:
        """Check if file should be processed."""
        return (
            path.is_file()
-            and not self.should_ignore(path)
+            and not should_ignore(path, self.ignore_patterns)
            and path.stat().st_size <= self.max_file_size
        )
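For orientation (not part of the commit): convert() now seeds the returned list with the repository tree before the per-file chunks. A rough usage sketch, assuming a hypothetical local checkout at ./repo-context and a user-supplied "*.ipynb" pattern:

from pathlib import Path

from repo_context import RepoConverter

converter = RepoConverter(ignore_patterns=["*.ipynb"])   # extra pattern on top of the defaults
context = converter.convert(Path("./repo-context"), max_file_lines=1000)

# When a tree could be generated, context[0] holds the "# Directory Structure"
# section produced by RepoStructure; the remaining entries are the file
# contents gathered by the multiprocessing Pool.
print(context[0])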

repo_context/ignore.py

Lines changed: 8 additions & 0 deletions
@@ -8,6 +8,8 @@
    "uv.lock",
    "poetry.lock",
    ".dockerignore",
+    ".coverage",
+    ".pre-commit-config.yaml",
]

EXTENSIONS = [
@@ -27,6 +29,11 @@
    "*.pyo",
    "*.pyd",
    ".DS_Store",
+    "*.zip",
+    "*.far",
+    "*.fst",
+    "*.tsv",
+    "*.csv",
]

PATTERNS = [
@@ -44,4 +51,5 @@
    "publish",
    "tests",
    "test",
+    ".ruff_cache",
]

repo_context/structure.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+import logging
+from pathlib import Path
+
+from repo_context.utils import should_ignore
+
+logger = logging.getLogger("repo_context.structure")
+
+
+class RepoStructure:
+    def __init__(self, ignore_patterns: list[str] | None = None) -> None:
+        self.ignore_patterns = ignore_patterns or []
+
+    def generate_tree(
+        self,
+        directory: Path,
+        prefix: str = "",
+        is_last: bool = True,
+    ) -> list[str]:
+        """
+        Recursively generate a tree structure of the directory.
+
+        Args:
+            directory (Path): Path object pointing to the directory
+            prefix (str): Prefix for the current line (used for recursion). default: ""
+            is_last (bool): Boolean indicating if this is the last item in current directory. default: True
+            ignore_patterns (list[str] | None): List of patterns to ignore. default: None
+
+        Returns:
+            list[str]: Lines of the tree structure
+        """
+        if not directory.is_dir():
+            logger.error(f"'{directory}' is not a valid directory")
+            return []
+
+        tree_lines = []
+        items = [
+            item
+            for item in sorted(directory.iterdir())
+            if not should_ignore(item.name, self.ignore_patterns)
+        ]
+
+        for i, item in enumerate(items):
+            is_last_item = i == len(items) - 1
+            connector = "└── " if is_last_item else "├── "
+
+            tree_lines.append(f"{prefix}{connector}{item.name}")
+
+            if item.is_dir():
+                extension = "    " if is_last_item else "│   "
+                tree_lines.extend(
+                    self.generate_tree(
+                        item,
+                        prefix + extension,
+                        is_last_item,
+                    )
+                )
+
+        return tree_lines
+
+    def create_tree_structure(self, path: str) -> str:
+        """
+        Create and display/save a tree structure of the specified directory.
+
+        Args:
+            path: Path to the directory
+
+        Returns:
+            str: The tree structure
+        """
+        directory = Path(path)
+        if not directory.exists():
+            raise FileNotFoundError(f"Directory '{path}' does not exist")
+
+        logger.info(f"Generating tree structure for: {directory.absolute()}")
+
+        tree_lines = ["# Directory Structure", directory.name]
+        tree_lines.extend(self.generate_tree(directory))
+
+        # Join lines with newlines
+        tree_structure = "\n".join(tree_lines) + "\n"
+
+        return tree_structure
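To illustrate the new class (not part of the commit), a small sketch of what create_tree_structure returns when pointed at this package's own directory; the exact entries depend on the checkout, and only the patterns you pass are applied:

from repo_context import RepoStructure

structure = RepoStructure(ignore_patterns=["__pycache__"])  # hypothetical ignore list
tree = structure.create_tree_structure("repo_context")
print(tree)

# Possible (abridged) output:
# # Directory Structure
# repo_context
# ├── __init__.py
# ├── cli.py
# ├── converter.py
# ├── ignore.py
# ├── structure.py
# └── utils.py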

repo_context/utils.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+from fnmatch import fnmatch
+from pathlib import Path
+
+from repo_context.ignore import EXTENSIONS, FILES, PATTERNS
+
+
+def get_relative_path(path: Path) -> str:
+    """
+    Get the relative path of the given Path object with respect to the current working directory.
+
+    Args:
+        path (Path): The Path object to be converted to a relative path.
+
+    Returns:
+        str: The relative path as a string if the given path is within the current working directory,
+        otherwise the absolute path as a string.
+    """
+    try:
+        return str(path.resolve().relative_to(Path.cwd()))
+    except ValueError:
+        return str(path)
+
+
+def should_ignore(path: Path, ignore_patterns: list[str]) -> bool:
+    """Check if path matches ignore patterns.
+
+    Args:
+        path (Path): Path to check against ignore patterns
+        ignore_patterns (list[str]): List of ignore patterns
+
+    Returns:
+        True if path should be ignored
+    """
+    if not isinstance(path, Path):
+        path = Path(path)
+
+    fname = path.name
+    path_str = str(path)
+    relative_path = get_relative_path(path)
+
+    for pattern in ignore_patterns:
+        if pattern in FILES and fname == pattern:
+            return True
+
+        if pattern in EXTENSIONS and fnmatch(fname, pattern):
+            return True
+
+        if pattern in PATTERNS:
+            if pattern in path_str:
+                return True
+
+            normalized_path = relative_path.replace("\\", "/")
+            normalized_pattern = pattern.replace("\\", "/")
+            if fnmatch(normalized_path, normalized_pattern):
+                return True
+
+        if fnmatch(path_str, pattern):
+            return True
+
+    return False
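A few concrete calls (illustrative only) showing how the branches above fire; "uv.lock", "*.pyo", and "tests" come from the default lists in ignore.py, while "docs/*" stands in for a user-supplied glob:

from pathlib import Path

from repo_context.utils import should_ignore

patterns = ["uv.lock", "*.pyo", "tests", "docs/*"]

should_ignore(Path("uv.lock"), patterns)              # True  - exact name match (FILES)
should_ignore(Path("module.pyo"), patterns)           # True  - extension glob (EXTENSIONS)
should_ignore(Path("tests/test_cli.py"), patterns)    # True  - "tests" substring match (PATTERNS)
should_ignore(Path("docs/index.md"), patterns)        # True  - plain fnmatch on the path
should_ignore(Path("repo_context/cli.py"), patterns)  # False - no pattern matches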
