1
1
import logging
2
2
import tempfile
3
- from fnmatch import fnmatch
4
3
from multiprocessing import Pool , cpu_count
5
4
from pathlib import Path
6
5
9
8
from tqdm .contrib .logging import logging_redirect_tqdm
10
9
11
10
from repo_context .ignore import EXTENSIONS , FILES , PATTERNS
11
+ from repo_context .utils import should_ignore
12
+ from repo_context .structure import RepoStructure
12
13
13
14
logger = logging .getLogger ("repo_context.repo_converter" )
14
15
@@ -20,11 +21,25 @@ def __init__(
20
21
max_file_size : int = 1_000_000 ,
21
22
max_workers : int | None = None ,
22
23
) -> None :
24
+ """
25
+ Initialize the converter with specified parameters.
26
+
27
+ Args:
28
+ ignore_patterns (list[str] | None, optional): A list of patterns to ignore. Defaults to None.
29
+ max_file_size (int, optional): The maximum file size to process in bytes. Defaults to 1,000,000.
30
+ max_workers (int | None, optional): The maximum number of worker processes to use. Defaults to the number of CPU cores when None.
31
+
32
+ Attributes:
33
+ ignore_patterns (list[str]): The list of patterns to ignore.
34
+ max_file_size (int): The maximum file size to process in bytes.
35
+ max_workers (int): The maximum number of worker processes to use.
36
+ structure (RepoStructure): The repository structure initialized with the ignore patterns.
37
+ """
23
38
self .ignore_patterns = ignore_patterns or []
24
39
self .max_file_size = max_file_size
25
40
self .max_workers = max_workers or cpu_count ()
26
-
27
41
self .ignore_patterns += FILES + EXTENSIONS + PATTERNS
42
+ self .structure = RepoStructure (ignore_patterns = self .ignore_patterns )
28
43
29
44
def clone_repo (self , url : str ) -> Path :
30
45
"""Clone a repository from URL to temporary directory.
@@ -68,57 +83,6 @@ def progress_callback(op_code, cur_count, max_count=None, message=""):
68
83
logger .error (f"Failed to clone repository: { e } " )
69
84
raise
70
85
71
- def should_ignore (self , path : Path ) -> bool :
72
- """Check if path matches ignore patterns.
73
-
74
- Args:
75
- path: Path to check against ignore patterns
76
-
77
- Returns:
78
- True if path should be ignored
79
- """
80
- fname = path .name
81
- path_str = str (path )
82
- relative_path = self ._get_relative_path (path )
83
-
84
- for pattern in self .ignore_patterns :
85
- if pattern in FILES and fname == pattern :
86
- return True
87
-
88
- if pattern in EXTENSIONS and fnmatch (fname , pattern ):
89
- return True
90
-
91
- if pattern in PATTERNS :
92
- if pattern in path_str :
93
- return True
94
-
95
- normalized_path = relative_path .replace ("\\ " , "/" )
96
- normalized_pattern = pattern .replace ("\\ " , "/" )
97
- if fnmatch (normalized_path , normalized_pattern ):
98
- return True
99
-
100
- if fnmatch (path_str , pattern ):
101
- return True
102
-
103
- return False
104
-
105
- @staticmethod
106
- def _get_relative_path (path : Path ) -> str :
107
- """
108
- Get the relative path of the given Path object with respect to the current working directory.
109
-
110
- Args:
111
- path (Path): The Path object to be converted to a relative path.
112
-
113
- Returns:
114
- str: The relative path as a string if the given path is within the current working directory,
115
- otherwise the absolute path as a string.
116
- """
117
- try :
118
- return str (path .resolve ().relative_to (Path .cwd ()))
119
- except ValueError :
120
- return str (path )
121
-
122
86
def _process_file_wrapper (self , args : tuple [str , str ]) -> str | None :
123
87
"""
124
88
Wrapper method to process a file with given file path and repository path.
@@ -149,14 +113,22 @@ def convert(self, repo_path: Path, max_file_lines: int | None = None) -> list[st
149
113
if not repo_path .exists ():
150
114
raise FileNotFoundError (f"Repository path { repo_path } does not exist" )
151
115
116
+ context = []
117
+
118
+ # Get structure of the repository
119
+ tree_structure = self .structure .create_tree_structure (repo_path )
120
+ if tree_structure :
121
+ context .append (tree_structure )
122
+
123
+ # Get all files in the repository
152
124
with logging_redirect_tqdm ():
153
125
file_paths = [
154
126
(str (p ), str (repo_path ))
155
127
for p in tqdm (repo_path .rglob ("*" ), ncols = 120 )
156
128
if self ._is_valid_file (p )
157
129
]
158
130
159
- context = []
131
+ # Process files in parallel
160
132
with Pool (self .max_workers ) as pool :
161
133
with logging_redirect_tqdm ():
162
134
with tqdm (
@@ -182,7 +154,7 @@ def _is_valid_file(self, path: Path) -> bool:
182
154
"""Check if file should be processed."""
183
155
return (
184
156
path .is_file ()
185
- and not self . should_ignore (path )
157
+ and not should_ignore (path , self . ignore_patterns )
186
158
and path .stat ().st_size <= self .max_file_size
187
159
)
188
160
0 commit comments