Skip to content

Commit 528e48d

Browse files
committed
refactor(git): Use regex-based parsing method for generate_from_diff_output
1 parent c2bd3df commit 528e48d

File tree

2 files changed

+57
-96
lines changed

2 files changed

+57
-96
lines changed

src/dda/tools/git.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ def author_email(self) -> str:
8080
def get_remote_details(self, remote_name: str = "origin") -> Remote:
8181
"""
8282
Get the details of the given remote for the Git repository in the current working directory.
83-
The returned tuple is (org, repo, url).
8483
"""
8584

8685
remote_url = self.capture(

src/dda/utils/git/changeset.py

Lines changed: 57 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -55,102 +55,52 @@ def generate_from_diff_output(cls, diff_output: str | list[str]) -> Generator[Se
5555
"""
5656
Generate a list of FileChanges from the output of _some_ git diff commands.
5757
Not all outputs from `git diff` are supported (ex: renames), see set of args in [Git._capture_diff_lines](dda.tools.git.Git._capture_diff_lines) method.
58-
Accepts a string or a list of lines.
5958
"""
60-
if isinstance(diff_output, str):
61-
diff_output = diff_output.strip().splitlines()
62-
63-
if len(diff_output) == 0:
64-
return
65-
66-
line_iterator = iter(diff_output)
67-
68-
current_file: Path | None = None
69-
current_type: ChangeType | None = None
70-
current_patch_lines: list[str] = []
71-
iterator_exhausted = False
72-
73-
try:
74-
line = next(line_iterator)
75-
while True:
76-
# Start processing a new file - the line looks like `diff --git a/<path> b/<path>`
77-
if not line.startswith("diff --git "):
78-
msg = f"Unexpected line in git diff output: {line}"
79-
raise ValueError(msg)
80-
81-
# Go forward until we find the 'old file' line (---)
82-
while not line.startswith("--- "):
83-
try:
84-
line = next(line_iterator)
85-
except StopIteration:
86-
msg = "Unexpected end of git diff output while looking for --- line"
87-
raise ValueError(msg) # noqa: B904
88-
89-
# When we get here, we are on the --- line
90-
# It should always be followed by a +++ line
91-
old_file_line = line
92-
93-
try:
94-
new_file_line = next(line_iterator)
95-
except StopIteration:
96-
msg = "Unexpected end of git diff output while looking for +++ line"
97-
raise ValueError(msg) # noqa: B904
98-
if not new_file_line.startswith("+++ "):
99-
msg = f"Unexpected line in git diff output, expected +++ line: {new_file_line}"
100-
raise ValueError(msg)
101-
102-
old_file_path = old_file_line[4:].strip()
103-
new_file_path = new_file_line[4:].strip()
104-
105-
if old_file_path == "/dev/null":
106-
current_type = ChangeType.ADDED
107-
current_file = Path(new_file_path)
108-
elif new_file_path == "/dev/null":
109-
current_type = ChangeType.DELETED
110-
current_file = Path(old_file_path)
111-
elif old_file_path == new_file_path:
112-
current_type = ChangeType.MODIFIED
113-
current_file = Path(new_file_path)
114-
else:
115-
msg = f"Unexpected file paths in git diff output: {old_file_path} -> {new_file_path} - this indicates a rename which we do not support"
116-
raise ValueError(
117-
msg,
118-
)
119-
120-
# Now, we should be at the start of the patch hunks (lines starting with @@)
121-
line = next(line_iterator)
122-
if not line.startswith("@@ "):
123-
msg = f"Unexpected line in git diff output, expected hunk start: {line}"
124-
raise ValueError(msg)
125-
# Collect hunk lines, i.e. lines starting with @@, +, -, or \ (\ is for the "no newline at end of file" message that can appear)
126-
127-
while line.startswith(("@@ ", "+", "-", "\\")):
128-
current_patch_lines.append(line)
129-
try:
130-
line = next(line_iterator)
131-
except StopIteration:
132-
# Just break out of the loop, we will handle yielding below
133-
# Set a flag to indicate we reached the end of the iterator
134-
iterator_exhausted = True
135-
break
136-
137-
# Yield the file we were building now that we have reached the end of its patch
138-
yield cls(
139-
file=current_file,
140-
type=current_type,
141-
binary=False, # TODO: Support binaries
142-
patch="\n".join(current_patch_lines),
143-
)
144-
current_file = None
145-
current_type = None
146-
current_patch_lines = []
147-
148-
if iterator_exhausted:
149-
return
150-
151-
except StopIteration:
152-
msg = "Unexpected end of git diff output while parsing"
153-
raise ValueError(msg) # noqa: B904
59+
import re
60+
61+
if isinstance(diff_output, list):
62+
diff_output = "\n".join(diff_output)
63+
64+
for modification in re.split(r"^diff --git ", diff_output, flags=re.MULTILINE):
65+
if not modification:
66+
continue
67+
68+
# Extract metadata. It can be in two formats, depending on if the file is a binary file or not.
69+
70+
# Binary files:
71+
# (new file mode 100644) - not always present
72+
# index 0000000000..089fd64579
73+
# Binary files /dev/null and foo/archive.tar.gz differ
74+
75+
# Regular files:
76+
# (new file mode 100644) - not always present
77+
# index 0000000000..089fd64579
78+
# --- a/file
79+
# +++ b/file
80+
# @@ ... @@ (start of hunks)
81+
sep = "@@ "
82+
metadata, *blocks = re.split(rf"^{sep}", modification, flags=re.MULTILINE)
83+
metadata_lines = metadata.strip().splitlines()
84+
85+
# Determine if the file is a binary file
86+
binary = metadata_lines[-1].startswith("Binary files ")
87+
88+
# Extract old and new file paths
89+
if binary:
90+
line = metadata_lines[-1].removeprefix("Binary files ")
91+
# This might raise an error if one of the files contains the string " and "
92+
before_filename, after_filename = line.split(" and ")
93+
else:
94+
before_filename = metadata_lines[-2].split(maxsplit=1)[1]
95+
after_filename = metadata_lines[-1].split(maxsplit=1)[1]
96+
97+
# Determine changetype
98+
current_type = _determine_change_type(before_filename, after_filename)
99+
current_file = Path(after_filename) if current_type == ChangeType.ADDED else Path(before_filename)
100+
101+
# Strip every "block" and add the missing separator
102+
patch = "" if binary else "\n".join([sep + block.strip() for block in blocks]).strip()
103+
yield cls(file=current_file, type=current_type, binary=binary, patch=patch)
154104

155105
@classmethod
156106
def enc_hook(cls, obj: Any) -> Any:
@@ -256,3 +206,15 @@ def enc_hook(cls, obj: Any) -> Any:
256206
def dec_hook(cls, obj_type: type, obj: Any) -> Any:
    """
    Delegate decoding of `obj` to `Path.dec_hook`.

    Path objects are the only unsupported objects, so no other type needs to be handled here.
    """
    decoded = Path.dec_hook(obj_type, obj)
    return decoded
209+
210+
211+
def _determine_change_type(before_filename: str, after_filename: str) -> ChangeType:
    """
    Classify a file change from the old and new paths reported by git diff.

    Identical paths mean the file was modified. An old path of `/dev/null` means the file was added,
    and a new path of `/dev/null` means the file was deleted.

    Raises:
        ValueError: If the two paths differ and neither is `/dev/null`, which indicates a rename.
    """
    null_path = "/dev/null"

    if before_filename != after_filename:
        if before_filename == null_path:
            return ChangeType.ADDED
        if after_filename == null_path:
            return ChangeType.DELETED

        # Two different real paths can only come from a rename, which is rejected
        msg = f"Unexpected file paths in git diff output: {before_filename} -> {after_filename} - this indicates a rename which we do not support"
        raise ValueError(msg)

    return ChangeType.MODIFIED

0 commit comments

Comments
 (0)