-
Notifications
You must be signed in to change notification settings - Fork 0
Feature/zip support #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e54368c
4e5dcd9
63c64f7
dfe3df3
e6518bd
c3c2cf3
3f171f0
edd8e65
7d9df08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,65 @@ | ||||||||||||||||||||||||||
| import os | ||||||||||||||||||||||||||
| import zipfile | ||||||||||||||||||||||||||
| import tempfile | ||||||||||||||||||||||||||
| import shutil | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| class ArchiveHandler: | ||||||||||||||||||||||||||
| def __init__(self): | ||||||||||||||||||||||||||
| self.temp_dir = None | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| def process_zip(self, zip_path, root_dir): | ||||||||||||||||||||||||||
|
||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||
| Process a zip file and return paths to its contents. | ||||||||||||||||||||||||||
| Parameters: | ||||||||||||||||||||||||||
| ------------ | ||||||||||||||||||||||||||
| zip_path - String. Path to the zip file. | ||||||||||||||||||||||||||
| root_dir - String. Root directory for relative path calculations. | ||||||||||||||||||||||||||
| Returns: | ||||||||||||||||||||||||||
| --------- | ||||||||||||||||||||||||||
| List of tuples (file_path, relative_path) for files in the zip. | ||||||||||||||||||||||||||
| """ | ||||||||||||||||||||||||||
| if not zipfile.is_zipfile(zip_path): | ||||||||||||||||||||||||||
| return [] | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| # Create a temporary directory for extraction | ||||||||||||||||||||||||||
| self.temp_dir = tempfile.mkdtemp() | ||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||
| try: | ||||||||||||||||||||||||||
| with zipfile.ZipFile(zip_path, 'r') as zip_ref: | ||||||||||||||||||||||||||
| # Extract all contents to temp directory | ||||||||||||||||||||||||||
| zip_ref.extractall(self.temp_dir) | ||||||||||||||||||||||||||
|
Comment on lines
+27
to
+32
|
||||||||||||||||||||||||||
| self.temp_dir = tempfile.mkdtemp() | |
| try: | |
| with zipfile.ZipFile(zip_path, 'r') as zip_ref: | |
| # Extract all contents to temp directory | |
| zip_ref.extractall(self.temp_dir) | |
| temp_dir = tempfile.mkdtemp() | |
| try: | |
| with zipfile.ZipFile(zip_path, 'r') as zip_ref: | |
| # Extract all contents to temp directory | |
| zip_ref.extractall(temp_dir) |
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5,13 +5,13 @@ class Hasher: | |||||||||
| def __init__(self, algorithm='md5'): | ||||||||||
| self.algorithm = algorithm | ||||||||||
|
|
||||||||||
| def checksum_file(self, file_path, algorithm=None, length=None): | ||||||||||
| def checksum_file(self, file_path_or_obj, algorithm=None, length=None): | ||||||||||
| """ | ||||||||||
| Calculate the checksum of a file using the specified algorithm. | ||||||||||
|
|
||||||||||
| Parameters: | ||||||||||
| ------------ | ||||||||||
| file_path - String. Path to file to apply checksum function. | ||||||||||
| file_path_or_obj - String or file-like object. Path to file or file-like object to apply checksum function. | ||||||||||
| algorithm - String. Hash function to use for checksums. Default: 'md5', see options with 'hashlib.algorithms_available'. | ||||||||||
| length - Integer [optional]. Length of the digest for SHAKE and BLAKE algorithms in bytes. | ||||||||||
|
|
||||||||||
|
|
@@ -55,9 +55,14 @@ def checksum_file(self, file_path, algorithm=None, length=None): | |||||||||
| raise LengthUsedForFixedLengthHashError(algorithm) | ||||||||||
| hash_func = hashlib.new(algorithm) | ||||||||||
|
|
||||||||||
| # Read the file and update the hash function | ||||||||||
| with open(file_path, "rb") as f: | ||||||||||
| for chunk in iter(lambda: f.read(4096), b""): | ||||||||||
| # Handle both file paths and file-like objects | ||||||||||
| if isinstance(file_path_or_obj, str): | ||||||||||
| with open(file_path_or_obj, "rb") as f: | ||||||||||
| for chunk in iter(lambda: f.read(4096), b""): | ||||||||||
| hash_func.update(chunk) | ||||||||||
| else: | ||||||||||
| # Assume it's a file-like object | ||||||||||
|
||||||||||
| # Assume it's a file-like object | |
| # Validate that the object supports the 'read' method | |
| if not hasattr(file_path_or_obj, 'read'): | |
| raise TypeError("The provided object is not a valid file-like object. It must support the 'read()' method.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm having a hard time concocting a scenario where not hasattr(file_path_or_obj, 'read') would be true. The gather_file_paths and ArchiveHandler.stream_zip() should always return something that can be read. The only thing I can think of is if someone tries to manually do something like checksum_file(321) which would give a pretty clear error already of AttributeError: 'int' object has no attribute 'read'. I think we can leave this one alone for now.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,13 @@ | ||
| import os | ||
| import zipfile | ||
| from sumbuddy.filter import Filter | ||
| from sumbuddy.exceptions import EmptyInputDirectoryError, NoFilesAfterFilteringError, NotADirectoryError | ||
| from sumbuddy.archive import ArchiveHandler | ||
|
|
||
| class Mapper: | ||
| def __init__(self): | ||
| self.filter_manager = Filter() | ||
| self.archive_handler = ArchiveHandler() | ||
|
|
||
| def reset_filter(self, ignore_file=None, include_hidden=False): | ||
| """ | ||
|
|
@@ -28,24 +31,16 @@ def reset_filter(self, ignore_file=None, include_hidden=False): | |
| def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=False): | ||
| """ | ||
| Generate list of file paths in the input directory based on ignore pattern rules. | ||
|
|
||
| Parameters: | ||
| ------------ | ||
| input_directory - String. Directory to traverse for files. | ||
| ignore_file - String [optional]. Filepath for the ignore patterns file. | ||
| include_hidden - Boolean [optional]. Whether to include hidden files. | ||
|
|
||
| Returns: | ||
| --------- | ||
| file_paths - List. Files in input_directory that are not ignored. | ||
| Returns a tuple: (regular_files, zip_archives) | ||
|
Comment on lines
33
to
+34
|
||
| """ | ||
|
|
||
| if not os.path.isdir(input_directory): | ||
| raise NotADirectoryError(input_directory) | ||
|
|
||
| self.reset_filter(ignore_file=ignore_file, include_hidden=include_hidden) | ||
|
|
||
| file_paths = [] | ||
| regular_files = [] | ||
| zip_archives = [] | ||
| root_directory = os.path.abspath(input_directory) | ||
| has_files = False | ||
|
|
||
|
|
@@ -55,11 +50,14 @@ def gather_file_paths(self, input_directory, ignore_file=None, include_hidden=Fa | |
| for name in files: | ||
| file_path = os.path.join(root, name) | ||
| if self.filter_manager.should_include(file_path, root_directory): | ||
| file_paths.append(file_path) | ||
| if zipfile.is_zipfile(file_path): | ||
| zip_archives.append(file_path) | ||
| else: | ||
| regular_files.append(file_path) | ||
|
|
||
| if not has_files: | ||
| raise EmptyInputDirectoryError(input_directory) | ||
| if not file_paths: | ||
| if not (regular_files or zip_archives): | ||
| raise NoFilesAfterFilteringError(input_directory, ignore_file) | ||
|
|
||
| return file_paths | ||
| return regular_files, zip_archives | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The total_files calculation iterates through all ZIP archives twice - once to count files for the progress bar and again during processing. This is inefficient for large ZIP files. Consider caching the file counts or restructuring the progress tracking.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Non-blocking for now. Can add a new issue to use file content headers and perhaps use more generic "Archive" language in
__main__.pyrather than referring to zip once a new archive file type is supported.