-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
archive - add reproducible_tar option #8691
base: main
Are you sure you want to change the base?
Changes from 7 commits
51a4112
370f19b
5313ff2
0f3f99c
e827a8f
b68c377
d1ba9f8
3445dee
dea03d8
38a8271
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
minor_changes: | ||
- archive - add ``reproducible_tar`` option to make tar archives vary less given the same input file content (https://github.com/ansible-collections/community.general/pull/8691). |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,13 @@ | |
- Remove any added source files and trees after adding to archive. | ||
type: bool | ||
default: false | ||
reproducible_tar: | ||
description: | ||
- Set tar metadata and gzip headers to vary less given the same input file content. | ||
- Useful for minimizing unneeded archive changes and avoiding handlers that may trigger on such changes. | ||
type: bool | ||
default: false | ||
version_added: 9.3.0 | ||
notes: | ||
- Can produce C(gzip), C(bzip2), C(lzma), and C(zip) compressed files or archives. | ||
- This module uses C(tarfile), C(zipfile), C(gzip), and C(bz2) packages on the target host to create archives. | ||
|
@@ -189,7 +196,6 @@ | |
import tarfile | ||
import zipfile | ||
from fnmatch import fnmatch | ||
from sys import version_info | ||
from traceback import format_exc | ||
from zlib import crc32 | ||
|
||
|
@@ -218,8 +224,6 @@ | |
LZMA_IMP_ERR = format_exc() | ||
HAS_LZMA = False | ||
|
||
PY27 = version_info[0:2] >= (2, 7) | ||
|
||
STATE_ABSENT = 'absent' | ||
STATE_ARCHIVED = 'archive' | ||
STATE_COMPRESSED = 'compress' | ||
|
@@ -282,6 +286,7 @@ def __init__(self, module): | |
self.format = module.params['format'] | ||
self.must_archive = module.params['force_archive'] | ||
self.remove = module.params['remove'] | ||
self.reproducible_tar = module.params["reproducible_tar"] or False | ||
glennpratt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
self.changed = False | ||
self.destination_state = STATE_ABSENT | ||
|
@@ -490,6 +495,9 @@ def _open_compressed_file(self, path, mode): | |
|
||
return f | ||
|
||
def _reproducible_mtime(self):
    """Return the constant modification time (0) used for reproducible archives."""
    return 0
|
||
@abc.abstractmethod | ||
def close(self): | ||
pass | ||
|
@@ -542,6 +550,33 @@ def _get_checksums(self, path): | |
return checksums | ||
|
||
|
||
class ReproducibleTGZFile(tarfile.TarFile):
    """A gzip-compressed ``TarFile`` with a controllable gzip header.

    The gzip stream is created with an empty filename and the supplied
    ``mtime``, so neither the destination path nor the creation time is
    passed into the gzip header.
    """

    def __init__(
        self, name=None, mode=None, compresslevel=-1, fileobj=None, mtime=None, **kwargs
    ):
        """Open a tar archive wrapped in a gzip stream.

        :param name: Path of the file to open; only used when ``fileobj`` is None.
        :param mode: Open mode shared by the raw file, the gzip stream and
            the tar archive (the visible caller passes ``"w"``).
        :param compresslevel: Compression level passed to ``gzip.GzipFile``.
        :param fileobj: Optional already-open binary file object. It stays
            owned by the caller and is not closed by this class.
        :param mtime: Timestamp passed to ``gzip.GzipFile``.
        :param kwargs: Extra keyword arguments passed to ``tarfile.TarFile``.
        """
        # Only a file object opened here may be closed here; a stream handed
        # in by the caller must stay usable after the archive is closed.
        owns_fileobj = fileobj is None
        if owns_fileobj:
            fileobj = open(name, mode + "b")

        try:
            # Output filename is intentionally empty to exclude it from the gzip header
            gzipfileobj = gzip.GzipFile("", mode, compresslevel, fileobj, mtime)
        except Exception:
            if owns_fileobj:
                fileobj.close()
            raise

        if owns_fileobj:
            # Allow GzipFile to close the file object we opened
            gzipfileobj.myfileobj = fileobj

        try:
            super(ReproducibleTGZFile, self).__init__(mode=mode, fileobj=gzipfileobj, **kwargs)
        except Exception:
            gzipfileobj.close()
            raise

        # Allow TarFile to close GzipFile as needed
        self._extfileobj = False
glennpratt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
class TarArchive(Archive): | ||
def __init__(self, module): | ||
super(TarArchive, self).__init__(module) | ||
|
@@ -562,7 +597,11 @@ def contains(self, name): | |
return True | ||
|
||
def open(self): | ||
if self.format in ('gz', 'bz2'): | ||
if self.reproducible_tar and self.format == "gz": | ||
self.file = ReproducibleTGZFile( | ||
_to_native_ascii(self.destination), "w", mtime=self._reproducible_mtime() | ||
) | ||
elif self.format in ('gz', 'bz2'): | ||
self.file = tarfile.open(_to_native_ascii(self.destination), 'w|' + self.format) | ||
# python3 tarfile module allows xz format but for python2 we have to create the tarfile | ||
# in memory and then compress it with lzma. | ||
|
@@ -575,16 +614,34 @@ def open(self): | |
self.module.fail_json(msg="%s is not a valid archive format" % self.format) | ||
|
||
def _add(self, path, archive_name): | ||
def py27_filter(tarinfo): | ||
return None if matches_exclusion_patterns(tarinfo.name, self.exclusion_patterns) else tarinfo | ||
def filter(tarinfo): | ||
# type: (tarfile.TarInfo) -> tarfile.TarInfo | None | ||
if matches_exclusion_patterns(tarinfo.name, self.exclusion_patterns): | ||
return None | ||
|
||
if self.reproducible_tar: | ||
# Remove unused backref that prevents copy | ||
if hasattr(tarinfo, "tarfile"): | ||
delattr(tarinfo, "tarfile") | ||
|
||
if tarinfo.isdir(): | ||
mode = 0o40000 | 0o755 | ||
else: | ||
mode = 0o100000 | (0o755 if tarinfo.mode & 0o100 else 0o644) | ||
|
||
# Copy tarfile while reducing metadata | ||
return tarinfo.replace( | ||
mtime=self._reproducible_mtime(), | ||
mode=mode, | ||
uid=0, | ||
gid=0, | ||
uname="", | ||
gname="", | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, this is a very special interpretation of 'reproducible tarfile' IMO. Also this is potentially dangerous since files that are protected (only readable by specific users/groups) before archiving are suddenly publicly readable after extraction. Maybe it would be better to make the level of reproducibility configurable? On the other hand, that would make the interface also pretty complicated. I guess this needs to be discussed first. Maybe create a thread in https://forum.ansible.com/c/project/collection-development/27 for that? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's not intended to be particularly special. I compare it to https://reproducible-builds.org/docs/archives/ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'll make a thread when I have more time, I'll just leave some initial notes here.
|
||
|
||
def py26_filter(path): | ||
return matches_exclusion_patterns(path, self.exclusion_patterns) | ||
return tarinfo | ||
|
||
if PY27: | ||
self.file.add(path, archive_name, recursive=False, filter=py27_filter) | ||
else: | ||
self.file.add(path, archive_name, recursive=False, exclude=py26_filter) | ||
self.file.add(path, archive_name, recursive=False, filter=filter) | ||
|
||
def _get_checksums(self, path): | ||
if HAS_LZMA: | ||
|
@@ -637,6 +694,7 @@ def main(): | |
exclusion_patterns=dict(type='list', elements='path'), | ||
force_archive=dict(type='bool', default=False), | ||
remove=dict(type='bool', default=False), | ||
reproducible_tar=dict(type="bool", default=False), | ||
), | ||
add_file_common_args=True, | ||
supports_check_mode=True, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think as the bare minimum this shouldn't be of type `bool`, but of type `str` with `choices`, so that it's possible to add other ways to make it reproducible later.