Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

archive - add reproducible_tar option #8691

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
minor_changes:
- archive - add ``reproducible_tar`` option to make tar archives vary less given the same input file content (https://github.com/ansible-collections/community.general/pull/8691).
82 changes: 70 additions & 12 deletions plugins/modules/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@
- Remove any added source files and trees after adding to archive.
type: bool
default: false
reproducible_tar:
description:
- Set tar metadata and gzip headers to vary less given the same input file content.
- Useful for minimizing unneeded archive changes and avoiding handlers that may trigger on such changes.
type: bool
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that, at a bare minimum, this shouldn't be of type bool but of type str with choices, so that it's possible to add other ways to make it reproducible later.

default: false
version_added: 9.3.0
notes:
- Can produce C(gzip), C(bzip2), C(lzma), and C(zip) compressed files or archives.
- This module uses C(tarfile), C(zipfile), C(gzip), and C(bz2) packages on the target host to create archives.
Expand Down Expand Up @@ -189,7 +196,6 @@
import tarfile
import zipfile
from fnmatch import fnmatch
from sys import version_info
from traceback import format_exc
from zlib import crc32

Expand Down Expand Up @@ -218,8 +224,6 @@
LZMA_IMP_ERR = format_exc()
HAS_LZMA = False

PY27 = version_info[0:2] >= (2, 7)

STATE_ABSENT = 'absent'
STATE_ARCHIVED = 'archive'
STATE_COMPRESSED = 'compress'
Expand Down Expand Up @@ -282,6 +286,7 @@ def __init__(self, module):
self.format = module.params['format']
self.must_archive = module.params['force_archive']
self.remove = module.params['remove']
self.reproducible_tar = module.params["reproducible_tar"] or False
glennpratt marked this conversation as resolved.
Show resolved Hide resolved

self.changed = False
self.destination_state = STATE_ABSENT
Expand Down Expand Up @@ -490,6 +495,9 @@ def _open_compressed_file(self, path, mode):

return f

def _reproducible_mtime(self):
    """Return the fixed modification time used for reproducible archives.

    The value is always 0. It is passed as the gzip header mtime when the
    reproducible tar.gz file is opened and as the mtime of every tar entry
    when ``reproducible_tar`` is enabled, so neither depends on when the
    archive was built or when the source files were last modified.
    """
    return 0

@abc.abstractmethod
def close(self):
pass
Expand Down Expand Up @@ -542,6 +550,33 @@ def _get_checksums(self, path):
return checksums


class ReproducibleTGZFile(tarfile.TarFile):
    """A gzip-compressed ``TarFile`` with a deterministic gzip header.

    The gzip stream is created with an empty file name, so no FNAME field is
    written to the header, and with the caller-supplied ``mtime``. Given the
    same tar members, ``compresslevel`` and ``mtime``, the gzip header
    therefore does not change between runs or destination paths.
    """

    def __init__(
        self, name=None, mode=None, compresslevel=-1, fileobj=None, mtime=None, **kwargs
    ):
        """Open a tar archive wrapped in a gzip stream.

        :param name: Path of the archive; only used to open the file when
            ``fileobj`` is not given.
        :param mode: ``TarFile`` mode (for example ``"w"``). ``None`` is
            treated as ``"w"``.
        :param compresslevel: Compression level handed to ``gzip.GzipFile``.
        :param fileobj: Optional binary file object to write to. When given,
            it stays owned by the caller and is NOT closed by this class.
        :param mtime: Timestamp stored in the gzip header. Pass a constant
            (for example ``0``) for reproducible output; ``None`` lets
            ``gzip`` use the current time.
        :param kwargs: Extra keyword arguments forwarded to ``TarFile``.
        """
        if mode is None:
            # ``mode + "b"`` below would raise TypeError on None.
            mode = "w"

        # Only close the raw file object if it was opened here; a file object
        # supplied by the caller remains the caller's to close.
        owns_fileobj = fileobj is None
        if owns_fileobj:
            fileobj = open(name, mode + "b")

        try:
            # Output filename is intentionally empty to exclude it from the gzip header.
            gzipfileobj = gzip.GzipFile("", mode, compresslevel, fileobj, mtime)
        except Exception:
            if owns_fileobj:
                fileobj.close()
            raise

        if owns_fileobj:
            # Let GzipFile.close() also close the file that was opened above.
            gzipfileobj.myfileobj = fileobj

        try:
            super(ReproducibleTGZFile, self).__init__(mode=mode, fileobj=gzipfileobj, **kwargs)
        except Exception:
            gzipfileobj.close()
            raise

        # TarFile treats a passed-in fileobj as external and leaves it open;
        # clear the flag so that TarFile.close() closes the GzipFile as well.
        self._extfileobj = False
glennpratt marked this conversation as resolved.
Show resolved Hide resolved


class TarArchive(Archive):
def __init__(self, module):
super(TarArchive, self).__init__(module)
Expand All @@ -562,7 +597,11 @@ def contains(self, name):
return True

def open(self):
if self.format in ('gz', 'bz2'):
if self.reproducible_tar and self.format == "gz":
self.file = ReproducibleTGZFile(
_to_native_ascii(self.destination), "w", mtime=self._reproducible_mtime()
)
elif self.format in ('gz', 'bz2'):
self.file = tarfile.open(_to_native_ascii(self.destination), 'w|' + self.format)
# python3 tarfile module allows xz format but for python2 we have to create the tarfile
# in memory and then compress it with lzma.
Expand All @@ -575,16 +614,34 @@ def open(self):
self.module.fail_json(msg="%s is not a valid archive format" % self.format)

def _add(self, path, archive_name):
def py27_filter(tarinfo):
return None if matches_exclusion_patterns(tarinfo.name, self.exclusion_patterns) else tarinfo
def filter(tarinfo):
# type: (tarfile.TarInfo) -> tarfile.TarInfo | None
if matches_exclusion_patterns(tarinfo.name, self.exclusion_patterns):
return None

if self.reproducible_tar:
# Remove unused backref that prevents copy
if hasattr(tarinfo, "tarfile"):
delattr(tarinfo, "tarfile")

if tarinfo.isdir():
mode = 0o40000 | 0o755
else:
mode = 0o100000 | (0o755 if tarinfo.mode & 0o100 else 0o644)

# Copy tarfile while reducing metadata
return tarinfo.replace(
mtime=self._reproducible_mtime(),
mode=mode,
uid=0,
gid=0,
uname="",
gname="",
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this is a very special interpretation of 'reproducible tarfile' IMO. Also this is potentially dangerous since files that are protected (only readable by specific users/groups) before archiving are suddenly publicly readable after extraction.

Maybe it would be better to make the level of reproducibility configurable? On the other hand, that would make the interface also pretty complicated.

I guess this needs to be discussed first. Maybe create a thread in https://forum.ansible.com/c/project/collection-development/27 for that?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

@glennpratt glennpratt Aug 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll make a thread when I have more time; for now, I'll just leave some initial notes here.

  • The danger is on extract of course.
  • Unless you are extracting into a path that's already readable, it won't become readable:
     root@dev-master-1:~# tar -xzf coreutils-0.0.27-x86_64-unknown-linux-musl.tar.gz
     root@dev-master-1:~# sudo -u nobody ls /root/coreutils-0.0.27-x86_64-unknown-linux-musl
     ls: cannot access '/root/coreutils-0.0.27-x86_64-unknown-linux-musl': Permission denied
     root@dev-master-1:~# sudo -u nobody ls /root/coreutils-0.0.27-x86_64-unknown-linux-musl/LICENSE
     ls: cannot access '/root/coreutils-0.0.27-x86_64-unknown-linux-musl/LICENSE': Permission denied
    
  • Extracting files as root (common with Ansible) that carry odd source attributes is also potentially dangerous: a UID collision leads to write privilege escalation from non-root. That is what was happening for me without this change, because Ansible on localhost is not running as root.


def py26_filter(path):
return matches_exclusion_patterns(path, self.exclusion_patterns)
return tarinfo

if PY27:
self.file.add(path, archive_name, recursive=False, filter=py27_filter)
else:
self.file.add(path, archive_name, recursive=False, exclude=py26_filter)
self.file.add(path, archive_name, recursive=False, filter=filter)

def _get_checksums(self, path):
if HAS_LZMA:
Expand Down Expand Up @@ -637,6 +694,7 @@ def main():
exclusion_patterns=dict(type='list', elements='path'),
force_archive=dict(type='bool', default=False),
remove=dict(type='bool', default=False),
reproducible_tar=dict(type="bool", default=False),
),
add_file_common_args=True,
supports_check_mode=True,
Expand Down
1 change: 1 addition & 0 deletions tests/unit/plugins/modules/test_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_archive_removal_safety(self):
exclusion_patterns=dict(type='list', elements='path'),
force_archive=dict(type='bool', default=False),
remove=dict(type='bool', default=False),
reproducible_tar=dict(type="bool", default=False),
),
add_file_common_args=True,
supports_check_mode=True,
Expand Down