From c6bba8e6d3d9874191dfea7e9e440bc8183bb7b4 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 17 Jan 2024 09:31:47 +0100 Subject: [PATCH 1/4] Only use Python-isal for levels 1 and 2 --- src/xopen/__init__.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index ff7b2f2..3c6e883 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1021,17 +1021,17 @@ def _open_gz( # noqa: C901 compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION if threads != 0: - if igzip_threaded: - try: - return igzip_threaded.open( # type: ignore - filename, - mode, - compresslevel, - **text_mode_kwargs, - threads=1, - ) - except ValueError: # Wrong compression level - pass + # igzip level 0 compresses, while zlib outputs data in uncompressed + # deflate format. igzip level 3 is similar in size to 1 and 2 and slower. + # Other libraries do a better job at these levels. + if igzip_threaded and (compresslevel in (1, 2) or "r" in mode): + return igzip_threaded.open( # type: ignore + filename, + mode, + compresslevel, + **text_mode_kwargs, + threads=1, + ) if gzip_ng_threaded and zlib_ng: try: return gzip_ng_threaded.open( @@ -1099,12 +1099,8 @@ def _open_reproducible_gzip(filename, mode: str, compresslevel: int): mtime=0, ) gzip_file = None - if igzip is not None: - try: - gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) - except ValueError: - # Compression level not supported, move to built-in gzip. - pass + if igzip is not None and (compresslevel in (1, 2) or "r" in mode): + gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) elif gzip_ng is not None: # Compression level should be at least 2 for zlib-ng to prevent very big files. gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2)) From db36947fa903e6ec859fae2f76d69442a373d634 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 17 Jan 2024 09:33:04 +0100 Subject: [PATCH 2/4] Simplify logic in open reproducible gzip --- src/xopen/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 3c6e883..4970301 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1098,14 +1098,12 @@ def _open_reproducible_gzip(filename, mode: str, compresslevel: int): mode=mode, mtime=0, ) - gzip_file = None if igzip is not None and (compresslevel in (1, 2) or "r" in mode): gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) elif gzip_ng is not None: # Compression level should be at least 2 for zlib-ng to prevent very big files. gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2)) - - if gzip_file is None: + else: gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel) # type: ignore # When (I)GzipFile is created with a fileobj instead of a filename, # the passed file object is not closed when (I)GzipFile.close() From b53ec1937857869f3deef589b76a5d9843197bc4 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 17 Jan 2024 09:34:53 +0100 Subject: [PATCH 3/4] Make sure a comment is added in both instances --- src/xopen/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 4970301..b2fa021 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1021,9 +1021,8 @@ def _open_gz( # noqa: C901 compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION if threads != 0: - # igzip level 0 compresses, while zlib outputs data in uncompressed - # deflate format. igzip level 3 is similar in size to 1 and 2 and slower. - # Other libraries do a better job at these levels. + # Igzip level 0 does not output uncompressed deflate blocks as zlib does + # and level 3 is slower but does not compress better than level 1 and 2. if igzip_threaded and (compresslevel in (1, 2) or "r" in mode): return igzip_threaded.open( # type: ignore filename, @@ -1098,6 +1097,8 @@ def _open_reproducible_gzip(filename, mode: str, compresslevel: int): mode=mode, mtime=0, ) + # Igzip level 0 does not output uncompressed deflate blocks as zlib does + # and level 3 is slower but does not compress better than level 1 and 2. if igzip is not None and (compresslevel in (1, 2) or "r" in mode): gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) elif gzip_ng is not None: From 522fe1cca8d42f19c032e10d8f3735f1677adef0 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 17 Jan 2024 09:58:40 +0100 Subject: [PATCH 4/4] Fix a bug where level 0 could not be used by zlib-ng --- src/xopen/__init__.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index b2fa021..b23b17e 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1036,10 +1036,9 @@ def _open_gz( # noqa: C901 return gzip_ng_threaded.open( filename, mode, - # zlib-ng level 1 is 50% bigger than zlib level 1. - # This will be wildly outside user ballpark expectations, so - # increase the level - max(compresslevel, 2), + # zlib-ng level 1 is 50% bigger than zlib level 1. Level + # 2 gives a size close to expectations. + compresslevel=2 if compresslevel == 1 else compresslevel, **text_mode_kwargs, threads=threads or max(_available_cpu_count(), 4), ) @@ -1102,8 +1101,10 @@ def _open_reproducible_gzip(filename, mode: str, compresslevel: int): if igzip is not None and (compresslevel in (1, 2) or "r" in mode): gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) elif gzip_ng is not None: - # Compression level should be at least 2 for zlib-ng to prevent very big files. - gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2)) + # Zlib-ng level 1 creates much bigger files than zlib level 1. + gzip_file = gzip_ng.GzipNGFile( + **kwargs, compresslevel=2 if compresslevel == 1 else compresslevel + ) else: gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel) # type: ignore # When (I)GzipFile is created with a fileobj instead of a filename,