From 6d2cb3b56da3daa97e1c9632fff3f313617c09a6 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 12:29:28 +0100 Subject: [PATCH 01/12] Remove redundant if clauses --- src/xopen/__init__.py | 69 ++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 287176e..b89056b 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1060,41 +1060,42 @@ def _open_gz( # noqa: C901 assert mode in ("rt", "rb", "wt", "wb", "at", "ab") # With threads == 0 igzip_threaded defers to igzip.open, but that is not # desirable as a reproducible header is required. - if igzip_threaded and threads != 0: - try: - return igzip_threaded.open( # type: ignore - filename, - mode, - isal_zlib.ISAL_DEFAULT_COMPRESSION # type: ignore - if compresslevel is None - else compresslevel, - **text_mode_kwargs, - threads=1 if threads is None else threads, - ) - except ValueError: # Wrong compression level - pass - if gzip_ng_threaded and zlib_ng and threads != 0: - try: - if compresslevel is None: - level = zlib_ng.Z_DEFAULT_COMPRESSION - elif compresslevel == 1: - # zlib-ng level 1 is 50% bigger than zlib level 1. - # This will be wildly outside user ballpark expectations, so - # increase the level - level = 2 - else: - level = compresslevel - - return gzip_ng_threaded.open( - filename, - mode, - level, - **text_mode_kwargs, - threads=1 if threads is None else threads, - ) - except zlib_ng.error: # Bad compression level - pass + if threads != 0: + if igzip_threaded: + try: + return igzip_threaded.open( # type: ignore + filename, + mode, + isal_zlib.ISAL_DEFAULT_COMPRESSION # type: ignore + if compresslevel is None + else compresslevel, + **text_mode_kwargs, + threads=1 if threads is None else threads, + ) + except ValueError: # Wrong compression level + pass + if gzip_ng_threaded and zlib_ng: + try: + if compresslevel is None: + level = zlib_ng.Z_DEFAULT_COMPRESSION + elif compresslevel == 1: + # zlib-ng level 1 is 50% bigger than zlib level 1. + # This will be wildly outside user ballpark expectations, so + # increase the level + level = 2 + else: + level = compresslevel + + return gzip_ng_threaded.open( + filename, + mode, + level, + **text_mode_kwargs, + threads=1 if threads is None else threads, + ) + except zlib_ng.error: # Bad compression level + pass try: if "r" in mode: return _open_external_gzip_reader( From 2953d9c365178ebe414f5b1407f205dc06b33757 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 12:31:55 +0100 Subject: [PATCH 02/12] Remove external functions for opening for more clarity --- src/xopen/__init__.py | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index b89056b..180763c 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1032,28 +1032,6 @@ def _open_zst( # noqa: C901 return f -def _open_external_gzip_reader( - filename, mode, compresslevel, threads, **text_mode_kwargs -): - assert mode in ("rt", "rb") - try: - return PipedPigzReader(filename, mode, threads=threads, **text_mode_kwargs) - except OSError: - return PipedGzipReader(filename, mode, **text_mode_kwargs) - - -def _open_external_gzip_writer( - filename, mode, compresslevel, threads, **text_mode_kwargs -): - assert mode in ("wt", "wb", "at", "ab") - try: - return PipedPigzWriter( - filename, mode, compresslevel, threads=threads, **text_mode_kwargs - ) - except OSError: - return PipedGzipWriter(filename, mode, compresslevel, **text_mode_kwargs) - - def _open_gz( # noqa: C901 filename, mode: str, compresslevel, threads, **text_mode_kwargs ): @@ -1098,13 +1076,20 @@ def _open_gz( # noqa: C901 pass try: if "r" in mode: - return _open_external_gzip_reader( - filename, mode, compresslevel, threads, **text_mode_kwargs - ) + try: + return PipedPigzReader(filename, mode, threads=threads, + **text_mode_kwargs) + except OSError: + return PipedGzipReader(filename, mode, **text_mode_kwargs) else: - return _open_external_gzip_writer( - filename, mode, compresslevel, threads, **text_mode_kwargs - ) + try: + return PipedPigzWriter( + filename, mode, compresslevel, threads=threads, + **text_mode_kwargs + ) + except OSError: + return PipedGzipWriter(filename, mode, compresslevel, + **text_mode_kwargs) except OSError: pass # We try without threads. From 47817bd7e224a690250f8364c09bc44809b0be15 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 12:32:46 +0100 Subject: [PATCH 03/12] Remove redundant read special case --- src/xopen/__init__.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 180763c..3521a1a 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1093,13 +1093,6 @@ def _open_gz( # noqa: C901 except OSError: pass # We try without threads. - if "r" in mode: - if igzip is not None: - return igzip.open(filename, mode, **text_mode_kwargs) - elif gzip_ng is not None: - return gzip_ng.open(filename, mode, **text_mode_kwargs) - return gzip.open(filename, mode, **text_mode_kwargs) - g = _open_reproducible_gzip( filename, mode=mode[0] + "b", From 484e1318e55d84d905538b2731f2450de5d1a942 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 12:33:28 +0100 Subject: [PATCH 04/12] Black formatting --- src/xopen/__init__.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 3521a1a..a480a6a 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1077,19 +1077,24 @@ def _open_gz( # noqa: C901 try: if "r" in mode: try: - return PipedPigzReader(filename, mode, threads=threads, - **text_mode_kwargs) + return PipedPigzReader( + filename, mode, threads=threads, **text_mode_kwargs + ) except OSError: return PipedGzipReader(filename, mode, **text_mode_kwargs) else: try: return PipedPigzWriter( - filename, mode, compresslevel, threads=threads, - **text_mode_kwargs + filename, + mode, + compresslevel, + threads=threads, + **text_mode_kwargs, ) except OSError: - return PipedGzipWriter(filename, mode, compresslevel, - **text_mode_kwargs) + return PipedGzipWriter( + filename, mode, compresslevel, **text_mode_kwargs + ) except OSError: pass # We try without threads. From 6ba7f228574fdd0db5a6859d6d8a76d479fe80b7 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 12:59:11 +0100 Subject: [PATCH 05/12] Force defaults in a single place rather than anywhere --- src/xopen/__init__.py | 63 +++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index a480a6a..4c65853 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -33,7 +33,6 @@ import subprocess import tempfile import time -import zlib from abc import ABC, abstractmethod from subprocess import Popen, PIPE, DEVNULL from typing import ( @@ -55,6 +54,9 @@ # 128K buffer size also used by cat, pigz etc. It is faster than the 8K default. BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) +# "xopen selects the most efficient method for reading or writing a compressed file." +XOPEN_DEFAULT_GZIP_COMPRESSION = 1 + igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] igzip_threaded: Optional[ModuleType] @@ -1036,8 +1038,12 @@ def _open_gz( # noqa: C901 filename, mode: str, compresslevel, threads, **text_mode_kwargs ): assert mode in ("rt", "rb", "wt", "wb", "at", "ab") - # With threads == 0 igzip_threaded defers to igzip.open, but that is not - # desirable as a reproducible header is required. + if compresslevel is None: + # Force the same compression level on every tool regardless of + # library defaults + compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION + if threads is None: + threads = 1 if threads != 0: if igzip_threaded: @@ -1045,32 +1051,23 @@ def _open_gz( # noqa: C901 return igzip_threaded.open( # type: ignore filename, mode, - isal_zlib.ISAL_DEFAULT_COMPRESSION # type: ignore - if compresslevel is None - else compresslevel, + compresslevel, **text_mode_kwargs, - threads=1 if threads is None else threads, + threads=threads, ) except ValueError: # Wrong compression level pass if gzip_ng_threaded and zlib_ng: try: - if compresslevel is None: - level = zlib_ng.Z_DEFAULT_COMPRESSION - elif compresslevel == 1: - # zlib-ng level 1 is 50% bigger than zlib level 1. - # This will be wildly outside user ballpark expectations, so - # increase the level - level = 2 - else: - level = compresslevel - return gzip_ng_threaded.open( filename, mode, - level, + # zlib-ng level 1 is 50% bigger than zlib level 1. + # This will be wildly outside user ballpark expectations, so + # increase the level + max(compresslevel, 2), **text_mode_kwargs, - threads=1 if threads is None else threads, + threads=threads, ) except zlib_ng.error: # Bad compression level pass @@ -1108,13 +1105,14 @@ def _open_gz( # noqa: C901 return g -def _open_reproducible_gzip(filename, mode, compresslevel): +def _open_reproducible_gzip(filename, mode: str, compresslevel: int): """ Open a gzip file for writing (without external processes) that has neither mtime nor the file name in the header (equivalent to gzip --no-name) """ assert mode in ("rb", "wb", "ab") + assert compresslevel is not None # Neither gzip.open nor igzip.open have an mtime option, and they will # always write the file name, so we need to open the file separately # and pass it to gzip.GzipFile/igzip.IGzipFile. @@ -1128,33 +1126,16 @@ def _open_reproducible_gzip(filename, mode, compresslevel): gzip_file = None if igzip is not None: try: - gzip_file = igzip.IGzipFile( - **kwargs, - compresslevel=isal_zlib.ISAL_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) + gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) except ValueError: # Compression level not supported, move to built-in gzip. pass elif gzip_ng is not None: - if compresslevel == 1: - level = 2 - elif compresslevel is None: - level = zlib_ng.Z_DEFAULT_COMPRESSION - else: - level = compresslevel - gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=level) + # Compression level should be at least 2 for zlib-ng to prevent very big files. + gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2)) if gzip_file is None: - gzip_file = gzip.GzipFile( - **kwargs, - # Override gzip.open's default of 9 for consistency - # with command-line gzip. - compresslevel=zlib.Z_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) + gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel) # type: ignore # When (I)GzipFile is created with a fileobj instead of a filename, # the passed file object is not closed when (I)GzipFile.close() # is called. This forces it to be closed. From e4856edc306b217f47c8f97cd73322862ba8b454 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 13:46:34 +0100 Subject: [PATCH 06/12] Set default gzip compression level to 6 --- src/xopen/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 4c65853..b6d6d7d 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -54,8 +54,8 @@ # 128K buffer size also used by cat, pigz etc. It is faster than the 8K default. BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) -# "xopen selects the most efficient method for reading or writing a compressed file." -XOPEN_DEFAULT_GZIP_COMPRESSION = 1 +# Compression level 6 is the default for most gzip applications +XOPEN_DEFAULT_GZIP_COMPRESSION = 6 igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] From d0758de8e01ade9585d190687c5db2a557353bd7 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 13:53:33 +0100 Subject: [PATCH 07/12] Do not set threads default globally in the function --- src/xopen/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index b6d6d7d..99021bb 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1042,8 +1042,6 @@ def _open_gz( # noqa: C901 # Force the same compression level on every tool regardless of # library defaults compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION - if threads is None: - threads = 1 if threads != 0: if igzip_threaded: @@ -1053,7 +1051,7 @@ def _open_gz( # noqa: C901 mode, compresslevel, **text_mode_kwargs, - threads=threads, + threads=threads or max(_available_cpu_count(), 4), ) except ValueError: # Wrong compression level pass @@ -1067,7 +1065,7 @@ def _open_gz( # noqa: C901 # increase the level max(compresslevel, 2), **text_mode_kwargs, - threads=threads, + threads=threads or max(_available_cpu_count(), 4) ) except zlib_ng.error: # Bad compression level pass From 4bebae81fef91a90ef410f765bb7791d6811278d Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 14:00:56 +0100 Subject: [PATCH 08/12] Add an entry to the changelog --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 8611aab..c07837e 100644 --- a/README.rst +++ b/README.rst @@ -113,6 +113,8 @@ Changelog in-development ~~~~~~~~~~~~~~~~~~~ +* #138: The gzip default compression level is now 1 when no value is provided + by the calling function. The default used to be determined by the backend. * #135: xopen now uses zlib-ng when available and applicable. v1.8.0 (2023-11-03) From 16794dd244ececc8110e74aaf06b5f75f611ac57 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 14:01:34 +0100 Subject: [PATCH 09/12] Choose level 1 as default --- src/xopen/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 99021bb..21105a0 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -54,8 +54,8 @@ # 128K buffer size also used by cat, pigz etc. It is faster than the 8K default. BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) -# Compression level 6 is the default for most gzip applications -XOPEN_DEFAULT_GZIP_COMPRESSION = 6 +# 1 is by far the fastest and most efficient level. +XOPEN_DEFAULT_GZIP_COMPRESSION = 1 igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] From a1205861229eb2163ad23b6e6032a17f7d693b77 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 14:01:59 +0100 Subject: [PATCH 10/12] Add forgotten comma --- src/xopen/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 21105a0..7c03644 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1065,7 +1065,7 @@ def _open_gz( # noqa: C901 # increase the level max(compresslevel, 2), **text_mode_kwargs, - threads=threads or max(_available_cpu_count(), 4) + threads=threads or max(_available_cpu_count(), 4), ) except zlib_ng.error: # Bad compression level pass From afaf77f9c55f23b01ee8ced2b746112cfe2c7b73 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 12 Jan 2024 14:03:58 +0100 Subject: [PATCH 11/12] Use one thread for igzip_threaded as that is more efficient than multiple --- src/xopen/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 7c03644..bfcdcb6 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1051,7 +1051,7 @@ def _open_gz( # noqa: C901 mode, compresslevel, **text_mode_kwargs, - threads=threads or max(_available_cpu_count(), 4), + threads=1, ) except ValueError: # Wrong compression level pass From 7ea3ae47e79510bafd62775eb7fa445132b6a337 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Tue, 16 Jan 2024 09:36:17 +0100 Subject: [PATCH 12/12] Remove redundant comment --- src/xopen/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index bfcdcb6..3d00694 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -54,7 +54,6 @@ # 128K buffer size also used by cat, pigz etc. It is faster than the 8K default. BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) -# 1 is by far the fastest and most efficient level. XOPEN_DEFAULT_GZIP_COMPRESSION = 1 igzip: Optional[ModuleType]