Skip to content

Commit

Permalink
Text wrapping occurs in xopen function rather than in helper functions
Browse files Browse the repository at this point in the history
  • Loading branch information
rhpvorderman committed Jan 17, 2024
1 parent 6d35655 commit 9e73de6
Showing 1 changed file with 48 additions and 110 deletions.
158 changes: 48 additions & 110 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
Optional,
Union,
TextIO,
AnyStr,
IO,
List,
Set,
Expand Down Expand Up @@ -193,9 +192,7 @@ def __init__(
"""
if mode not in ("w", "wb", "a", "ab"):
raise ValueError(
"Mode is '{}', but it must be 'w', 'wb', 'a', or 'ab'".format(
mode
)
"Mode is '{}', but it must be 'w', 'wb', 'a', or 'ab'".format(mode)
)

self.name: str = str(os.fspath(path))
Expand Down Expand Up @@ -470,11 +467,10 @@ class PipedGzipReader(PipedCompressionReader):
"""

def __init__(
self, path, mode: str = "r", *, encoding="utf-8", errors=None, newline=None
self,
path,
):
super().__init__(
path, ["gzip"]
)
super().__init__(path, ["gzip"])


class PipedGzipWriter(PipedCompressionWriter):
Expand Down Expand Up @@ -618,17 +614,8 @@ class PipedXzReader(PipedCompressionReader):
master branch.)
"""

def __init__(
self,
path,
threads: Optional[int] = None
):
super().__init__(
path,
["xz"],
"-T",
threads
)
def __init__(self, path, threads: Optional[int] = None):
super().__init__(path, ["xz"], "-T", threads)


class PipedXzWriter(PipedCompressionWriter):
Expand Down Expand Up @@ -658,14 +645,7 @@ def __init__(
and compresslevel not in self._accepted_compression_levels
):
raise ValueError("compresslevel must be between 0 and 9")
super().__init__(
path,
["xz"],
mode,
compresslevel,
"-T",
threads
)
super().__init__(path, ["xz"], mode, compresslevel, "-T", threads)


class PipedIGzipReader(PipedCompressionReader):
Expand All @@ -676,9 +656,7 @@ class PipedIGzipReader(PipedCompressionReader):
architecture-specific optimizations as a result.
"""

def __init__(
self, path
):
def __init__(self, path):
if not _can_read_concatenated_gz("igzip"):
# Instead of elaborate version string checking once the problem is
# fixed, it is much easier to use this, "proof in the pudding" type
Expand All @@ -688,9 +666,7 @@ def __init__(
"concatenated gzip files and is therefore not "
"safe to use. See: https://github.com/intel/isa-l/issues/143"
)
super().__init__(
path, ["igzip"]
)
super().__init__(path, ["igzip"])


class PipedZstdReader(PipedCompressionReader):
Expand Down Expand Up @@ -778,21 +754,14 @@ def __init__(

class PipedPythonIsalReader(PipedCompressionReader):
def __init__(
self, path,
self,
path,
):
super().__init__(
path,
[sys.executable, "-m", "isal.igzip"]
)
super().__init__(path, [sys.executable, "-m", "isal.igzip"])


class PipedPythonIsalWriter(PipedCompressionWriter):
def __init__(
self,
path,
mode: str = "wb",
compresslevel: Optional[int] = None
):
def __init__(self, path, mode: str = "wb", compresslevel: Optional[int] = None):
if compresslevel is not None and compresslevel not in range(0, 4):
raise ValueError("compresslevel must be between 0 and 3")
super().__init__(
Expand All @@ -803,52 +772,48 @@ def __init__(
)


def _open_stdin_or_out(mode: str, **text_mode_kwargs) -> IO:
# Do not return sys.stdin or sys.stdout directly as we want the returned object
# to be closable without closing sys.stdout.
std = dict(r=sys.stdin, w=sys.stdout)[mode[0]]
return open(std.fileno(), mode=mode, closefd=False, **text_mode_kwargs)
def _open_stdin_or_out(mode: str):
assert "b" in mode
std = sys.stdout if "w" in mode else sys.stdin
return open(std.fileno(), mode=mode, closefd=False)


def _open_bz2(filename, mode: str, threads: Optional[int], **text_mode_kwargs):
def _open_bz2(filename, mode: str, threads: Optional[int]):
assert "b" in mode
if threads != 0:
try:
if "r" in mode:
return PipedPBzip2Reader(filename, mode, threads, **text_mode_kwargs)
return PipedPBzip2Reader(filename, threads)
else:
return PipedPBzip2Writer(filename, mode, threads, **text_mode_kwargs)
return PipedPBzip2Writer(filename, mode, threads)
except OSError:
pass # We try without threads.

return bz2.open(filename, mode, **text_mode_kwargs)
return bz2.open(filename, mode)


def _open_xz(
filename,
mode: str,
compresslevel: Optional[int],
threads: Optional[int],
**text_mode_kwargs,
):
if compresslevel is None:
compresslevel = 6

if threads != 0:
try:
if "r" in mode:
return PipedXzReader(filename, mode, threads, **text_mode_kwargs)
return PipedXzReader(filename, threads)
else:
return PipedXzWriter(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
return PipedXzWriter(filename, mode, compresslevel, threads)
except OSError:
pass # We try without threads.

return lzma.open(
filename,
mode,
preset=compresslevel if "w" in mode else None,
**text_mode_kwargs,
)


Expand All @@ -857,19 +822,17 @@ def _open_zst( # noqa: C901
mode: str,
compresslevel: Optional[int],
threads: Optional[int],
**text_mode_kwargs,
):
assert "b" in mode
assert compresslevel != 0
if compresslevel is None:
compresslevel = 3
if threads != 0:
try:
if "r" in mode:
return PipedZstdReader(filename, mode, **text_mode_kwargs)
return PipedZstdReader(filename)
else:
return PipedZstdWriter(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
return PipedZstdWriter(filename, mode, compresslevel, threads)
except OSError:
if zstandard is None:
# No fallback available
Expand All @@ -881,12 +844,7 @@ def _open_zst( # noqa: C901
cctx = zstandard.ZstdCompressor(level=compresslevel)
else:
cctx = None
f = zstandard.open(
filename,
mode,
cctx=cctx,
**text_mode_kwargs,
)
f = zstandard.open(filename, mode, cctx=cctx)
if mode == "rb":
return io.BufferedReader(f)
elif mode == "wb":
Expand All @@ -897,7 +855,7 @@ def _open_zst( # noqa: C901
def _open_gz( # noqa: C901
filename, mode: str, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("rt", "rb", "wt", "wb", "at", "ab")
assert "b" in mode
if compresslevel is None:
# Force the same compression level on every tool regardless of
# library defaults
Expand All @@ -911,7 +869,6 @@ def _open_gz( # noqa: C901
filename,
mode,
compresslevel,
**text_mode_kwargs,
threads=1,
)
if gzip_ng_threaded and zlib_ng:
Expand All @@ -922,43 +879,31 @@ def _open_gz( # noqa: C901
# zlib-ng level 1 is 50% bigger than zlib level 1. Level
# 2 gives a size close to expectations.
compresslevel=2 if compresslevel == 1 else compresslevel,
**text_mode_kwargs,
threads=threads or max(_available_cpu_count(), 4),
)
except zlib_ng.error: # Bad compression level
pass
try:
if "r" in mode:
try:
return PipedPigzReader(
filename, mode, threads=threads, **text_mode_kwargs
)
return PipedPigzReader(filename, threads=threads)
except OSError:
return PipedGzipReader(filename, mode, **text_mode_kwargs)
return PipedGzipReader(filename)
else:
try:
return PipedPigzWriter(
filename,
mode,
compresslevel,
threads=threads,
**text_mode_kwargs,
filename, mode, compresslevel, threads=threads
)
except OSError:
return PipedGzipWriter(
filename, mode, compresslevel, **text_mode_kwargs
)
return PipedGzipWriter(filename, mode, compresslevel)
except OSError:
pass # We try without threads.

g = _open_reproducible_gzip(
return _open_reproducible_gzip(
filename,
mode=mode[0] + "b",
mode=mode,
compresslevel=compresslevel,
)
if "t" in mode:
return io.TextIOWrapper(g, **text_mode_kwargs)
return g


def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
Expand Down Expand Up @@ -1126,18 +1071,13 @@ def xopen( # noqa: C901 # The function is complex, but readable.
"""
if mode in ("r", "w", "a"):
mode += "t" # type: ignore
binary_mode = mode + "b"
else:
binary_mode = mode
if mode not in ("rt", "rb", "wt", "wb", "at", "ab"):
raise ValueError("Mode '{}' not supported".format(mode))
filename = os.fspath(filename)

if "b" in mode:
# Do not pass encoding etc. in binary mode as this raises errors.
text_mode_kwargs = dict()
else:
text_mode_kwargs = dict(encoding=encoding, errors=errors, newline=newline)
if filename == "-":
return _open_stdin_or_out(mode, **text_mode_kwargs)

if format not in (None, "gz", "xz", "bz2", "zst"):
raise ValueError(
f"Format not supported: {format}. "
Expand All @@ -1147,22 +1087,18 @@ def xopen( # noqa: C901 # The function is complex, but readable.
if detected_format is None and "w" not in mode:
detected_format = _detect_format_from_content(filename)

if detected_format == "gz":
opened_file = _open_gz(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
if filename == "-":
opened_file = _open_stdin_or_out(binary_mode)
elif detected_format == "gz":
opened_file = _open_gz(filename, binary_mode, compresslevel, threads)
elif detected_format == "xz":
opened_file = _open_xz(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
opened_file = _open_xz(filename, binary_mode, compresslevel, threads)
elif detected_format == "bz2":
opened_file = _open_bz2(filename, mode, threads, **text_mode_kwargs)
opened_file = _open_bz2(filename, binary_mode, threads)
elif detected_format == "zst":
opened_file = _open_zst(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
opened_file = _open_zst(filename, binary_mode, compresslevel, threads)
else:
opened_file = open(filename, mode, **text_mode_kwargs) # type: ignore
opened_file = open(filename, binary_mode) # type: ignore

# The "write" method for GzipFile is very costly. Lots of python calls are
# made. To a lesser extent this is true for LzmaFile and BZ2File. By
Expand All @@ -1176,4 +1112,6 @@ def xopen( # noqa: C901 # The function is complex, but readable.
opened_file = io.BufferedWriter(
opened_file, buffer_size=BUFFER_SIZE # type: ignore
)
if "t" in mode:
return io.TextIOWrapper(opened_file, encoding, errors, newline)
return opened_file

0 comments on commit 9e73de6

Please sign in to comment.