feat!: Change default character error handling to "surrogateescape"

This addresses long-standing ergonomic issue #43 when dealing with files that have various or unknown character encodings. Previously, the library assumed both input and output files should be UTF-8, and it failed when this was incorrect, forcing the user to provide the appropriate character encoding. After this commit, UTF-8 is still the default input/output encoding, but the default error handling has changed from "strict" to "surrogateescape", i.e. bytes that are not valid UTF-8 will be read as lone Unicode surrogate code points (U+DC80–U+DCFF), which will be turned back into the original bytes on output. To get the previous behaviour, use `SSAFile.load(..., errors=None)` and `SSAFile.save(..., errors=None)`. For text processing, you should still specify the encoding explicitly, otherwise you will get surrogate code points instead of non-ASCII characters when inspecting the SSAFile. Note that multi-byte encodings may still break the parser; parsing with surrogate escapes will work best with ASCII-like encodings.
tkarabela · May 5, 2024 · 5c98e6c · 5c98e6c
1 parent ed2257e
commit 5c98e6c
Show file tree

Hide file tree

Showing 5 changed files with 253 additions and 16 deletions.
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -30,20 +30,40 @@ Now that we have a real file on the harddrive, let's import pysubs2 and load it.
     >>> subs
     <SSAFile with 2 events and 1 styles, last timestamp 0:02:00>
 
-.. tip:: By default, pysubs2 uses UTF-8 encoding when reading and writing files. Use the ``encoding`` keyword argument in case you need something else.
+.. note::
+   By default, pysubs2 uses UTF-8 encoding when reading and writing files, with ``surrogateescape`` error handling.
+   This works best if your file is either:
+
+      * in UTF-8 encoding or
+      * in a similar ASCII-like encoding (like ``latin-1``) and you don't need to work with the text (only convert subtitle format, shift time, etc.).
+
+   Use the ``encoding`` and ``errors`` keyword arguments in the :meth:`pysubs2.SSAFile.load()` and :meth:`pysubs2.SSAFile.save()` methods in case you need something else,
+   or you can do the processing yourself and work only with ``str`` using :meth:`pysubs2.SSAFile.from_string()` and :meth:`pysubs2.SSAFile.to_string()`.
+
+   If you use the default settings, you can get the input ``bytes`` for a particular subtitle using:
+
+   >>> subs[0].text.encode("utf-8", "surrogateescape")
 
 Now we have a subtitle file, the :class:`pysubs2.SSAFile` object. It has two "events", ie. subtitles. You can treat ``subs`` as a list:
 
     >>> subs[0].text
     "Once upon a time,"
     >>> for line in subs:
     ...     print(line.text)
-    "Once upon a time,"
-    "there was a SubRip file\\Nwith two subtitles."
+    Once upon a time,
+    there was a SubRip file\\Nwith two subtitles.
 
 Individual subtitles are :class:`pysubs2.SSAEvent` objects and have the attributes you'd expect, like ``start``, ``end`` and ``text``. Notice that the second subtitle text doesn't contain a newline, but literal "backslash N", which is how SubStation represents newlines. There could also be override tags like ``{\i1}`` for italics.
 
-.. tip:: If you don't entertain SubStation, there is also a :attr:`pysubs2.SSAEvent.plaintext` property which hides override tags and translates newlines for you. Be warned, however, that writing to this property throws away any override tags.
+.. tip::
+   If you don't entertain SubStation, there is also a :attr:`pysubs2.SSAEvent.plaintext` property which hides override tags
+   and translates newlines for you. Be warned, however, that writing to this property throws away any override tags.
+
+    >>> for line in subs:
+    ...     print(line.plaintext)
+    Once upon a time,
+    there was a SubRip file
+    with two subtitles.
 
 Working with timing
 -------------------

diff --git a/pysubs2/cli.py b/pysubs2/cli.py
@@ -3,7 +3,7 @@
 import os
 import re
 import os.path as op
-import io
+from io import TextIOWrapper
 import sys
 from textwrap import dedent
 from typing import List
@@ -81,6 +81,12 @@ def __init__(self) -> None:
                                  "If you wish to convert between encodings, make sure --input-enc is set correctly! "
                                  "Otherwise, your output files will probably be corrupted. It's a good idea to "
                                  "back up your files or use the -o option.")
+        parser.add_argument("--enc-error-handling", choices=("strict", "surrogateescape"),
+                            default="surrogateescape",
+                            help="Character encoding error handling for input and output. Defaults to 'surrogateescape' "
+                                 "which passes through unrecognized characters to output unchanged. Use 'strict' if "
+                                 "you want the command to fail when encountering a character incompatible with selected "
+                                 "input/output encoding.")
         parser.add_argument("--fps", metavar="FPS", type=positive_float,
                             help="This argument specifies framerate for MicroDVD files. By default, framerate "
                                  "is detected from the file. Use this when framerate specification is missing "
@@ -159,7 +165,7 @@ def main(self, argv: List[str]) -> int:
                     print("Skipping", path, "(not a file)")
                     errors += 1
                 else:
-                    with open(path, encoding=args.input_enc) as infile:
+                    with open(path, encoding=args.input_enc, errors=args.enc_error_handling) as infile:
                         subs = SSAFile.from_file(infile, args.input_format, args.fps, **extra_input_args)
 
                     self.process(subs, args)
@@ -178,12 +184,12 @@ def main(self, argv: List[str]) -> int:
                         _, filename = op.split(outpath)
                         outpath = op.join(args.output_dir, filename)
 
-                    with open(outpath, "w", encoding=args.output_enc) as outfile:
+                    with open(outpath, "w", encoding=args.output_enc, errors=args.enc_error_handling) as outfile:
                         subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean,
                                      **extra_output_args)
         elif not sys.stdin.isatty():
-            infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
-            outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
+            infile = TextIOWrapper(sys.stdin.buffer, encoding=args.input_enc, errors=args.enc_error_handling)
+            outfile = TextIOWrapper(sys.stdout.buffer, encoding=args.output_enc, errors=args.enc_error_handling)
 
             subs = SSAFile.from_file(infile, args.input_format, args.fps)
             self.process(subs, args)

diff --git a/pysubs2/ssafile.py b/pysubs2/ssafile.py
@@ -50,7 +50,7 @@ def __init__(self) -> None:
 
     @classmethod
     def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None, fps: Optional[float] = None,
-             **kwargs: Any) -> "SSAFile":
+             errors: Optional[str] = "surrogateescape", **kwargs: Any) -> "SSAFile":
         """
         Load subtitle file from given path.
 
@@ -65,6 +65,15 @@ def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None,
             path (str): Path to subtitle file.
             encoding (str): Character encoding of input file.
                 Defaults to UTF-8, you may need to change this.
+            errors (Optional[str]): Error handling for character encoding
+                of input file. Defaults to ``"surrogateescape"``. See documentation
+                of builtin ``open()`` function for more.
+
+                .. versionchanged:: 2.0.0
+                    The ``errors`` parameter was introduced to facilitate
+                    pass-through of subtitle files with unknown text encoding.
+                    Previous versions of the library behaved as if ``errors=None``.
+
             format_ (str): Optional, forces use of specific parser
                 (eg. `"srt"`, `"ass"`). Otherwise, format is detected
                 automatically from file contents. This argument should
@@ -93,11 +102,11 @@ def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None,
 
         Example:
             >>> subs1 = pysubs2.load("subrip-subtitles.srt")
-            >>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976)
-            >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True)
+            >>> subs2 = pysubs2.load("microdvd-subtitles.sub",fps=23.976)
+            >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt",keep_unknown_html_tags=True)
 
         """
-        with open(path, encoding=encoding) as fp:
+        with open(path, encoding=encoding, errors=errors) as fp:
             return cls.from_file(fp, format_, fps=fps, **kwargs)
 
     @classmethod
@@ -181,7 +190,7 @@ def from_file(cls, fp: TextIO, format_: Optional[str] = None, fps: Optional[floa
         return subs
 
     def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None, fps: Optional[float] = None,
-             **kwargs: Any) -> None:
+             errors: Optional[str] = "surrogateescape", **kwargs: Any) -> None:
         """
         Save subtitle file to given path.
 
@@ -208,6 +217,15 @@ def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None
                 different framerate, use this argument. See also
                 :meth:`SSAFile.transform_framerate()` for fixing bad
                 frame-based to time-based conversions.
+            errors (Optional[str]): Error handling for character encoding,
+                defaults to ``"surrogateescape"``. See documentation
+                of builtin ``open()`` function for more.
+
+                .. versionchanged:: 2.0.0
+                    The ``errors`` parameter was introduced to facilitate
+                    pass-through of subtitle files with unknown text encoding.
+                    Previous versions of the library behaved as if ``errors=None``.
+
             kwargs: Extra options for the writer.
 
         Raises:
@@ -222,7 +240,7 @@ def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None
             ext = os.path.splitext(path)[1].lower()
             format_ = get_format_identifier(ext)
 
-        with open(path, "w", encoding=encoding) as fp:
+        with open(path, "w", encoding=encoding, errors=errors) as fp:
             self.to_file(fp, format_, fps=fps, **kwargs)
 
     def to_string(self, format_: str, fps: Optional[float] = None, **kwargs: Any) -> str:

diff --git a/tests/formats/test_subrip.py b/tests/formats/test_subrip.py
@@ -2,7 +2,8 @@
 pysubs2.formats.subrip tests
 
 """
-
+import os.path as op
+import tempfile
 from textwrap import dedent
 import pytest
 
@@ -302,3 +303,168 @@ def test_overflow_timestamp_write() -> None:
         text = ref.to_string("srt")
     subs = SSAFile.from_string(text)
     assert subs[0].end == MAX_REPRESENTABLE_TIME
+
+
+def test_win1250_passthrough_with_surrogateescape() -> None:
+    input_text = dedent("""\
+    1
+    00:00:00,000 --> 00:01:00,000
+    The quick brown fox jumps over the lazy dog
+
+    2
+    00:01:00,000 --> 00:02:00,000
+    Příliš žluťoučký kůň úpěl ďábelské ódy
+    
+    """)
+
+    input_bytes_win1250 = input_text.encode("windows-1250")
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        input_path = op.join(temp_dir, "input.srt")
+        output_path = op.join(temp_dir, "output.srt")
+        with open(input_path, "wb") as fp:
+            fp.write(input_bytes_win1250)
+
+        with pytest.raises(UnicodeDecodeError):
+            # legacy behaviour
+            SSAFile.load(input_path, errors=None)
+
+        subs = SSAFile.load(input_path)
+
+        assert subs[0].text == "The quick brown fox jumps over the lazy dog"
+        assert subs[1].text.startswith("P") and subs[1].text.endswith("dy")
+
+        subs.save(output_path)
+
+        with open(output_path, "rb") as fp:
+            output_bytes = fp.read()
+
+        assert input_bytes_win1250 == output_bytes
+
+
+def test_multiencoding_passthrough_with_surrogateescape() -> None:
+    input_text = dedent("""\
+    1
+    00:00:00,000 --> 00:01:00,000
+    The quick brown fox jumps over the lazy dog""")
+
+    input_bytes = input_text.encode("ascii")
+    input_bytes += b"\n" + "Příliš žluťoučký kůň úpěl ďábelské ódy".encode("windows-1250")
+    input_bytes += b"\n" + "Vamp quäkt: Grüß Felix bzw. Jody schön!".encode("utf-8")
+    input_bytes += b"\n" + "日本国".encode("shift-jis")
+    input_bytes += b"\n" + "道德經".encode("big5")
+    input_bytes += b"\n\n"
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        input_path = op.join(temp_dir, "input.srt")
+        output_path = op.join(temp_dir, "output.srt")
+        with open(input_path, "wb") as fp:
+            fp.write(input_bytes)
+
+        with pytest.raises(UnicodeDecodeError):
+            # legacy behaviour
+            SSAFile.load(input_path, errors=None)
+
+        subs = SSAFile.load(input_path)
+
+        assert subs[0].text.startswith("The quick brown fox jumps over the lazy dog")
+        assert "Felix bzw. Jody" in subs[0].text
+
+        subs.save(output_path)
+
+        with open(output_path, "rb") as fp:
+            output_bytes = fp.read()
+
+        assert input_bytes == output_bytes
+
+
+def test_utf8_read_write() -> None:
+    input_text = dedent("""\
+    1
+    00:00:00,000 --> 00:01:00,000
+    The quick brown fox jumps over the lazy dog
+    Příliš žluťoučký kůň úpěl ďábelské ódy
+    Vamp quäkt: Grüß Felix bzw. Jody schön!
+    日本国
+    道德經
+    
+    """)
+
+    input_bytes = input_text.encode("utf-8")
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        input_path = op.join(temp_dir, "input.srt")
+        output_path = op.join(temp_dir, "output.srt")
+        with open(input_path, "wb") as fp:
+            fp.write(input_bytes)
+
+        # legacy behaviour
+        subs_legacy = SSAFile.load(input_path, errors=None)
+        subs = SSAFile.load(input_path)
+        assert subs.equals(subs_legacy)
+
+        subs.save(output_path)
+
+        with open(output_path, "rb") as fp:
+            output_bytes = fp.read()
+
+        assert input_bytes == output_bytes
+
+
+def test_win1250_read_write() -> None:
+    input_text = dedent("""\
+    1
+    00:00:00,000 --> 00:01:00,000
+    The quick brown fox jumps over the lazy dog
+    Příliš žluťoučký kůň úpěl ďábelské ódy
+
+    """)
+
+    input_bytes = input_text.encode("windows-1250")
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        input_path = op.join(temp_dir, "input.srt")
+        output_path = op.join(temp_dir, "output.srt")
+        with open(input_path, "wb") as fp:
+            fp.write(input_bytes)
+
+        # legacy behaviour
+        subs_legacy = SSAFile.load(input_path, encoding="windows-1250", errors=None)
+        subs = SSAFile.load(input_path, encoding="windows-1250")
+        assert subs.equals(subs_legacy)
+
+        subs.save(output_path, encoding="windows-1250")
+
+        with open(output_path, "rb") as fp:
+            output_bytes = fp.read()
+
+        assert input_bytes == output_bytes
+
+
+def test_big5_read_write() -> None:
+    input_text = dedent("""\
+    1
+    00:00:00,000 --> 00:01:00,000
+    道德經
+
+    """)
+
+    input_bytes = input_text.encode("big5")
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        input_path = op.join(temp_dir, "input.srt")
+        output_path = op.join(temp_dir, "output.srt")
+        with open(input_path, "wb") as fp:
+            fp.write(input_bytes)
+
+        # legacy behaviour
+        subs_legacy = SSAFile.load(input_path, encoding="big5", errors=None)
+        subs = SSAFile.load(input_path, encoding="big5")
+        assert subs.equals(subs_legacy)
+
+        subs.save(output_path, encoding="big5")
+
+        with open(output_path, "rb") as fp:
+            output_bytes = fp.read()
+
+        assert input_bytes == output_bytes
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -18,6 +18,14 @@
 
 """
 
+TEST_SRT_FILE_WIN1250 = """\
+1
+00:00:00,000 --> 00:01:00,000
+An example subtitle.
+Příliš žluťoučký kůň úpěl ďábelské ódy
+
+"""
+
 TEST_MICRODVD_FILE = """\
 {1}{1}1000.0
 {0}{60000}An example subtitle.
@@ -407,3 +415,22 @@ def test_empty_notty_input_doesnt_print_help(capsys: Any, monkeypatch: Any) -> N
             assert p.returncode == 1
             assert not p.stdout.startswith("usage: pysubs2")
             assert "FormatAutodetectionError" in p.stderr
+
+
+def test_win1250_passthrough_with_surrogateescape() -> None:
+    input_bytes_win1250 = TEST_SRT_FILE_WIN1250.encode("windows-1250")
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        input_path = op.join(temp_dir, "input.srt")
+        output_dir = op.join(temp_dir, "output")
+        output_path = op.join(output_dir, "input.srt")
+        with open(input_path, "wb") as fp:
+            fp.write(input_bytes_win1250)
+
+        cmd = ["python", "-m", "pysubs2", "-o", output_dir, input_path]
+        subprocess.check_call(cmd)
+
+        with open(output_path, "rb") as fp:
+            output_bytes = fp.read()
+
+        assert input_bytes_win1250 == output_bytes