diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1e3a5d5..93d1a1a 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -30,7 +30,19 @@ Now that we have a real file on the harddrive, let's import pysubs2 and load it. >>> subs -.. tip:: By default, pysubs2 uses UTF-8 encoding when reading and writing files. Use the ``encoding`` keyword argument in case you need something else. +.. note:: + By default, pysubs2 uses UTF-8 encoding when reading and writing files, with ``surrogateescape`` error handling. + This works best if your file is either: + + * in UTF-8 encoding or + * in a similar ASCII-like encoding (like ``latin-1``) and you don't need to work with the text (only convert subtitle format, shift time, etc.). + + Use the ``encoding`` and ``errors`` keyword arguments in the :meth:`pysubs2.SSAFile.load()` and :meth:`pysubs2.SSAFile.save()` methods in case you need something else, + or you can do the processing yourself and work only with ``str`` using :meth:`pysubs2.SSAFile.from_string()` and :meth:`pysubs2.SSAFile.to_string()`. + + If you use the default settings, you can get the input ``bytes`` for a particular subtitle using: + + >>> subs[0].text.encode("utf-8", "surrogateescape") Now we have a subtitle file, the :class:`pysubs2.SSAFile` object. It has two "events", ie. subtitles. You can treat ``subs`` as a list: @@ -38,12 +50,20 @@ Now we have a subtitle file, the :class:`pysubs2.SSAFile` object. It has two "ev "Once upon a time," >>> for line in subs: ... print(line.text) - "Once upon a time," - "there was a SubRip file\\Nwith two subtitles." + Once upon a time, + there was a SubRip file\Nwith two subtitles. Individual subtitles are :class:`pysubs2.SSAEvent` objects and have the attributes you'd expect, like ``start``, ``end`` and ``text``. Notice that the second subtitle text doesn't contain a newline, but literal "backlash N", which is how SubStation represents newlines. There could also be override tags like ``{\i1}`` for italics. -.. 
tip:: If you don't entertain SubStation, there is also a :attr:`pysubs2.SSAEvent.plaintext` property which hides override tags and translates newlines for you. Be warned, however, that writing to this property throws away any override tags. +.. tip:: + If you don't entertain SubStation, there is also a :attr:`pysubs2.SSAEvent.plaintext` property which hides override tags + and translates newlines for you. Be warned, however, that writing to this property throws away any override tags. + + >>> for line in subs: + ... print(line.plaintext) + Once upon a time, + there was a SubRip file + with two subtitles. Working with timing ------------------- diff --git a/pysubs2/cli.py b/pysubs2/cli.py index 5dd6a6f..14dc676 100644 --- a/pysubs2/cli.py +++ b/pysubs2/cli.py @@ -3,7 +3,7 @@ import os import re import os.path as op -import io +from io import TextIOWrapper import sys from textwrap import dedent from typing import List @@ -81,6 +81,12 @@ def __init__(self) -> None: "If you wish to convert between encodings, make sure --input-enc is set correctly! " "Otherwise, your output files will probably be corrupted. It's a good idea to " "back up your files or use the -o option.") + parser.add_argument("--enc-error-handling", choices=("strict", "surrogateescape"), + default="surrogateescape", + help="Character encoding error handling for input and output. Defaults to 'surrogateescape' " + "which passes through unrecognized characters to output unchanged. Use 'strict' if " + "you want the command to fail when encountering a character incompatible with selected " + "input/output encoding.") parser.add_argument("--fps", metavar="FPS", type=positive_float, help="This argument specifies framerate for MicroDVD files. By default, framerate " "is detected from the file. 
Use this when framerate specification is missing " @@ -159,7 +165,7 @@ def main(self, argv: List[str]) -> int: print("Skipping", path, "(not a file)") errors += 1 else: - with open(path, encoding=args.input_enc) as infile: + with open(path, encoding=args.input_enc, errors=args.enc_error_handling) as infile: subs = SSAFile.from_file(infile, args.input_format, args.fps, **extra_input_args) self.process(subs, args) @@ -178,12 +184,12 @@ def main(self, argv: List[str]) -> int: _, filename = op.split(outpath) outpath = op.join(args.output_dir, filename) - with open(outpath, "w", encoding=args.output_enc) as outfile: + with open(outpath, "w", encoding=args.output_enc, errors=args.enc_error_handling) as outfile: subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean, **extra_output_args) elif not sys.stdin.isatty(): - infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc) - outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc) + infile = TextIOWrapper(sys.stdin.buffer, encoding=args.input_enc, errors=args.enc_error_handling) + outfile = TextIOWrapper(sys.stdout.buffer, encoding=args.output_enc, errors=args.enc_error_handling) subs = SSAFile.from_file(infile, args.input_format, args.fps) self.process(subs, args) diff --git a/pysubs2/ssafile.py b/pysubs2/ssafile.py index defdf22..7d91ae5 100644 --- a/pysubs2/ssafile.py +++ b/pysubs2/ssafile.py @@ -50,7 +50,7 @@ def __init__(self) -> None: @classmethod def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None, fps: Optional[float] = None, - **kwargs: Any) -> "SSAFile": + errors: Optional[str] = "surrogateescape", **kwargs: Any) -> "SSAFile": """ Load subtitle file from given path. @@ -65,6 +65,15 @@ def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None, path (str): Path to subtitle file. encoding (str): Character encoding of input file. Defaults to UTF-8, you may need to change this. 
+ errors (Optional[str]): Error handling for character encoding + of input file. Defaults to ``"surrogateescape"``. See documentation + of builtin ``open()`` function for more. + + .. versionchanged:: 2.0.0 + The ``errors`` parameter was introduced to facilitate + pass-through of subtitle files with unknown text encoding. + Previous versions of the library behaved as if ``errors=None``. + format_ (str): Optional, forces use of specific parser (eg. `"srt"`, `"ass"`). Otherwise, format is detected automatically from file contents. This argument should @@ -93,11 +102,11 @@ def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None, Example: >>> subs1 = pysubs2.load("subrip-subtitles.srt") - >>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976) - >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True) + >>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976) + >>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True) """ - with open(path, encoding=encoding) as fp: + with open(path, encoding=encoding, errors=errors) as fp: return cls.from_file(fp, format_, fps=fps, **kwargs) @classmethod @@ -181,7 +190,7 @@ def from_file(cls, fp: TextIO, format_: Optional[str] = None, fps: Optional[floa return subs def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None, fps: Optional[float] = None, - **kwargs: Any) -> None: + errors: Optional[str] = "surrogateescape", **kwargs: Any) -> None: """ Save subtitle file to given path. @@ -208,6 +217,15 @@ def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None different framerate, use this argument. See also :meth:`SSAFile.transform_framerate()` for fixing bad frame-based to time-based conversions. + errors (Optional[str]): Error handling for character encoding, + defaults to ``"surrogateescape"``. See documentation + of builtin ``open()`` function for more. + + .. 
versionchanged:: 2.0.0 + The ``errors`` parameter was introduced to facilitate + pass-through of subtitle files with unknown text encoding. + Previous versions of the library behaved as if ``errors=None``. + kwargs: Extra options for the writer. Raises: @@ -222,7 +240,7 @@ def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None ext = os.path.splitext(path)[1].lower() format_ = get_format_identifier(ext) - with open(path, "w", encoding=encoding) as fp: + with open(path, "w", encoding=encoding, errors=errors) as fp: self.to_file(fp, format_, fps=fps, **kwargs) def to_string(self, format_: str, fps: Optional[float] = None, **kwargs: Any) -> str: diff --git a/tests/formats/test_subrip.py b/tests/formats/test_subrip.py index 483d69a..74c2246 100644 --- a/tests/formats/test_subrip.py +++ b/tests/formats/test_subrip.py @@ -2,7 +2,8 @@ pysubs2.formats.subrip tests """ - +import os.path as op +import tempfile from textwrap import dedent import pytest @@ -302,3 +303,168 @@ def test_overflow_timestamp_write() -> None: text = ref.to_string("srt") subs = SSAFile.from_string(text) assert subs[0].end == MAX_REPRESENTABLE_TIME + + +def test_win1250_passthrough_with_surrogateescape() -> None: + input_text = dedent("""\ + 1 + 00:00:00,000 --> 00:01:00,000 + The quick brown fox jumps over the lazy dog + + 2 + 00:01:00,000 --> 00:02:00,000 + Příliš žluťoučký kůň úpěl ďábelské ódy + + """) + + input_bytes_win1250 = input_text.encode("windows-1250") + + with tempfile.TemporaryDirectory() as temp_dir: + input_path = op.join(temp_dir, "input.srt") + output_path = op.join(temp_dir, "output.srt") + with open(input_path, "wb") as fp: + fp.write(input_bytes_win1250) + + with pytest.raises(UnicodeDecodeError): + # legacy behaviour + SSAFile.load(input_path, errors=None) + + subs = SSAFile.load(input_path) + + assert subs[0].text == "The quick brown fox jumps over the lazy dog" + assert subs[1].text.startswith("P") and subs[1].text.endswith("dy") + + 
subs.save(output_path) + + with open(output_path, "rb") as fp: + output_bytes = fp.read() + + assert input_bytes_win1250 == output_bytes + + +def test_multiencoding_passthrough_with_surrogateescape() -> None: + input_text = dedent("""\ + 1 + 00:00:00,000 --> 00:01:00,000 + The quick brown fox jumps over the lazy dog""") + + input_bytes = input_text.encode("ascii") + input_bytes += b"\n" + "Příliš žluťoučký kůň úpěl ďábelské ódy".encode("windows-1250") + input_bytes += b"\n" + "Vamp quäkt: Grüß Felix bzw. Jody schön!".encode("utf-8") + input_bytes += b"\n" + "日本国".encode("shift-jis") + input_bytes += b"\n" + "道德經".encode("big5") + input_bytes += b"\n\n" + + with tempfile.TemporaryDirectory() as temp_dir: + input_path = op.join(temp_dir, "input.srt") + output_path = op.join(temp_dir, "output.srt") + with open(input_path, "wb") as fp: + fp.write(input_bytes) + + with pytest.raises(UnicodeDecodeError): + # legacy behaviour + SSAFile.load(input_path, errors=None) + + subs = SSAFile.load(input_path) + + assert subs[0].text.startswith("The quick brown fox jumps over the lazy dog") + assert "Felix bzw. Jody" in subs[0].text + + subs.save(output_path) + + with open(output_path, "rb") as fp: + output_bytes = fp.read() + + assert input_bytes == output_bytes + + +def test_utf8_read_write() -> None: + input_text = dedent("""\ + 1 + 00:00:00,000 --> 00:01:00,000 + The quick brown fox jumps over the lazy dog + Příliš žluťoučký kůň úpěl ďábelské ódy + Vamp quäkt: Grüß Felix bzw. Jody schön! 
+ 日本国 + 道德經 + + """) + + input_bytes = input_text.encode("utf-8") + + with tempfile.TemporaryDirectory() as temp_dir: + input_path = op.join(temp_dir, "input.srt") + output_path = op.join(temp_dir, "output.srt") + with open(input_path, "wb") as fp: + fp.write(input_bytes) + + # legacy behaviour + subs_legacy = SSAFile.load(input_path, errors=None) + subs = SSAFile.load(input_path) + assert subs.equals(subs_legacy) + + subs.save(output_path) + + with open(output_path, "rb") as fp: + output_bytes = fp.read() + + assert input_bytes == output_bytes + + +def test_win1250_read_write() -> None: + input_text = dedent("""\ + 1 + 00:00:00,000 --> 00:01:00,000 + The quick brown fox jumps over the lazy dog + Příliš žluťoučký kůň úpěl ďábelské ódy + + """) + + input_bytes = input_text.encode("windows-1250") + + with tempfile.TemporaryDirectory() as temp_dir: + input_path = op.join(temp_dir, "input.srt") + output_path = op.join(temp_dir, "output.srt") + with open(input_path, "wb") as fp: + fp.write(input_bytes) + + # legacy behaviour + subs_legacy = SSAFile.load(input_path, encoding="windows-1250", errors=None) + subs = SSAFile.load(input_path, encoding="windows-1250") + assert subs.equals(subs_legacy) + + subs.save(output_path, encoding="windows-1250") + + with open(output_path, "rb") as fp: + output_bytes = fp.read() + + assert input_bytes == output_bytes + + +def test_big5_read_write() -> None: + input_text = dedent("""\ + 1 + 00:00:00,000 --> 00:01:00,000 + 道德經 + + """) + + input_bytes = input_text.encode("big5") + + with tempfile.TemporaryDirectory() as temp_dir: + input_path = op.join(temp_dir, "input.srt") + output_path = op.join(temp_dir, "output.srt") + with open(input_path, "wb") as fp: + fp.write(input_bytes) + + # legacy behaviour + subs_legacy = SSAFile.load(input_path, encoding="big5", errors=None) + subs = SSAFile.load(input_path, encoding="big5") + assert subs.equals(subs_legacy) + + subs.save(output_path, encoding="big5") + + with open(output_path, "rb") as fp: 
+ output_bytes = fp.read() + + assert input_bytes == output_bytes diff --git a/tests/test_cli.py b/tests/test_cli.py index bc6d17e..b526f0b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -18,6 +18,14 @@ """ +TEST_SRT_FILE_WIN1250 = """\ +1 +00:00:00,000 --> 00:01:00,000 +An example subtitle. +Příliš žluťoučký kůň úpěl ďábelské ódy + +""" + TEST_MICRODVD_FILE = """\ {1}{1}1000.0 {0}{60000}An example subtitle. @@ -407,3 +415,22 @@ def test_empty_notty_input_doesnt_print_help(capsys: Any, monkeypatch: Any) -> N assert p.returncode == 1 assert not p.stdout.startswith("usage: pysubs2") assert "FormatAutodetectionError" in p.stderr + + +def test_win1250_passthrough_with_surrogateescape() -> None: + input_bytes_win1250 = TEST_SRT_FILE_WIN1250.encode("windows-1250") + + with tempfile.TemporaryDirectory() as temp_dir: + input_path = op.join(temp_dir, "input.srt") + output_dir = op.join(temp_dir, "output") + output_path = op.join(output_dir, "input.srt") + with open(input_path, "wb") as fp: + fp.write(input_bytes_win1250) + + cmd = ["python", "-m", "pysubs2", "-o", output_dir, input_path] + subprocess.check_call(cmd) + + with open(output_path, "rb") as fp: + output_bytes = fp.read() + + assert input_bytes_win1250 == output_bytes