Skip to content

Commit

Permalink
feat!: Change default character error handling to "surrogateescape"
Browse files Browse the repository at this point in the history
This addresses long-standing ergonomic issue #43 when dealing with
files that have various or unknown character encoding. Previously,
the library assumed both input and output files should be UTF-8,
and it failed in case this was incorrect, forcing the user to provide
appropriate character encoding.

After this commit, UTF-8 is still the default input/output encoding,
but default error handling changed from "strict" to "surrogateescape",
i.e. bytes that are not valid UTF-8 will be read as lone Unicode surrogate
code points (surrogate escapes), which will be turned back into the original bytes on output.

To get the previous behaviour, use `SSAFile.load(..., errors=None)` and
`SSAFile.save(..., errors=None)`.

For text processing, you still should specify the encoding explicitly,
otherwise you will get surrogate escapes instead of non-ASCII characters
when inspecting the SSAFile.

Note that multi-byte encodings may still break the parser; parsing with
surrogate escapes will work best with ASCII-like encodings.
  • Loading branch information
tkarabela committed May 5, 2024
1 parent ed2257e commit 5c98e6c
Show file tree
Hide file tree
Showing 5 changed files with 253 additions and 16 deletions.
28 changes: 24 additions & 4 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,40 @@ Now that we have a real file on the harddrive, let's import pysubs2 and load it.
>>> subs
<SSAFile with 2 events and 1 styles, last timestamp 0:02:00>

.. tip:: By default, pysubs2 uses UTF-8 encoding when reading and writing files. Use the ``encoding`` keyword argument in case you need something else.
.. note::
By default, pysubs2 uses UTF-8 encoding when reading and writing files, with ``surrogateescape`` error handling.
This works best if your file is either:

* in UTF-8 encoding or
* in a similar ASCII-like encoding (like ``latin-1``) and you don't need to work with the text (only convert subtitle format, shift time, etc.).

Use the ``encoding`` and ``errors`` keyword arguments in the :meth:`pysubs2.SSAFile.load()` and :meth:`pysubs2.SSAFile.save()` methods in case you need something else,
or you can do the processing yourself and work only with ``str`` using :meth:`pysubs2.SSAFile.from_string()` and :meth:`pysubs2.SSAFile.to_string()`.

If you use the default settings, you can get the input ``bytes`` for a particular subtitle using:

>>> subs[0].text.encode("utf-8", "surrogateescape")

Now we have a subtitle file, the :class:`pysubs2.SSAFile` object. It has two "events", i.e. subtitles. You can treat ``subs`` as a list:

>>> subs[0].text
"Once upon a time,"
>>> for line in subs:
... print(line.text)
"Once upon a time,"
"there was a SubRip file\\Nwith two subtitles."
Once upon a time,
there was a SubRip file\\Nwith two subtitles.

Individual subtitles are :class:`pysubs2.SSAEvent` objects and have the attributes you'd expect, like ``start``, ``end`` and ``text``. Notice that the second subtitle text doesn't contain a newline, but literal "backslash N", which is how SubStation represents newlines. There could also be override tags like ``{\i1}`` for italics.

.. tip:: If you don't entertain SubStation, there is also a :attr:`pysubs2.SSAEvent.plaintext` property which hides override tags and translates newlines for you. Be warned, however, that writing to this property throws away any override tags.
.. tip::
If you don't entertain SubStation, there is also a :attr:`pysubs2.SSAEvent.plaintext` property which hides override tags
and translates newlines for you. Be warned, however, that writing to this property throws away any override tags.

>>> for line in subs:
... print(line.plaintext)
Once upon a time,
there was a SubRip file
with two subtitles.

Working with timing
-------------------
Expand Down
16 changes: 11 additions & 5 deletions pysubs2/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import re
import os.path as op
import io
from io import TextIOWrapper
import sys
from textwrap import dedent
from typing import List
Expand Down Expand Up @@ -81,6 +81,12 @@ def __init__(self) -> None:
"If you wish to convert between encodings, make sure --input-enc is set correctly! "
"Otherwise, your output files will probably be corrupted. It's a good idea to "
"back up your files or use the -o option.")
parser.add_argument("--enc-error-handling", choices=("strict", "surrogateescape"),
default="surrogateescape",
help="Character encoding error handling for input and output. Defaults to 'surrogateescape' "
"which passes through unrecognized characters to output unchanged. Use 'strict' if "
"you want the command to fail when encountering a character incompatible with selected "
"input/output encoding.")
parser.add_argument("--fps", metavar="FPS", type=positive_float,
help="This argument specifies framerate for MicroDVD files. By default, framerate "
"is detected from the file. Use this when framerate specification is missing "
Expand Down Expand Up @@ -159,7 +165,7 @@ def main(self, argv: List[str]) -> int:
print("Skipping", path, "(not a file)")
errors += 1
else:
with open(path, encoding=args.input_enc) as infile:
with open(path, encoding=args.input_enc, errors=args.enc_error_handling) as infile:
subs = SSAFile.from_file(infile, args.input_format, args.fps, **extra_input_args)

self.process(subs, args)
Expand All @@ -178,12 +184,12 @@ def main(self, argv: List[str]) -> int:
_, filename = op.split(outpath)
outpath = op.join(args.output_dir, filename)

with open(outpath, "w", encoding=args.output_enc) as outfile:
with open(outpath, "w", encoding=args.output_enc, errors=args.enc_error_handling) as outfile:
subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean,
**extra_output_args)
elif not sys.stdin.isatty():
infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
infile = TextIOWrapper(sys.stdin.buffer, encoding=args.input_enc, errors=args.enc_error_handling)
outfile = TextIOWrapper(sys.stdout.buffer, encoding=args.output_enc, errors=args.enc_error_handling)

subs = SSAFile.from_file(infile, args.input_format, args.fps)
self.process(subs, args)
Expand Down
30 changes: 24 additions & 6 deletions pysubs2/ssafile.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(self) -> None:

@classmethod
def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None, fps: Optional[float] = None,
**kwargs: Any) -> "SSAFile":
errors: Optional[str] = "surrogateescape", **kwargs: Any) -> "SSAFile":
"""
Load subtitle file from given path.
Expand All @@ -65,6 +65,15 @@ def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None,
path (str): Path to subtitle file.
encoding (str): Character encoding of input file.
Defaults to UTF-8, you may need to change this.
errors (Optional[str]): Error handling for character encoding
of input file. Defaults to ``"surrogateescape"``. See documentation
of builtin ``open()`` function for more.
.. versionchanged:: 2.0.0
The ``errors`` parameter was introduced to facilitate
pass-through of subtitle files with unknown text encoding.
Previous versions of the library behaved as if ``errors=None``.
format_ (str): Optional, forces use of specific parser
(eg. `"srt"`, `"ass"`). Otherwise, format is detected
automatically from file contents. This argument should
Expand Down Expand Up @@ -93,11 +102,11 @@ def load(cls, path: str, encoding: str = "utf-8", format_: Optional[str] = None,
Example:
>>> subs1 = pysubs2.load("subrip-subtitles.srt")
>>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976)
>>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True)
>>> subs2 = pysubs2.load("microdvd-subtitles.sub",fps=23.976)
>>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt",keep_unknown_html_tags=True)
"""
with open(path, encoding=encoding) as fp:
with open(path, encoding=encoding, errors=errors) as fp:
return cls.from_file(fp, format_, fps=fps, **kwargs)

@classmethod
Expand Down Expand Up @@ -181,7 +190,7 @@ def from_file(cls, fp: TextIO, format_: Optional[str] = None, fps: Optional[floa
return subs

def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None, fps: Optional[float] = None,
**kwargs: Any) -> None:
errors: Optional[str] = "surrogateescape", **kwargs: Any) -> None:
"""
Save subtitle file to given path.
Expand All @@ -208,6 +217,15 @@ def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None
different framerate, use this argument. See also
:meth:`SSAFile.transform_framerate()` for fixing bad
frame-based to time-based conversions.
errors (Optional[str]): Error handling for character encoding,
defaults to ``"surrogateescape"``. See documentation
of builtin ``open()`` function for more.
.. versionchanged:: 2.0.0
The ``errors`` parameter was introduced to facilitate
pass-through of subtitle files with unknown text encoding.
Previous versions of the library behaved as if ``errors=None``.
kwargs: Extra options for the writer.
Raises:
Expand All @@ -222,7 +240,7 @@ def save(self, path: str, encoding: str = "utf-8", format_: Optional[str] = None
ext = os.path.splitext(path)[1].lower()
format_ = get_format_identifier(ext)

with open(path, "w", encoding=encoding) as fp:
with open(path, "w", encoding=encoding, errors=errors) as fp:
self.to_file(fp, format_, fps=fps, **kwargs)

def to_string(self, format_: str, fps: Optional[float] = None, **kwargs: Any) -> str:
Expand Down
168 changes: 167 additions & 1 deletion tests/formats/test_subrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
pysubs2.formats.subrip tests
"""

import os.path as op
import tempfile
from textwrap import dedent
import pytest

Expand Down Expand Up @@ -302,3 +303,168 @@ def test_overflow_timestamp_write() -> None:
text = ref.to_string("srt")
subs = SSAFile.from_string(text)
assert subs[0].end == MAX_REPRESENTABLE_TIME


def test_win1250_passthrough_with_surrogateescape() -> None:
# Round-trip test: a windows-1250 encoded SRT file is loaded and saved with the
# default arguments (no encoding/errors given) and must come back byte-for-byte
# identical, even though its bytes are not valid UTF-8.
# NOTE(review): SRT cues are normally separated by blank lines; none are visible
# in this fixture as rendered here - confirm against the repository, since the
# final assertion compares exact bytes.
input_text = dedent("""\
1
00:00:00,000 --> 00:01:00,000
The quick brown fox jumps over the lazy dog
2
00:01:00,000 --> 00:02:00,000
Příliš žluťoučký kůň úpěl ďábelské ódy
""")

input_bytes_win1250 = input_text.encode("windows-1250")

with tempfile.TemporaryDirectory() as temp_dir:
input_path = op.join(temp_dir, "input.srt")
output_path = op.join(temp_dir, "output.srt")
# Write raw bytes so the on-disk encoding is exactly windows-1250.
with open(input_path, "wb") as fp:
fp.write(input_bytes_win1250)

with pytest.raises(UnicodeDecodeError):
# legacy behaviour: with errors=None the non-UTF-8 bytes make loading fail
SSAFile.load(input_path, errors=None)

# Default load is expected to succeed where the errors=None load above failed.
subs = SSAFile.load(input_path)

assert subs[0].text == "The quick brown fox jumps over the lazy dog"
# Only the ASCII start/end of the second cue are checked: the accented
# characters were not decoded as windows-1250, so they are not compared here.
assert subs[1].text.startswith("P") and subs[1].text.endswith("dy")

subs.save(output_path)

with open(output_path, "rb") as fp:
output_bytes = fp.read()

# The saved file must reproduce the original windows-1250 bytes exactly.
assert input_bytes_win1250 == output_bytes


def test_multiencoding_passthrough_with_surrogateescape() -> None:
# Round-trip test for a single file whose text lines use different encodings
# (ASCII, windows-1250, UTF-8, Shift-JIS, Big5): loading and saving with the
# default arguments must reproduce the input bytes exactly.
input_text = dedent("""\
1
00:00:00,000 --> 00:01:00,000
The quick brown fox jumps over the lazy dog""")

# Build the payload at the byte level so each line keeps its own encoding.
input_bytes = input_text.encode("ascii")
input_bytes += b"\n" + "Příliš žluťoučký kůň úpěl ďábelské ódy".encode("windows-1250")
input_bytes += b"\n" + "Vamp quäkt: Grüß Felix bzw. Jody schön!".encode("utf-8")
input_bytes += b"\n" + "日本国".encode("shift-jis")
input_bytes += b"\n" + "道德經".encode("big5")
# Trailing newline plus one empty line after the last text line.
input_bytes += b"\n\n"

with tempfile.TemporaryDirectory() as temp_dir:
input_path = op.join(temp_dir, "input.srt")
output_path = op.join(temp_dir, "output.srt")
with open(input_path, "wb") as fp:
fp.write(input_bytes)

with pytest.raises(UnicodeDecodeError):
# legacy behaviour: with errors=None the mixed-encoding bytes make loading fail
SSAFile.load(input_path, errors=None)

subs = SSAFile.load(input_path)

# Both checks look at subs[0]; only ASCII fragments of the text are compared.
assert subs[0].text.startswith("The quick brown fox jumps over the lazy dog")
assert "Felix bzw. Jody" in subs[0].text

subs.save(output_path)

with open(output_path, "rb") as fp:
output_bytes = fp.read()

# Every byte, whatever its source encoding, must survive the round trip.
assert input_bytes == output_bytes


def test_utf8_read_write() -> None:
# For valid UTF-8 input, loading with the default error handling must give the
# same SSAFile as loading with errors=None, and saving must reproduce the input bytes.
# NOTE(review): no blank line terminating the cue is visible in this fixture as
# rendered here - confirm against the repository, since the final assertion
# compares exact bytes.
input_text = dedent("""\
1
00:00:00,000 --> 00:01:00,000
The quick brown fox jumps over the lazy dog
Příliš žluťoučký kůň úpěl ďábelské ódy
Vamp quäkt: Grüß Felix bzw. Jody schön!
日本国
道德經
""")

input_bytes = input_text.encode("utf-8")

with tempfile.TemporaryDirectory() as temp_dir:
input_path = op.join(temp_dir, "input.srt")
output_path = op.join(temp_dir, "output.srt")
with open(input_path, "wb") as fp:
fp.write(input_bytes)

# legacy behaviour (errors=None) versus the new default; both must agree
subs_legacy = SSAFile.load(input_path, errors=None)
subs = SSAFile.load(input_path)
assert subs.equals(subs_legacy)

subs.save(output_path)

with open(output_path, "rb") as fp:
output_bytes = fp.read()

# Saving with defaults must write back the same UTF-8 bytes.
assert input_bytes == output_bytes


def test_win1250_read_write() -> None:
# With encoding="windows-1250" given explicitly, the default error handling must
# give the same SSAFile as errors=None, and saving with the same encoding must
# reproduce the input bytes.
# NOTE(review): no blank line terminating the cue is visible in this fixture as
# rendered here - confirm against the repository, since the final assertion
# compares exact bytes.
input_text = dedent("""\
1
00:00:00,000 --> 00:01:00,000
The quick brown fox jumps over the lazy dog
Příliš žluťoučký kůň úpěl ďábelské ódy
""")

input_bytes = input_text.encode("windows-1250")

with tempfile.TemporaryDirectory() as temp_dir:
input_path = op.join(temp_dir, "input.srt")
output_path = op.join(temp_dir, "output.srt")
with open(input_path, "wb") as fp:
fp.write(input_bytes)

# legacy behaviour (errors=None) versus the new default; both must agree
subs_legacy = SSAFile.load(input_path, encoding="windows-1250", errors=None)
subs = SSAFile.load(input_path, encoding="windows-1250")
assert subs.equals(subs_legacy)

subs.save(output_path, encoding="windows-1250")

with open(output_path, "rb") as fp:
output_bytes = fp.read()

# Output is compared as raw bytes against the windows-1250 input.
assert input_bytes == output_bytes


def test_big5_read_write() -> None:
# With encoding="big5" given explicitly, the default error handling must give
# the same SSAFile as errors=None, and saving with the same encoding must
# reproduce the input bytes.
# NOTE(review): no blank line terminating the cue is visible in this fixture as
# rendered here - confirm against the repository, since the final assertion
# compares exact bytes.
input_text = dedent("""\
1
00:00:00,000 --> 00:01:00,000
道德經
""")

input_bytes = input_text.encode("big5")

with tempfile.TemporaryDirectory() as temp_dir:
input_path = op.join(temp_dir, "input.srt")
output_path = op.join(temp_dir, "output.srt")
with open(input_path, "wb") as fp:
fp.write(input_bytes)

# legacy behaviour (errors=None) versus the new default; both must agree
subs_legacy = SSAFile.load(input_path, encoding="big5", errors=None)
subs = SSAFile.load(input_path, encoding="big5")
assert subs.equals(subs_legacy)

subs.save(output_path, encoding="big5")

with open(output_path, "rb") as fp:
output_bytes = fp.read()

# Output is compared as raw bytes against the Big5 input.
assert input_bytes == output_bytes
27 changes: 27 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@
"""

TEST_SRT_FILE_WIN1250 = """\
1
00:00:00,000 --> 00:01:00,000
An example subtitle.
Příliš žluťoučký kůň úpěl ďábelské ódy
"""

TEST_MICRODVD_FILE = """\
{1}{1}1000.0
{0}{60000}An example subtitle.
Expand Down Expand Up @@ -407,3 +415,22 @@ def test_empty_notty_input_doesnt_print_help(capsys: Any, monkeypatch: Any) -> N
assert p.returncode == 1
assert not p.stdout.startswith("usage: pysubs2")
assert "FormatAutodetectionError" in p.stderr


def test_win1250_passthrough_with_surrogateescape() -> None:
# CLI round-trip test: run `python -m pysubs2 -o <dir> <file>` on a windows-1250
# encoded SRT file without any encoding options and require the output file to
# be byte-for-byte identical to the input.
input_bytes_win1250 = TEST_SRT_FILE_WIN1250.encode("windows-1250")

with tempfile.TemporaryDirectory() as temp_dir:
input_path = op.join(temp_dir, "input.srt")
output_dir = op.join(temp_dir, "output")
# The output is read back from the output directory under the input's file name.
output_path = op.join(output_dir, "input.srt")
with open(input_path, "wb") as fp:
fp.write(input_bytes_win1250)

# NOTE(review): "python" is resolved via PATH; sys.executable would pin the
# interpreter running the tests - confirm which is intended.
cmd = ["python", "-m", "pysubs2", "-o", output_dir, input_path]
# check_call raises CalledProcessError if the command exits with a non-zero status.
subprocess.check_call(cmd)

with open(output_path, "rb") as fp:
output_bytes = fp.read()

assert input_bytes_win1250 == output_bytes

0 comments on commit 5c98e6c

Please sign in to comment.