From 8538e1a9aec42b38dd5a3176adce8e4a5405a1b4 Mon Sep 17 00:00:00 2001 From: Alejandro Mendez Date: Thu, 9 Apr 2020 11:10:58 +0200 Subject: [PATCH] Fix parsing issue #29 --- docs/history.rst | 45 ++++++++++++++++++++++++-------------------- tests/test_webvtt.py | 18 ++++++++++++++++++ webvtt/__init__.py | 2 +- webvtt/parsers.py | 2 +- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index 3817c98..e5bad12 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -1,6 +1,11 @@ History ======= +0.4.5 (09-04-2020) +------------------ + +* Fix issue reading buffer + 0.4.4 (27-03-2020) ------------------ @@ -11,15 +16,15 @@ History * Added Python 3.8 support * Improve parsing empty lines -0.4.3 (22-11-2019) Few improvements ------------------------------------ +0.4.3 (22-11-2019) +------------------ * Parsing improvements, thanks to `@sontek `_ (#18) * Add support for reading content from a file-like object, thanks to `@omerholz `_ (#23) * Documentation fixes thanks to `@sontek `_ (#22) and `@netcmcc `_ (#24) -0.4.2 (08-06-2018) Rename of modules and usability improvements ---------------------------------------------------------------- +0.4.2 (08-06-2018) +------------------ * Renamed and reorganized few of the modules * Parsing methods are now class methods: read, from_srt and from_sbv @@ -30,13 +35,13 @@ import webvtt webvtt.read('captions.vtt') # this will return a WebVTT instance -0.4.1 (24-12-2017) Hot fix on cue identifiers ---------------------------------------------- +0.4.1 (24-12-2017) +------------------ * Support for saving cue identifiers -0.4.0 (18-09-2017) Refactor and parse compatibility ---------------------------------------------------- +0.4.0 (18-09-2017) +------------------ The main goal of this release is a refactor of the WebVTT parser to be able to parse easier and give support to new features of the format. @@ -55,8 +60,8 @@ Other: * Refactored WebVTT parser -0.3.3 (23-08-2017) Hot fix on cue tags --------------------------------------- +0.3.3 (23-08-2017) +------------------ The text for the caption is now returned clean (tags removed). The cue text could contain tags like: * timestamp tags: *<00:19.000>* @@ -66,20 +71,20 @@ The text for the caption is now returned clean (tags removed). The cue text coul Also a new attribute is available on captions to retrieve the text without cleaning tags: **raw_text** -0.3.2 (11-08-2017) Hot fix for compatibility --------------------------------------------- +0.3.2 (11-08-2017) +------------------ The goal of this release if to allow the WebVTT parser to be able to read caption files that contain metadata headers that extend to more than one line. -0.3.1 (08-08-2017) Compatibility updates ----------------------------------------- +0.3.1 (08-08-2017) +------------------ * Made hours in WebVTT parser optional as per specs. * Added support to parse WebVTT files that contain metadata headers. -0.3.0 (02-06-2016) YouTube SBV ------------------------------- +0.3.0 (02-06-2016) +------------------ New features: @@ -93,14 +98,14 @@ Other: * Added an exception for invalid timestamps in captions. * Added an exception when saving without a filename. -0.2.0 (23-05-2016) Module refactor ----------------------------------- +0.2.0 (23-05-2016) +------------------ * Refactor of the main module and parsers. -0.1.0 (20-05-2016) First release --------------------------------- +0.1.0 (20-05-2016) +------------------ This module is released with the following initial features: diff --git a/tests/test_webvtt.py b/tests/test_webvtt.py index e54a0b8..2c633b0 100644 --- a/tests/test_webvtt.py +++ b/tests/test_webvtt.py @@ -1,5 +1,6 @@ import os import io +import textwrap from shutil import rmtree, copy import webvtt @@ -237,6 +238,23 @@ def test_read_memory_buffer(self): vtt = webvtt.read_buffer(buffer) self.assertIsInstance(vtt.captions, list) + def test_read_memory_buffer_carriage_return(self): + """https://github.com/glut23/webvtt-py/issues/29""" + buffer = io.StringIO(textwrap.dedent('''\ + WEBVTT\r + \r + 00:00:00.500 --> 00:00:07.000\r + Caption text #1\r + \r + 00:00:07.000 --> 00:00:11.890\r + Caption text #2\r + \r + 00:00:11.890 --> 00:00:16.320\r + Caption text #3\r + ''')) + vtt = webvtt.read_buffer(buffer) + self.assertEqual(len(vtt.captions), 3) + def test_read_malformed_buffer(self): malformed_payloads = ['', 'MOCK MELFORMED CONTENT'] for payload in malformed_payloads: diff --git a/webvtt/__init__.py b/webvtt/__init__.py index ca1f767..e0a17a9 100644 --- a/webvtt/__init__.py +++ b/webvtt/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.4.4' +__version__ = '0.4.5' from .webvtt import * from .segmenter import * diff --git a/webvtt/parsers.py b/webvtt/parsers.py index 55ab4f7..3a978ca 100644 --- a/webvtt/parsers.py +++ b/webvtt/parsers.py @@ -51,7 +51,7 @@ def _read_file_encoding(self, file_path): def _read_content_lines(self, file_obj): - lines = [line.rstrip('\n') for line in file_obj.readlines()] + lines = [line.rstrip('\n\r') for line in file_obj.readlines()] if not lines: raise MalformedFileError('The file is empty.')