-
Notifications
You must be signed in to change notification settings - Fork 0
/
fixer.py
154 lines (112 loc) · 4.24 KB
/
fixer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import argparse
import re
from collections import namedtuple
from pathlib import Path
# One subtitle cue: its sequence line, its timing line, and the list of its text lines.
SrtLine = namedtuple("SrtLine", "sequence timing text")
# Body of a regex character class (it is interpolated between "[" and "]") listing
# the punctuation, symbol and space characters that fix_line moves between line ends.
SPECIAL_CHARS_RE_SET = r'.,:;\'()\-?!+=*&$^%#@~`" \/'
class DecodeError(Exception):
    """Raised by get_file_lines when no attempted encoding yields usable text."""
def parse_args(args: list[str]) -> argparse.Namespace:
    """Parse the script's command-line arguments.

    Args:
        args: Argument strings, without the program name.

    Returns:
        A namespace whose ``file`` attribute holds the single positional argument.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("file")
    parsed = cli.parse_args(args)
    return parsed
def get_file_lines(file: str) -> list[str]:
    """Read a subtitle file and return its lines, guessing the text encoding.

    The file is decoded as UTF-8 first and as cp1255 second. A decoding is
    accepted only if the resulting text contains the Hebrew letter "י";
    otherwise the next encoding is tried.

    Args:
        file: Path of the file to read.

    Returns:
        The file content split into lines, without line terminators.

    Raises:
        DecodeError: If neither encoding produces text containing "י",
            including when the bytes are invalid in both encodings.
        OSError: If the file cannot be opened or read.
    """
    last_error = None
    for encoding in ("utf-8", "cp1255"):
        try:
            with open(file, "r", encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError as err:
            # Wrong guess: remember why and move on to the next encoding, so
            # that every decoding failure surfaces as DecodeError below.
            last_error = err
            continue
        if "י" in content:
            return content.splitlines()
    raise DecodeError(
        f"{file!r} could not be read as Hebrew text in UTF-8 or cp1255"
    ) from last_error
class SrtParser:
    """Line-driven state machine that groups SRT lines into SrtLine cues.

    Each cue is read as a sequence line, a timing line, and then text lines
    up to the next blank line. Parsed cues accumulate on the instance, so
    feeding more lines to the same parser appends to the earlier result.
    """

    STATE_SEQ = 1     # waiting for a cue's sequence line
    STATE_TIMING = 2  # waiting for the cue's timing line
    STATE_TEXT = 3    # collecting text lines until a blank line

    def __init__(self):
        self._seq = ""
        self._timing = ""
        self._text = []
        self._state = SrtParser.STATE_SEQ
        self._lines = []

    def _handle_line(self, line: str) -> None:
        """Route one input line to the handler for the current state."""
        if self._state == SrtParser.STATE_SEQ:
            self._handle_seq(line)
        elif self._state == SrtParser.STATE_TIMING:
            self._handle_timing(line)
        elif self._state == SrtParser.STATE_TEXT:
            self._handle_text(line)

    def _handle_seq(self, line: str) -> None:
        """Store the sequence line and expect the timing line next."""
        # Extra blank lines between cues (or at the top of the file) are not
        # sequence numbers; consuming one here would shift every following
        # line of the cue into the wrong field.
        if not line.strip():
            return
        self._seq = line
        self._state = SrtParser.STATE_TIMING

    def _handle_timing(self, line: str) -> None:
        """Store the timing line and start collecting text."""
        self._timing = line
        self._state = SrtParser.STATE_TEXT

    def _handle_text(self, line: str) -> None:
        """Collect a text line, or close the current cue on a blank line."""
        if line.strip() == "":
            self._lines.append(SrtLine(self._seq, self._timing, self._text))
            # Start a fresh list so the stored cue keeps its own text.
            self._text = []
            self._state = SrtParser.STATE_SEQ
        else:
            self._text.append(line)

    def parse_lines(self, lines: list[str]) -> "list[SrtLine]":
        """Feed ``lines`` through the state machine.

        Args:
            lines: File lines without line terminators.

        Returns:
            A copy of all cues parsed by this instance so far.
        """
        for line in lines:
            self._handle_line(line)
        # Handle a missing blank line at the end of the file: the last cue is
        # still open, so close it as if a blank line had been read.
        if self._state == SrtParser.STATE_TEXT:
            self._handle_text("")
        return self._lines[:]

    @property
    def lines(self):
        """A copy of the cues parsed so far."""
        return self._lines[:]
class SrtWriter:
    """Write subtitle cues to a UTF-8 text file in SRT layout.

    Usable as a context manager; leaving the ``with`` block closes the file.
    """

    def __init__(self, file: str):
        # The destination is opened for writing as soon as the writer is built.
        self._file = open(file, "w", encoding="utf-8")

    def write_lines(self, subtitles: list[SrtLine]):
        """Write every cue: sequence, timing, then its text lines.

        A blank line is written between consecutive cues, but not after the
        last one.
        """
        final_index = len(subtitles) - 1
        for index, cue in enumerate(subtitles):
            self._file.write(cue.sequence + "\n")
            self._file.write(cue.timing + "\n")
            self._file.writelines(text_line + "\n" for text_line in cue.text)
            if index != final_index:
                self._file.write("\n")

    def close(self):
        """Close the underlying file."""
        self._file.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Always close; returning None lets any exception propagate.
        self.close()
def fix_line(line: str) -> str:
    """Swap the runs of special characters at the two ends of a line.

    The leading run of characters from SPECIAL_CHARS_RE_SET is moved to the
    end of the line and the trailing run to the start; the text between them
    is left untouched. A line made only of special characters (or an empty
    line) is returned unchanged.

    Args:
        line: One subtitle text line, without a line terminator.

    Returns:
        The line with its leading and trailing special-character runs swapped.
    """
    leading = re.search(f"^([{SPECIAL_CHARS_RE_SET}]*)", line)
    # The trailing run only counts when a non-special character precedes it.
    trailing = re.search(f"[^{SPECIAL_CHARS_RE_SET}]([{SPECIAL_CHARS_RE_SET}]*)$", line)

    prefix = ""
    if leading:
        prefix = leading.group(1)
    suffix = ""
    if trailing:
        suffix = trailing.group(1)

    # Ad-hoc fix for quotation lines: a trailing " -" becomes "- " once it is
    # moved to the front of the line.
    if suffix[-2:] == " -":
        suffix = suffix[:-2] + "- "

    core = line[len(prefix):len(line) - len(suffix)]
    return "".join((suffix, core, prefix))
def fix_subtitles(subtitles: list[SrtLine]) -> list[SrtLine]:
    """Return new cues whose text lines have each been passed through fix_line.

    Sequence and timing are carried over as-is; the input cues are not modified.
    """
    return [
        SrtLine(
            cue.sequence,
            cue.timing,
            [fix_line(text_line) for text_line in cue.text],
        )
        for cue in subtitles
    ]
def main(args: argparse.Namespace):
    """Fix the subtitle file named by ``args.file`` and write the result beside it.

    The file is read, parsed into cues, each text line is passed through
    fix_line, and the cues are written to the input path with its last
    extension replaced by ``.fix.srt`` (``movie.srt`` -> ``movie.fix.srt``).

    Args:
        args: Parsed command-line arguments; only ``file`` is read.
    """
    lines = get_file_lines(args.file)
    subtitles = SrtParser().parse_lines(lines)
    modified_subtitles = fix_subtitles(subtitles)
    # Replace only the real extension: slicing off a fixed number of
    # characters would mangle names with no extension or a longer one.
    output_path = Path(args.file).with_suffix(".fix.srt")
    with SrtWriter(str(output_path)) as srt_writer:
        srt_writer.write_lines(modified_subtitles)
if __name__ == "__main__":
    # Script entry point: parse the command line (minus the program name)
    # and run the fixer on the given file.
    import sys

    main(parse_args(sys.argv[1:]))