Skip to content

Commit 009385f

Browse files
authored
Add WebVTT lexer (#707) (#1032)
Implements a WebVTT lexer (#707)
1 parent f3ff20b commit 009385f

17 files changed

+1906
-0
lines changed

lexers/embedded/webvtt.xml

+283
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,283 @@
1+
<lexer>
2+
<config>
  <!-- Human-readable lexer name. -->
  <name>WebVTT</name>
  <!-- Short alias under which the lexer can be selected. -->
  <alias>vtt</alias>
  <!-- File name glob associated with this lexer. -->
  <filename>*.vtt</filename>
  <!-- MIME type associated with this lexer. -->
  <mime_type>text/vtt</mime_type>
</config>
8+
<!--
9+
The WebVTT spec refers to a WebVTT line terminator as either CRLF, CR or LF.
10+
(https://www.w3.org/TR/webvtt1/#webvtt-line-terminator) However, with this
11+
definition it is unclear whether CRLF is one line terminator (CRLF) or two
12+
line terminators (CR and LF).
13+
14+
To work around this ambiguity, only CRLF and LF are considered as line terminators.
15+
To my knowledge, only classic Mac OS uses CR as a line terminator, so the lexer should
16+
still work for most files.
17+
-->
18+
<rules>
19+
<!-- https://www.w3.org/TR/webvtt1/#webvtt-file-body -->
20+
<state name="root">
  <!--
    File signature: "WEBVTT" at the very start of the input, optionally
    followed by a space or tab and free text on the same line, then at least
    two line terminators (CRLF or LF).
  -->
  <rule
    pattern="(\AWEBVTT)((?:[ \t][^\r\n]*)?(?:\r?\n){2,})">
    <bygroups>
      <token type="Keyword"/>
      <token type="Text"/>
    </bygroups>
  </rule>

  <!--
    A line consisting of "REGION" plus optional trailing spaces/tabs opens a
    region definition block; its settings are lexed in region-settings-list.
  -->
  <rule
    pattern="(^REGION)([ \t]*$)">
    <bygroups>
      <token type="Keyword"/>
      <token type="Text"/>
    </bygroups>
    <push state="region-settings-list"/>
  </rule>

  <!--
    A "STYLE" line followed by a body that runs (lazily) up to the first pair
    of line terminators. The body may not contain the cue timing arrow and is
    delegated to the CSS lexer.
  -->
  <rule
    pattern="(^STYLE)([ \t]*$)((?:(?!&#45;&#45;&gt;)[\s\S])*?)((?:\r?\n){2})">
    <bygroups>
      <token type="Keyword"/>
      <token type="Text"/>
      <using lexer="CSS"/>
      <token type="Text"/>
    </bygroups>
  </rule>

  <!-- NOTE blocks are allowed between header blocks. -->
  <rule>
    <include state="comment"/>
  </rule>

  <!--
    Zero-width lookahead: when a cue timing line (optionally preceded by one
    identifier line that does not contain the arrow) comes next, switch to
    the cues state without consuming any input.
  -->
  <rule
    pattern="(?=((?![^\r\n]*&#45;&#45;&gt;)[^\r\n]*\r?\n)?(\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3}[ \t]+&#45;&#45;&gt;[ \t]+(\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
    <push state="cues"/>
  </rule>
</state>
52+
53+
<!-- https://www.w3.org/TR/webvtt1/#webvtt-region-settings-list -->
54+
<state name="region-settings-list">
  <!--
    Separators between settings: spaces, tabs, or a single line terminator
    that is not immediately followed by another one.
  -->
  <rule
    pattern="(?: |\t|\r?\n(?!\r?\n))+">
    <token type="Text"/>
  </rule>

  <!-- Two consecutive line terminators end the region block. -->
  <rule
    pattern="(?:\r?\n){2}">
    <token type="Text"/>
    <pop depth="1"/>
  </rule>

  <!-- id:VALUE, where VALUE is a run of non-whitespace that does not start with the arrow. -->
  <rule
    pattern="(id)(:)(?!&#45;&#45;&gt;)(\S+)">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
    </bygroups>
  </rule>

  <!-- width:N%, a number with optional fraction followed by a percent sign. -->
  <rule
    pattern="(width)(:)((?:[1-9]?\d|100)(?:\.\d+)?)(%)">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
      <token type="KeywordType"/>
    </bygroups>
  </rule>

  <!-- lines:N, an unsigned integer. -->
  <rule
    pattern="(lines)(:)(\d+)">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
    </bygroups>
  </rule>

  <!-- regionanchor:X%,Y% and viewportanchor:X%,Y%, a comma-separated pair of percentages. -->
  <rule
    pattern="(regionanchor|viewportanchor)(:)((?:[1-9]?\d|100)(?:\.\d+)?)(%)(,)((?:[1-9]?\d|100)(?:\.\d+)?)(%)">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
      <token type="KeywordType"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
      <token type="KeywordType"/>
    </bygroups>
  </rule>

  <!-- scroll:up is the only scroll value recognised here. -->
  <rule
    pattern="(scroll)(:)(up)">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="KeywordConstant"/>
    </bygroups>
  </rule>
</state>
104+
105+
<!-- https://www.w3.org/TR/webvtt1/#webvtt-comment-block -->
106+
<state name="comment">
  <!--
    A comment block starts with "NOTE" at the beginning of a line, followed by
    a space, tab or line terminator. It extends (lazily) over any text that
    does not contain the cue timing arrow, up to two consecutive line
    terminators or the end of input. The whole block is one Comment token.
  -->
  <rule pattern="^NOTE( |\t|\r?\n)((?!&#45;&#45;&gt;)[\s\S])*?(?:(\r?\n){2}|\Z)">
    <token type="Comment"/>
  </rule>
</state>
112+
113+
<!--
114+
"Zero or more WebVTT cue blocks and WebVTT comment blocks separated from each other by one or more
115+
WebVTT line terminators." (https://www.w3.org/TR/webvtt1/#file-structure)
116+
-->
117+
<state name="cues">
  <!--
    Cue header. Capture groups, in order:
      1. optional cue identifier line (must not start with the arrow)
      2. the line terminator after the identifier
      3. start timestamp ([hh:]mm:ss.ttt)
      4. whitespace
      5. the cue timing arrow
      6. whitespace
      7. end timestamp
      8. trailing spaces/tabs
    The rest of the timing line is lexed in cue-settings-list.
  -->
  <rule
    pattern="(?:((?!&#45;&#45;&gt;)[^\r\n]+)?(\r?\n))?((?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})([ \t]+)(&#45;&#45;&gt;)([ \t]+)((?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})([ \t]*)">
    <bygroups>
      <token type="Name"/>
      <token type="Text"/>
      <token type="LiteralDate"/>
      <token type="Text"/>
      <token type="Operator"/>
      <token type="Text"/>
      <token type="LiteralDate"/>
      <token type="Text"/>
    </bygroups>
    <push state="cue-settings-list"/>
  </rule>

  <!-- NOTE blocks may appear between cues. -->
  <rule>
    <include state="comment"/>
  </rule>
</state>
136+
137+
<!-- https://www.w3.org/TR/webvtt1/#webvtt-cue-settings-list -->
138+
<state name="cue-settings-list">
  <!-- Spaces and tabs separating individual settings. -->
  <rule
    pattern="[ \t]+">
    <token type="Text"/>
  </rule>

  <!-- vertical[:rl|lr]; the colon and the value are each optional in the pattern. -->
  <rule
    pattern="(vertical)(:)?(rl|lr)?">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="KeywordConstant"/>
    </bygroups>
  </rule>

  <!--
    line[:VALUE[,ALIGN]] where VALUE is either a percentage (groups 3 and 4)
    or a signed integer (group 5), and ALIGN is start, center or end.
  -->
  <rule
    pattern="(line)(:)?(?:(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%)|(-?\d+))(?:(,)(start|center|end))?)?">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
      <token type="KeywordType"/>
      <token type="Literal"/>
      <token type="Punctuation"/>
      <token type="KeywordConstant"/>
    </bygroups>
  </rule>

  <!--
    position[:VALUE[,ALIGN]] with the same VALUE shape as "line"; ALIGN is
    line-left, center or line-right.
  -->
  <rule
    pattern="(position)(:)?(?:(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%)|(-?\d+))(?:(,)(line-left|center|line-right))?)?">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
      <token type="KeywordType"/>
      <token type="Literal"/>
      <token type="Punctuation"/>
      <token type="KeywordConstant"/>
    </bygroups>
  </rule>

  <!-- size[:N%], an optional percentage value. -->
  <rule
    pattern="(size)(:)?(?:((?:[1-9]?\d|100)(?:\.\d+)?)(%))?">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
      <token type="KeywordType"/>
    </bygroups>
  </rule>

  <!-- align[:start|center|end|left|right]. -->
  <rule
    pattern="(align)(:)?(start|center|end|left|right)?">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="KeywordConstant"/>
    </bygroups>
  </rule>

  <!--
    region[:ID]; the ID is a run of characters other than space, tab and line
    terminators, rejected by the lookahead when the remainder of the line
    contains the cue timing arrow.
  -->
  <rule
    pattern="(region)(:)?((?![^\r\n]*&#45;&#45;&gt;(?=[ \t]+?))[^ \t\r\n]+)?">
    <bygroups>
      <token type="Keyword"/>
      <token type="Punctuation"/>
      <token type="Literal"/>
    </bygroups>
  </rule>

  <!--
    Zero-width: the end of the timing line starts the cue payload. The line
    terminator itself is left for cue-payload to consume.
  -->
  <rule pattern="(?=\r?\n)">
    <push state="cue-payload"/>
  </rule>
</state>
200+
201+
<!-- https://www.w3.org/TR/webvtt1/#cue-payload -->
202+
<state name="cue-payload">
  <!--
    Two or more line terminators end the cue. Popping two levels leaves both
    cue-payload and cue-settings-list, returning to the cues state.
  -->
  <rule
    pattern="(\r?\n){2,}">
    <token type="Text"/>
    <pop depth="2"/>
  </rule>

  <!--
    Plain text. The quantifier is lazy, so each match consumes a single
    character and the blank-line rule above is re-checked at every position.
  -->
  <rule
    pattern="[^&lt;&amp;]+?">
    <token type="Text"/>
  </rule>

  <!-- Character references: decimal, hexadecimal or named, terminated by a semicolon. -->
  <rule
    pattern="&amp;(#\d+|#x[0-9A-Fa-f]+|[a-zA-Z0-9]+);">
    <token type="Text"/>
  </rule>

  <!-- Zero-width: an opening angle bracket hands over to cue-span-tag. -->
  <rule
    pattern="(?=&lt;)">
    <token type="Text"/>
    <push state="cue-span-tag"/>
  </rule>
</state>
218+
<state name="cue-span-tag">
  <!--
    Start tag: consume the opening angle bracket only when a known tag name
    (c, i, b, u, ruby, rt, v, lang) or a timestamp follows. The name itself
    is lexed in cue-span-start-tag-name.
  -->
  <rule pattern="&lt;(?=c|i|b|u|ruby|rt|v|lang|(?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
    <token type="Punctuation"/>
    <push state="cue-span-start-tag-name"/>
  </rule>

  <!-- End tag: the opening bracket with slash, then one of the known tag names. -->
  <rule pattern="(&lt;/)(c|i|b|u|ruby|rt|v|lang)">
    <bygroups>
      <token type="Punctuation"/>
      <token type="NameTag"/>
    </bygroups>
  </rule>

  <!-- The closing angle bracket finishes the tag and returns to the pushing state. -->
  <rule pattern="&gt;">
    <token type="Punctuation"/>
    <pop depth="1"/>
  </rule>
</state>
235+
<state name="cue-span-start-tag-name">
  <!--
    Tags whose start tag carries classes only: either one of the names
    c, i, b, u, ruby, rt (group 1) or a timestamp tag (group 2).
  -->
  <rule
    pattern="(c|i|b|u|ruby|rt)|((?:\d{2}:)?(?:[0-5][0-9]):(?:[0-5][0-9])\.\d{3})">
    <bygroups>
      <token type="NameTag"/>
      <token type="LiteralDate"/>
    </bygroups>
    <push state="cue-span-classes-without-annotations"/>
  </rule>

  <!-- v and lang tags: classes may be followed by an annotation. -->
  <rule
    pattern="v|lang">
    <token type="NameTag"/>
    <push state="cue-span-classes-with-annotations"/>
  </rule>
</state>
248+
<state name="cue-span-classes-without-annotations">
  <!-- Zero or more ".class" suffixes after the tag name. -->
  <rule>
    <include state="cue-span-classes"/>
  </rule>

  <!--
    Zero-width: at the closing angle bracket, unwind this state and
    cue-span-start-tag-name so that cue-span-tag consumes the bracket.
  -->
  <rule
    pattern="(?=&gt;)">
    <pop depth="2"/>
  </rule>
</state>
256+
<state name="cue-span-classes-with-annotations">
  <!-- Zero or more ".class" suffixes after a v or lang tag name. -->
  <rule>
    <include state="cue-span-classes"/>
  </rule>

  <!-- Zero-width: a space or tab introduces the annotation text. -->
  <rule pattern="(?=[ \t])">
    <push state="cue-span-start-tag-annotations"/>
  </rule>

  <!--
    Fallback for a v or lang start tag written without an annotation
    (for example a bare v tag, or one with classes only). Without this rule
    nothing in this state matches the closing angle bracket, so the lexer
    would keep emitting error tokens from inside this state. Mirroring
    cue-span-classes-without-annotations, unwind this state and
    cue-span-start-tag-name so that cue-span-tag consumes the bracket.
  -->
  <rule pattern="(?=&gt;)">
    <pop depth="2"/>
  </rule>
</state>
264+
<state name="cue-span-classes">
  <!--
    One class suffix: a dot followed by a run of characters that excludes
    space, tab, line terminators, ampersand, angle brackets and dot.
  -->
  <rule
    pattern="(\.)([^ \t\n\r&amp;&lt;&gt;\.]+)">
    <bygroups>
      <token type="Punctuation"/>
      <token type="NameTag"/>
    </bygroups>
  </rule>
</state>
272+
<state name="cue-span-start-tag-annotations">
  <!--
    Annotation text: a leading space or tab, then one or more items that are
    either a character reference (decimal, hexadecimal or named) or any
    character other than a line terminator, ampersand or closing angle bracket.
  -->
  <rule pattern="[ \t](?:[^\n\r&amp;&gt;]|&amp;(?:#\d+|#x[0-9A-Fa-f]+|[a-zA-Z0-9]+);)+">
    <token type="Text"/>
  </rule>

  <!--
    Zero-width: at the closing angle bracket, unwind this state,
    cue-span-classes-with-annotations and cue-span-start-tag-name, so that
    cue-span-tag consumes the bracket.
  -->
  <rule pattern="(?=&gt;)">
    <token type="Text"/>
    <pop depth="3"/>
  </rule>
</state>
282+
</rules>
283+
</lexer>

0 commit comments

Comments
 (0)