-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathpreprocessorParser.py
265 lines (206 loc) · 20.4 KB
/
preprocessorParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
""" preprocessor
<definition>
# Codes
LF : '
'
CR : '
'
EOL : LF / CR
TAB : " "
L_BRACKET : "["
R_BRACKET : "\]"
L_BRACE : "{" : drop
R_BRACE : "}" : drop
SPACE : " " : drop
SPACETAB : SPACE / TAB : drop
SPACETABEOL : SPACE / TAB / EOL : drop
PIPE : "|" : drop
BANG : "!" : drop
EQUAL : "=" : drop
LT : "<" : drop
GT : ">" : drop
HASH : "#" : drop
DASH : "-" : drop
AMP : "&" : drop
SEMICOLON : ";" : drop
TEMPLATE_BEGIN : L_BRACE{2} : drop
TEMPLATE_END : R_BRACE{2} : drop
PARAMETER_BEGIN : L_BRACE{3} : drop
PARAMETER_END : R_BRACE{3} : drop
# Predefined tags
NOWIKI_BEGIN : "<nowiki>"
NOWIKI_END : "</nowiki>"
PRE_BEGIN : "<pre>"
PRE_END : "</pre>"
special_tag : NOWIKI_BEGIN/NOWIKI_END/PRE_BEGIN/PRE_END
# Characters
any_char : [\x20..\xff] / '/'
esc_char : L_BRACKET/R_BRACKET/PIPE/L_BRACE/R_BRACE/LT/GT/AMP/SEMICOLON
raw_char : !esc_char any_char
raw_text : (raw_char / TAB)+ : join
# HTML comments
# HTML comments are totally ignored and do not appear in the final text
comment_content : ((!(DASH{2} GT) [\x20..\xff])+ / SPACETABEOL)*
html_comment : LT BANG DASH{2} comment_content DASH{2} GT : drop
# Text
page_name : raw_char+ : join
# Template parameters
# Those parameters should be substituted by their value when the current page is a template
# or by their optional default value in any case
parameter_id : raw_char+ : join
parameter_value : inline? : keep
optional_default_value : (PIPE SPACETABEOL* parameter_value)? SPACETABEOL* : liftNode
template_parameter : PARAMETER_BEGIN parameter_id optional_default_value PARAMETER_END : substitute_template_parameter
# Links
LINK_PIPE : PIPE : restore
internal_link : L_BRACKET{2} inline (LINK_PIPE inline)* R_BRACKET{2} : join
external_link : L_BRACKET inline (SPACE inline)* R_BRACKET : join
link : internal_link / external_link
# Templates
value_content : (inline / (!(SPACETABEOL* (TEMPLATE_END / PIPE)) (any_char / EOL)))* : keep
parameter_value : value_content SPACETABEOL*
optional_value : parameter_value?
parameter_equal : SPACETABEOL* EQUAL SPACETABEOL*
parameter_name : (!(esc_char/parameter_equal) raw_char)+ : join
named_parameter : parameter_name parameter_equal optional_value
standalone_parameter : value_content? : join
parameter : SPACETABEOL* PIPE SPACETABEOL* (named_parameter/standalone_parameter) : liftValue
parameters : parameter*
template : TEMPLATE_BEGIN page_name parameters SPACETABEOL* TEMPLATE_END : substitute_template
# inline allows to have templates/links inside templates/links
structure : link / template / template_parameter
inline : (structure / raw_text)+ : @
numbered_entity : AMP HASH [0..9]+ SEMICOLON : substitute_numbered_entity
named_entity : AMP [a..zA..Z]+ SEMICOLON : substitute_named_entity
entity : named_entity / numbered_entity
# Pre and nowiki tags
# Preformatted acts like nowiki (disables wikitext parsing)
# We allow any char without parsing them as long as the tag is not closed
pre_text : (!PRE_END any_char)* : join
preformatted : PRE_BEGIN pre_text PRE_END : liftValue
eol_to_space : EOL* : replace_by_space
nowiki_text : (!NOWIKI_END (any_char/eol_to_space))* : join
nowiki : NOWIKI_BEGIN nowiki_text NOWIKI_END : liftValue
# Text types
styled_text : template / template_parameter / entity
not_styled_text : html_comment / preformatted / nowiki
allowed_char : esc_char{1} : restore liftValue
allowed_text : raw_text / allowed_char
wikitext : (not_styled_text / styled_text / allowed_text / EOL)+ : join
"""
from pijnu.library import *
def make_parser(actions=None):
"""Return a parser.
The parser's toolset functions are (optionally) augmented (or overridden)
by a map of additional ones passed in.
"""
if actions is None:
actions = {}
# Start off with the imported pijnu library functions:
toolset = globals().copy()
parser = Parser()
state = parser.state
### title: preprocessor ###
def toolset_from_grammar():
"""Return a map of toolset functions hard-coded into the grammar."""
### <toolset>
def replace_by_space(node):
node.value = ' '
return locals().copy()
toolset.update(toolset_from_grammar())
toolset.update(actions)
### <definition>
# recursive pattern(s)
inline = Recursive(name='inline')
# Codes
LF = Char('\n', expression="'\n'", name='LF')
CR = Char('\n', expression="'\n'", name='CR')
EOL = Choice([LF, CR], expression='LF / CR', name='EOL')
TAB = Word('\t', expression='"\t"', name='TAB')
L_BRACKET = Word('[', expression='"["', name='L_BRACKET')
R_BRACKET = Word(']', expression='"\\]"', name='R_BRACKET')
L_BRACE = Word('{', expression='"{"', name='L_BRACE')(toolset['drop'])
R_BRACE = Word('}', expression='"}"', name='R_BRACE')(toolset['drop'])
SPACE = Word(' ', expression='" "', name='SPACE')(toolset['drop'])
SPACETAB = Choice([SPACE, TAB], expression='SPACE / TAB', name='SPACETAB')(toolset['drop'])
SPACETABEOL = Choice([SPACE, TAB, EOL], expression='SPACE / TAB / EOL', name='SPACETABEOL')(toolset['drop'])
PIPE = Word('|', expression='"|"', name='PIPE')(toolset['drop'])
BANG = Word('!', expression='"!"', name='BANG')(toolset['drop'])
EQUAL = Word('=', expression='"="', name='EQUAL')(toolset['drop'])
LT = Word('<', expression='"<"', name='LT')(toolset['drop'])
GT = Word('>', expression='">"', name='GT')(toolset['drop'])
HASH = Word('#', expression='"#"', name='HASH')(toolset['drop'])
DASH = Word('-', expression='"-"', name='DASH')(toolset['drop'])
AMP = Word('&', expression='"&"', name='AMP')(toolset['drop'])
SEMICOLON = Word(';', expression='";"', name='SEMICOLON')(toolset['drop'])
TEMPLATE_BEGIN = Repetition(L_BRACE, numMin=2, numMax=2, expression='L_BRACE{2}', name='TEMPLATE_BEGIN')(toolset['drop'])
TEMPLATE_END = Repetition(R_BRACE, numMin=2, numMax=2, expression='R_BRACE{2}', name='TEMPLATE_END')(toolset['drop'])
PARAMETER_BEGIN = Repetition(L_BRACE, numMin=3, numMax=3, expression='L_BRACE{3}', name='PARAMETER_BEGIN')(toolset['drop'])
PARAMETER_END = Repetition(R_BRACE, numMin=3, numMax=3, expression='R_BRACE{3}', name='PARAMETER_END')(toolset['drop'])
# Predefined tags
NOWIKI_BEGIN = Word('<nowiki>', expression='"<nowiki>"', name='NOWIKI_BEGIN')
NOWIKI_END = Word('</nowiki>', expression='"</nowiki>"', name='NOWIKI_END')
PRE_BEGIN = Word('<pre>', expression='"<pre>"', name='PRE_BEGIN')
PRE_END = Word('</pre>', expression='"</pre>"', name='PRE_END')
special_tag = Choice([NOWIKI_BEGIN, NOWIKI_END, PRE_BEGIN, PRE_END], expression='NOWIKI_BEGIN/NOWIKI_END/PRE_BEGIN/PRE_END', name='special_tag')
# Characters
any_char = Choice([Klass(u' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', expression='[\\x20..\\xff]'), Char('/', expression="'/'")], expression="[\\x20..\\xff] / '/'", name='any_char')
esc_char = Choice([L_BRACKET, R_BRACKET, PIPE, L_BRACE, R_BRACE, LT, GT, AMP, SEMICOLON], expression='L_BRACKET/R_BRACKET/PIPE/L_BRACE/R_BRACE/LT/GT/AMP/SEMICOLON', name='esc_char')
raw_char = Sequence([NextNot(esc_char, expression='!esc_char'), any_char], expression='!esc_char any_char', name='raw_char')
raw_text = Repetition(Choice([raw_char, TAB], expression='raw_char / TAB'), numMin=1, numMax=False, expression='(raw_char / TAB)+', name='raw_text')(toolset['join'])
# HTML comments
# HTML comments are totally ignored and do not appear in the final text
comment_content = Repetition(Choice([Repetition(Sequence([NextNot(Sequence([Repetition(DASH, numMin=2, numMax=2, expression='DASH{2}'), GT], expression='DASH{2} GT'), expression='!(DASH{2} GT)'), Klass(u' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff', expression='[\\x20..\\xff]')], expression='!(DASH{2} GT) [\\x20..\\xff]'), numMin=1, numMax=False, expression='(!(DASH{2} GT) [\\x20..\\xff])+'), SPACETABEOL], expression='(!(DASH{2} GT) [\\x20..\\xff])+ / SPACETABEOL'), numMin=False, numMax=False, expression='((!(DASH{2} GT) [\\x20..\\xff])+ / SPACETABEOL)*', name='comment_content')
html_comment = Sequence([LT, BANG, Repetition(DASH, numMin=2, numMax=2, expression='DASH{2}'), comment_content, Repetition(DASH, numMin=2, numMax=2, expression='DASH{2}'), GT], expression='LT BANG DASH{2} comment_content DASH{2} GT', name='html_comment')(toolset['drop'])
# Text
page_name = Repetition(raw_char, numMin=1, numMax=False, expression='raw_char+', name='page_name')(toolset['join'])
# Template parameters
# Those parameters should be substituted by their value when the current page is a template
# or by their optional default value in any case
parameter_id = Repetition(raw_char, numMin=1, numMax=False, expression='raw_char+', name='parameter_id')(toolset['join'])
parameter_value = Option(inline, expression='inline?', name='parameter_value')(toolset['keep'])
optional_default_value = Sequence([Option(Sequence([PIPE, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), parameter_value], expression='PIPE SPACETABEOL* parameter_value'), expression='(PIPE SPACETABEOL* parameter_value)?'), Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*')], expression='(PIPE SPACETABEOL* parameter_value)? SPACETABEOL*', name='optional_default_value')(toolset['liftNode'])
template_parameter = Sequence([PARAMETER_BEGIN, parameter_id, optional_default_value, PARAMETER_END], expression='PARAMETER_BEGIN parameter_id optional_default_value PARAMETER_END', name='template_parameter')(toolset['substitute_template_parameter'])
# Links
LINK_PIPE = Clone(PIPE, expression='PIPE', name='LINK_PIPE')(toolset['restore'])
internal_link = Sequence([Repetition(L_BRACKET, numMin=2, numMax=2, expression='L_BRACKET{2}'), inline, Repetition(Sequence([LINK_PIPE, inline], expression='LINK_PIPE inline'), numMin=False, numMax=False, expression='(LINK_PIPE inline)*'), Repetition(R_BRACKET, numMin=2, numMax=2, expression='R_BRACKET{2}')], expression='L_BRACKET{2} inline (LINK_PIPE inline)* R_BRACKET{2}', name='internal_link')(toolset['join'])
external_link = Sequence([L_BRACKET, inline, Repetition(Sequence([SPACE, inline], expression='SPACE inline'), numMin=False, numMax=False, expression='(SPACE inline)*'), R_BRACKET], expression='L_BRACKET inline (SPACE inline)* R_BRACKET', name='external_link')(toolset['join'])
link = Choice([internal_link, external_link], expression='internal_link / external_link', name='link')
# Templates
value_content = Repetition(Choice([inline, Sequence([NextNot(Sequence([Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), Choice([TEMPLATE_END, PIPE], expression='TEMPLATE_END / PIPE')], expression='SPACETABEOL* (TEMPLATE_END / PIPE)'), expression='!(SPACETABEOL* (TEMPLATE_END / PIPE))'), Choice([any_char, EOL], expression='any_char / EOL')], expression='!(SPACETABEOL* (TEMPLATE_END / PIPE)) (any_char / EOL)')], expression='inline / (!(SPACETABEOL* (TEMPLATE_END / PIPE)) (any_char / EOL))'), numMin=False, numMax=False, expression='(inline / (!(SPACETABEOL* (TEMPLATE_END / PIPE)) (any_char / EOL)))*', name='value_content')(toolset['keep'])
parameter_value = Sequence([value_content, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*')], expression='value_content SPACETABEOL*', name='parameter_value')
optional_value = Option(parameter_value, expression='parameter_value?', name='optional_value')
parameter_equal = Sequence([Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), EQUAL, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*')], expression='SPACETABEOL* EQUAL SPACETABEOL*', name='parameter_equal')
parameter_name = Repetition(Sequence([NextNot(Choice([esc_char, parameter_equal], expression='esc_char/parameter_equal'), expression='!(esc_char/parameter_equal)'), raw_char], expression='!(esc_char/parameter_equal) raw_char'), numMin=1, numMax=False, expression='(!(esc_char/parameter_equal) raw_char)+', name='parameter_name')(toolset['join'])
named_parameter = Sequence([parameter_name, parameter_equal, optional_value], expression='parameter_name parameter_equal optional_value', name='named_parameter')
standalone_parameter = Option(value_content, expression='value_content?', name='standalone_parameter')(toolset['join'])
parameter = Sequence([Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), PIPE, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), Choice([named_parameter, standalone_parameter], expression='named_parameter/standalone_parameter')], expression='SPACETABEOL* PIPE SPACETABEOL* (named_parameter/standalone_parameter)', name='parameter')(toolset['liftValue'])
parameters = Repetition(parameter, numMin=False, numMax=False, expression='parameter*', name='parameters')
template = Sequence([TEMPLATE_BEGIN, page_name, parameters, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), TEMPLATE_END], expression='TEMPLATE_BEGIN page_name parameters SPACETABEOL* TEMPLATE_END', name='template')(toolset['substitute_template'])
# inline allows to have templates/links inside templates/links
structure = Choice([link, template, template_parameter], expression='link / template / template_parameter', name='structure')
inline **= Repetition(Choice([structure, raw_text], expression='structure / raw_text'), numMin=1, numMax=False, expression='(structure / raw_text)+', name='inline')
numbered_entity = Sequence([AMP, HASH, Repetition(Klass(u'0123456789', expression='[0..9]'), numMin=1, numMax=False, expression='[0..9]+'), SEMICOLON], expression='AMP HASH [0..9]+ SEMICOLON', name='numbered_entity')(toolset['substitute_numbered_entity'])
named_entity = Sequence([AMP, Repetition(Klass(u'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', expression='[a..zA..Z]'), numMin=1, numMax=False, expression='[a..zA..Z]+'), SEMICOLON], expression='AMP [a..zA..Z]+ SEMICOLON', name='named_entity')(toolset['substitute_named_entity'])
entity = Choice([named_entity, numbered_entity], expression='named_entity / numbered_entity', name='entity')
# Pre and nowiki tags
# Preformatted acts like nowiki (disables wikitext parsing)
# We allow any char without parsing them as long as the tag is not closed
pre_text = Repetition(Sequence([NextNot(PRE_END, expression='!PRE_END'), any_char], expression='!PRE_END any_char'), numMin=False, numMax=False, expression='(!PRE_END any_char)*', name='pre_text')(toolset['join'])
preformatted = Sequence([PRE_BEGIN, pre_text, PRE_END], expression='PRE_BEGIN pre_text PRE_END', name='preformatted')(toolset['liftValue'])
eol_to_space = Repetition(EOL, numMin=False, numMax=False, expression='EOL*', name='eol_to_space')(toolset['replace_by_space'])
nowiki_text = Repetition(Sequence([NextNot(NOWIKI_END, expression='!NOWIKI_END'), Choice([any_char, eol_to_space], expression='any_char/eol_to_space')], expression='!NOWIKI_END (any_char/eol_to_space)'), numMin=False, numMax=False, expression='(!NOWIKI_END (any_char/eol_to_space))*', name='nowiki_text')(toolset['join'])
nowiki = Sequence([NOWIKI_BEGIN, nowiki_text, NOWIKI_END], expression='NOWIKI_BEGIN nowiki_text NOWIKI_END', name='nowiki')(toolset['liftValue'])
# Text types
styled_text = Choice([template, template_parameter, entity], expression='template / template_parameter / entity', name='styled_text')
not_styled_text = Choice([html_comment, preformatted, nowiki], expression='html_comment / preformatted / nowiki', name='not_styled_text')
allowed_char = Repetition(esc_char, numMin=1, numMax=1, expression='esc_char{1}', name='allowed_char')(toolset['restore'], toolset['liftValue'])
allowed_text = Choice([raw_text, allowed_char], expression='raw_text / allowed_char', name='allowed_text')
wikitext = Repetition(Choice([not_styled_text, styled_text, allowed_text, EOL], expression='not_styled_text / styled_text / allowed_text / EOL'), numMin=1, numMax=False, expression='(not_styled_text / styled_text / allowed_text / EOL)+', name='wikitext')(toolset['join'])
symbols = locals().copy()
symbols.update(actions)
parser._recordPatterns(symbols)
parser._setTopPattern("wikitext")
parser.grammarTitle = "preprocessor"
parser.filename = "preprocessorParser.py"
return parser