# tokenizer.py
r"""Lexical analyzer for the "Tiny-Extended" programming language.
The recognized tokens are:
Keywords: main, int, real, boolean, if, then, else, while, repeat, until, cin, cout
Symbols: { } ; , ( ) := ++ -- < <= > >= == != + - * /
Id: [A-Za-z][A-Za-z0-9_]*
Number: [0-9]+(\.[0-9]+)?
Block comment: /\*.*?\*/
Inline comment: //[^\n]*
This module exports:
- Token class containing the lexeme, category and location of the found token.
- Flix class that converts an input file into a sequence of tokens.
"""
import re
import sys
from pathlib import Path


class Token:
    """Class containing the lexeme, category and location of a token."""

    __slots__ = ('lexeme', 'category', 'location')

    def __init__(self, lexeme, category, location):
        """A simple token instance.

        :param lexeme: The token's lexeme.
        :param category: The token's category.
        :param location: The token's location on the scanned source file.
        """
        self.lexeme = lexeme
        self.category = category
        self.location = location

    def __repr__(self):
        classname = self.__class__.__name__
        args = [f'{self.lexeme!r}', f'{self.category!r}', f'{self.location!r}']
        return f'{classname}({", ".join(args)})'
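# Tokens are plain value objects; the repr round-trips the constructor call,
# e.g. Token('while', 'KEYWORD', '3:5') prints as Token('while', 'KEYWORD', '3:5').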


class Flix:
    """Convert an input file into a sequence of tokens.

    Output can be customized by subclassing and redefining the following:

    Token category labels (cannot contain whitespace):
        - id_label
        - keyword_label
        - int_label
        - real_label
        - boolean_label
        - op_label
        - special_label
    Formatting separators:
        - tokens_separator
        - errors_separator
    Tab character length:
        - tab_length
    Invalid file error message:
        - invalid_file_msg
    Output directory:
        - output_dir

    Public methods:
        - scan          Read the input file one line at a time and yield the tokens and errors found.
        - write_output  Write tokens and errors to their respective specified files.
    """

    id_label = 'ID'
    keyword_label = 'KEYWORD'
    int_label = 'INT'
    real_label = 'REAL'
    boolean_label = 'BOOLEAN'
    op_label = 'OP'
    special_label = 'SPECIAL'
    tokens_separator = '.' * 5
    errors_separator = ' unexpected at '
    tab_length = 4
    invalid_file_msg = 'The input file does not exist or is empty.'
    output_dir = Path('Lexicon')
    _keywords = frozenset(
        {'main', 'int', 'real', 'boolean', 'if', 'then', 'else', 'while',
         'repeat', 'until', 'cin', 'cout', 'coutln', 'rompe'})
    # Non-greedy so that, inside a block comment, matching stops at the first
    # '*/' on the line rather than the last.
    _block_end = re.compile(r'[^\n]*?\*/')

    def __init__(self, input_file):
        """A Flix object used for scanning a file.

        :param input_file: Source code to be scanned.
        """
        self._input_file = Path(input_file)
        self._token_groups = [
            ('skip', r'\s+|//[^\n]*|/\*.*?\*/'),
            ('block_start', r'/\*[^\n]*'),
            (self.real_label, r'[0-9]+\.[0-9]+'),
            (self.boolean_label, r'True|False'),
            (self.int_label, r'[0-9]+'),
            (self.id_label, r'[A-Za-z][A-Za-z0-9_]*'),
            (self.special_label, r'[{};,()]'),
            (self.op_label, r'\+{1,2}|-{1,2}|(?:<|>)=?|(?::|=|!)=|[*/]'),
            ('error', r'.'),
        ]
        self._token_pattern = re.compile('|'.join(f'(?P<{name}>{pattern})' for name, pattern in self._token_groups))
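        # Ordering sketch: re alternation is first-match, not longest-match,
        # so REAL must be listed before INT; otherwise '3.14' would tokenize
        # as INT '3', error '.', INT '14'. Likewise 'skip' precedes
        # 'block_start' so a /* ... */ comment closed on the same line is
        # consumed whole instead of opening block-comment mode.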

    @staticmethod
    def _replace_tabs(string, tab_length):
        """Expand tabs to spaces so that column numbers stay accurate."""
        column = 0
        chars = []
        for c in string:
            if c == '\t':
                # Pad to the next tab stop (a full stop when already aligned)
                # and advance the column past the inserted spaces.
                n = tab_length - (column % tab_length)
                column += n
                chars.append(' ' * n)
            elif c == '\n':
                column = 0
                chars.append(c)
            else:
                column += 1
                chars.append(c)
        return ''.join(chars)
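    # Sketch with tab_length=4: '\tx' becomes '    x' and 'ab\tc' becomes
    # 'ab  c'; each tab pads out to the next multiple of tab_length.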

    def scan(self):
        """Read the input file one line at a time and yield the tokens and errors found."""
        if self.input_file.is_file() and self.input_file.stat().st_size > 0:
            line_num = 0
            block_comment_mode = False
            with self.input_file.open(encoding='utf-8') as source_code:
                for line in source_code:
                    if '\t' in line:
                        line = self._replace_tabs(line, self.tab_length)
                    line_num += 1
                    start_pos = 0
                    if block_comment_mode:
                        block_end = self._block_end.match(line)
                        if block_end:
                            block_comment_mode = False
                            start_pos = block_end.end()
                    if not block_comment_mode:
                        for m in self._token_pattern.finditer(line, start_pos):
                            category = m.lastgroup
                            lexeme = m.group(category)
                            if category == 'skip':
                                pass
                            elif category == 'block_start':
                                block_comment_mode = True
                            else:
                                if category == self.id_label and lexeme in self._keywords:
                                    category = self.keyword_label
                                column = m.start() + 1  # Column numbers start at 1
                                yield Token(lexeme, category, f'{line_num}:{column}')
        else:
            raise RuntimeError(self.invalid_file_msg)
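    # Sketch: scanning a line containing 'int x := 5;' yields KEYWORD 'int'
    # at column 1, ID 'x' at 5, OP ':=' at 7, INT '5' at 10 and SPECIAL ';'
    # at 11; the whitespace in between is consumed by the 'skip' group.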

    def write_output(self, tokens_filename='tokens.txt', errors_filename='errors.txt'):
        """Write tokens and errors to their respective specified files.

        :param tokens_filename: Name of the file to write the tokens into. Default: tokens.txt.
        :param errors_filename: Name of the file to write the errors into. Default: errors.txt.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)
        with (self.output_dir / tokens_filename).open('w', encoding='utf-8') as tokens_file, \
                (self.output_dir / errors_filename).open('w', encoding='utf-8') as errors_file:
            for tkn in self.scan():
                if tkn.category == 'error':
                    errors_file.write(f"'{tkn.lexeme}'{self.errors_separator}{tkn.location}\n")
                else:
                    tokens_file.write(f'{tkn.lexeme}{self.tokens_separator}{tkn.category}{self.tokens_separator}{tkn.location}\n')
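    # Output sketch: with the default separators, a token line reads
    # 'count.....ID.....3:5' and an error line reads "'@' unexpected at 2:7".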

    @property
    def input_file(self):
        """Source code to be scanned."""
        return self._input_file

    @input_file.setter
    def input_file(self, value):
        self._input_file = Path(value)

    def __repr__(self):
        classname = self.__class__.__name__
        args = [f'input_file={self.input_file!r}',
                f'id_label={self.id_label!r}',
                f'keyword_label={self.keyword_label!r}',
                f'int_label={self.int_label!r}',
                f'real_label={self.real_label!r}',
                f'boolean_label={self.boolean_label!r}',
                f'op_label={self.op_label!r}',
                f'special_label={self.special_label!r}',
                f'tokens_separator={self.tokens_separator!r}',
                f'errors_separator={self.errors_separator!r}',
                f'tab_length={self.tab_length!r}',
                f'invalid_file_msg={self.invalid_file_msg!r}',
                f'output_dir={self.output_dir!r}',
                f'_keywords={self._keywords!r}',
                f'_token_groups={self._token_groups!r}',
                f'_block_end={self._block_end!r}',
                f'_token_pattern={self._token_pattern!r}']
        return f'{classname}({", ".join(args)})'


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <input_file>')
    Flix(sys.argv[1]).write_output()
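# Example invocation (hypothetical input name): python tokenizer.py program.tny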