-
Notifications
You must be signed in to change notification settings - Fork 0
/
lex.py
217 lines (195 loc) · 7.3 KB
/
lex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# Author: Jackson szekeres
# Purpose: separate input into tokens
import sys
import enum
# Lexer object keeps track of current position in the source code and produces each token.
class Lexer:
    """Scan a source string and hand out its tokens one at a time.

    The lexer keeps a cursor (``curPos`` / ``curChar``) into ``source`` and
    advances it each time ``getToken`` is called. Spaces, tabs, carriage
    returns and comments are skipped; newlines are reported as NEWLINE tokens
    because they mark the end of a statement.

    Comment forms:
        ``# ...``    runs to the end of the line.
        ``| ... |``  runs to the next ``|`` and may span several lines.

    Any lexing error is reported through ``abort``, which exits the process.
    """

    # Single-character tokens, keyed by character and mapped to TokenType
    # member *names*. Names are resolved at call time because TokenType is
    # defined further down the module than this class.
    _SINGLE_CHAR_TOKENS = {
        '+': 'PLUS',
        '-': 'MINUS',
        '*': 'ASTERISK',
        '/': 'SLASH',
        '%': 'MOD',
        '(': 'LEFTPARENTHESIS',
        ')': 'RIGHTPARENTHESIS',
    }

    # Operators that turn into a two-character token when followed by '='.
    # Value is (name without '=', name with '=').
    _EQ_SUFFIXED_TOKENS = {
        '=': ('EQ', 'EQEQ'),
        '>': ('GT', 'GTEQ'),
        '<': ('LT', 'LTEQ'),
    }

    def __init__(self, input):
        """Prepare to lex ``input`` (a string) and load its first character.

        The parameter name shadows the ``input`` builtin; it is kept so that
        existing keyword-argument callers continue to work.
        """
        # Append a newline to simplify lexing/parsing the last token/statement.
        self.source = input + '\n'
        self.curChar = ''   # Current character in the string.
        self.curPos = -1    # Current position in the string.
        self.nextChar()

    def nextChar(self):
        """Advance the cursor by one; ``curChar`` becomes '\\0' past the end."""
        self.curPos += 1
        if self.curPos >= len(self.source):
            self.curChar = '\0'  # EOF
        else:
            self.curChar = self.source[self.curPos]

    def peek(self):
        """Return the character after the cursor without advancing ('\\0' at end)."""
        if self.curPos + 1 >= len(self.source):
            return '\0'
        return self.source[self.curPos + 1]

    def abort(self, message):
        """Report an invalid token and exit the process via ``sys.exit``."""
        sys.exit("Lexing error. " + message)

    def getToken(self):
        """Return the next Token from the source.

        Leading whitespace and comments are skipped first. The first
        character of the token decides its kind; multi-character operators,
        strings, numbers, identifiers and keywords then consume the rest.
        At the end of the input an EOF token is returned. Invalid input is
        reported through ``abort``.
        """
        self._skipIgnored()
        token = None
        ch = self.curChar
        # NOTE: boolean literals are not recognised here; no branch below
        # produces a BOOL token.
        if ch in self._SINGLE_CHAR_TOKENS:
            token = Token(ch, TokenType[self._SINGLE_CHAR_TOKENS[ch]])
        elif ch in self._EQ_SUFFIXED_TOKENS:
            plainName, withEqName = self._EQ_SUFFIXED_TOKENS[ch]
            if self.peek() == '=':
                self.nextChar()
                token = Token(ch + '=', TokenType[withEqName])
            else:
                token = Token(ch, TokenType[plainName])
        elif ch == '!':
            # '!' is only valid as the first half of '!='.
            if self.peek() == '=':
                self.nextChar()
                token = Token('!=', TokenType.NOTEQ)
            else:
                self.abort("Expected !=, got !" + self.peek())
        elif ch == '"':
            token = self._lexString()
        elif ch.isdigit():
            token = self._lexNumber()
        elif ch.isalpha():
            token = self._lexWord()
        elif ch == '\n':
            token = Token('\n', TokenType.NEWLINE)
        elif ch == '\0':
            token = Token('', TokenType.EOF)
        else:
            self.abort("Unknown token: " + ch)
        # Every branch leaves the cursor on the token's last character.
        self.nextChar()
        return token

    def _skipIgnored(self):
        """Skip every run of whitespace and comments before the next token."""
        while True:
            before = self.curPos
            self.skipWhitespace()
            self.skipMultilineComment()
            self.skipComment()
            if self.curPos == before:
                # Nothing was consumed, so the cursor is on a real token.
                return

    def _lexString(self):
        """Lex a double-quoted string; the cursor ends on the closing quote."""
        self.nextChar()  # Step past the opening quote.
        startPos = self.curPos
        while self.curChar != '"':
            if self.curPos >= len(self.source):
                # Without this check the loop would spin forever at EOF.
                self.abort("Unterminated string.")
            self.nextChar()
        return Token(self.source[startPos:self.curPos], TokenType.STRING)

    def _lexNumber(self):
        """Lex digits with an optional fractional part as a FLOAT token."""
        startPos = self.curPos
        while self.peek().isdigit():
            self.nextChar()
        if self.peek() == '.':  # Decimal!
            self.nextChar()
            # Must have at least one digit after the decimal point.
            if not self.peek().isdigit():
                self.abort("Illegal character in number.")
            while self.peek().isdigit():
                self.nextChar()
        return Token(self.source[startPos:self.curPos + 1], TokenType.FLOAT)

    def _lexWord(self):
        """Lex an alphanumeric word as a keyword token or an IDENT token."""
        startPos = self.curPos
        while self.peek().isalnum():
            self.nextChar()
        tokText = self.source[startPos:self.curPos + 1]
        keyword = Token.checkIfKeyword(tokText)
        if keyword is None:
            return Token(tokText, TokenType.IDENT)
        return Token(tokText, keyword)

    def skipWhitespace(self):
        """Skip spaces, tabs and carriage returns, but not newlines."""
        while self.curChar == ' ' or self.curChar == '\t' or self.curChar == '\r':
            self.nextChar()

    def skipComment(self):
        """Skip one comment starting at the cursor, if there is one.

        A ``#`` comment is consumed up to, but not including, the newline
        that ends it. A ``| ... |`` comment is consumed including its
        closing bar; if the closing bar is missing, ``abort`` is called.
        """
        if self.curChar == '#':
            while self.curChar != '\n' and self.curPos < len(self.source):
                self.nextChar()
        elif self.curChar == '|':
            self.nextChar()  # Consume the opening bar.
            while self.curChar != '|':
                if self.curPos >= len(self.source):
                    self.abort("Unterminated comment.")
                self.nextChar()
            self.nextChar()  # Consume the closing bar.

    def skipMultilineComment(self):
        """Placeholder that currently skips nothing."""
        pass
# Token contains the original text and the type of token.
class Token:
    """A lexed unit: the matched source text plus its token classification."""

    def __init__(self, tokenText, tokenKind):
        """Store the token's text and kind unchanged."""
        # Raw text of the token; used for identifiers, strings and numbers.
        self.text = tokenText
        # The TokenType this token is classified as.
        self.kind = tokenKind

    @staticmethod
    def checkIfKeyword(tokenText):
        """Return the keyword TokenType named exactly ``tokenText``, else None.

        A member counts as a keyword when its enum value lies in the range
        100-199, so this relies on every keyword being numbered 1XX.
        """
        matches = (
            member
            for member in TokenType
            if member.name == tokenText and 100 <= member.value < 200
        )
        return next(matches, None)
# TokenType is our enum for all the types of tokens.
class TokenType(enum.Enum):
    """Every kind of token, grouped into numeric bands.

    Values below 100 are structural and literal kinds, 101-199 are keywords
    and 201 upwards are operators. Token.checkIfKeyword depends on the
    keyword band: it treats a member as a keyword only when its value is in
    100-199 and its name equals the source word exactly.
    """

    # --- Structural and literal kinds ---
    EOF              = -1
    NEWLINE          = 0
    FLOAT            = 1
    TUPLE            = 2
    IDENT            = 3
    STRING           = 4
    BOOL             = 5

    # --- Keywords (values must stay within 100-199) ---
    LABEL            = 101
    GOTO             = 102
    PRINT            = 103
    INPUT            = 104
    LET              = 105
    IF               = 106
    THEN             = 107
    ENDIF            = 108
    REPEAT           = 109
    ENDREPEAT        = 110
    WAIT             = 111
    CCODE            = 112
    RAISE            = 113
    EXPORT           = 114

    # --- Operators and punctuation ---
    EQ               = 201
    PLUS             = 202
    MINUS            = 203
    ASTERISK         = 204
    SLASH            = 205
    MOD              = 206
    EQEQ             = 207
    NOTEQ            = 208
    LT               = 209
    LTEQ             = 210
    GT               = 211
    GTEQ             = 212
    LEFTPARENTHESIS  = 213
    RIGHTPARENTHESIS = 214