"""Objects for the lexing phase of the compiler. The lexing phase takes the entire contents of a raw input file and
generates a flat list of tokens present in that input file.
"""
from errors import CompilerError, Position, Range, error_collector
from token_kinds import symbol_kinds, keyword_kinds
from tokens import Token
from re import match
import token_kinds
STR_EX = []


class Tagged:
    """Class representing tagged characters.

    c (char) - the character that is tagged.
    p (Position) - position of the tagged character.
    r (Range) - a length-one range for the character.
    """

    def __init__(self, c, p):
        """Initialize the tagged character."""
        self.c = c
        self.p = p
        self.r = Range(p, p)


def tokenize(code, filename):
    """Convert the given code into a flat list of Tokens.

    code (str) - Full contents of the input file as a string.
    filename (str) - Name of the input file, used for error reporting.
    return - List of Token objects.
    """
    # Store tokens as they are generated
    tokens = []

    lines = split_to_tagged_lines(code, filename)
    join_extended_lines(lines)

    in_comment = False
    for line in lines:
        try:
            line_tokens, in_comment = tokenize_line(line, in_comment)
            tokens += line_tokens
        except CompilerError as e:
            error_collector.add(e)

    return tokens
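
# Example usage (illustrative sketch; the exact Token objects produced depend
# on this project's tokens and token_kinds modules, and errors are reported
# through error_collector rather than raised here):
#
#     source = "int main() { return 0; }"
#     toks = tokenize(source, "main.c")
#     # toks is a flat list of Token objects (keywords, identifiers, symbols,
#     # numbers) in source order.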


def split_to_tagged_lines(text, filename):
    """Split the input text into tagged lines.

    No newline escaping or other preprocessing is done by this function.

    text (str) - Input file contents as a string.
    filename (str) - Input file name.
    return - Tagged lines. List of list of Tagged objects, where each
    second-order list is a separate line in the input program. No newline
    characters.
    """
    lines = text.splitlines()
    tagged_lines = []
    for line_num, line in enumerate(lines):
        tagged_line = []
        for col, char in enumerate(line):
            p = Position(filename, line_num + 1, col + 1, line)
            tagged_line.append(Tagged(char, p))
        tagged_lines.append(tagged_line)

    return tagged_lines
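
# For example (sketch), splitting "ab\ncd" for file "t.c" yields two tagged
# lines, [Tagged('a', ...), Tagged('b', ...)] and [Tagged('c', ...),
# Tagged('d', ...)], where each Position records 1-based line and column
# numbers along with the full line text.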


def join_extended_lines(lines):
    """Join together any lines which end in an escaped newline.

    This function modifies the given lines object in place.

    lines - List of list of Tagged objects, where each embedded list is a
    separate line in the input program.
    """
    i = 0
    while i < len(lines):
        if lines[i] and lines[i][-1].c == "\\":
            # There is a next line to collapse into this one
            if i + 1 < len(lines):
                del lines[i][-1]  # remove trailing backslash
                lines[i] += lines[i + 1]  # concatenate with next line
                del lines[i + 1]  # remove next line

                # Decrement i, so this line is rechecked for a new trailing
                # backslash once it has been extended.
                i -= 1

            # There is no next line to collapse into this one
            else:
                del lines[i][-1]  # remove trailing backslash

        i += 1
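
# For example (sketch), the two physical lines
#
#     int x = 1 + \
#     2;
#
# are joined into the single logical line "int x = 1 + 2;" before tokenizing,
# and a lone trailing backslash on the final line is simply dropped.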


def tokenize_line(line, in_comment):
    """Tokenize the given single line.

    line - List of Tagged objects.
    in_comment - Whether the first character in this line is part of a C-style
    comment body.
    return - List of Token objects, and a boolean indicating whether the next
    character is part of a comment body.
    """
    tokens = []

    # line[block_start:block_end] is the section of the line currently being
    # considered for conversion into a token; this string will be called the
    # 'block'. Everything before the block has already been tokenized, and
    # everything after has not yet been examined.
    block_start = 0
    block_end = 0

    while block_end < len(line):
        symbol_kind = match_symbol_kind_at(line, block_end)
        next_ = match_symbol_kind_at(line, block_end + 1)

        if in_comment:
            # If next characters end the comment...
            if symbol_kind == token_kinds.star and next_ == token_kinds.slash:
                in_comment = False
                block_start = block_end + 2
                block_end = block_start
            # Otherwise, just skip one character.
            else:
                block_start = block_end + 1
                block_end = block_start

        # If next characters start a comment, process the previous block and
        # set in_comment to True.
        elif symbol_kind == token_kinds.slash and next_ == token_kinds.star:
            add_block(line[block_start:block_end], tokens)
            in_comment = True

        # If the next two characters are //, skip the rest of this line.
        elif symbol_kind == token_kinds.slash and next_ == token_kinds.slash:
            break

        # Skip spaces and process the previous block.
        elif line[block_end].c.isspace():
            add_block(line[block_start:block_end], tokens)
            block_start = block_end + 1
            block_end = block_start

        # If the next character is a quote, read the whole string as a token.
        elif symbol_kind in {token_kinds.dquote, token_kinds.squote}:
            if symbol_kind == token_kinds.dquote:
                global STR_EX
                STR_EX.append(1)
                quote_str = '"'
                kind = token_kinds.string
                add_null = True
            else:
                quote_str = "'"
                kind = token_kinds.char_string
                add_null = False

            chars, end = read_string(line, block_end + 1, quote_str, add_null)
            rep = block_to_str(line[block_end:end + 1])
            r = Range(line[block_end].p, line[end].p)

            if kind == token_kinds.char_string and len(chars) == 0:
                err = "empty character constant"
                error_collector.add(CompilerError(err, r))
            elif kind == token_kinds.char_string and len(chars) > 1:
                err = "multiple characters in character constant"
                error_collector.add(CompilerError(err, r))
            tokens.append(Token(kind, chars, rep, r=r))

            block_start = end + 1
            block_end = block_start

        # If the next character is another symbol, add the previous block and
        # then add the symbol.
        elif symbol_kind:
            symbol_start_index = block_end
            symbol_end_index = block_end + len(symbol_kind.text_repr) - 1

            r = Range(line[symbol_start_index].p, line[symbol_end_index].p)
            symbol_token = Token(symbol_kind, r=r)

            add_block(line[block_start:block_end], tokens)
            tokens.append(symbol_token)

            block_start = block_end + len(symbol_kind.text_repr)
            block_end = block_start

        # Include another character in the block.
        else:
            block_end += 1

    # Flush out anything that is left in the block to the output
    add_block(line[block_start:block_end], tokens)

    return tokens, in_comment
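
# For example (sketch), tokenizing the line "x = 1; /* start" with
# in_comment=False returns the tokens for 'x', '=', '1', and ';' together with
# in_comment=True, so subsequent lines are skipped until a closing '*/' is
# found. The exact Token contents depend on the project's token_kinds
# definitions.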


def block_to_str(block):
    """Convert the given block to a string.

    block - list of Tagged characters.
    return - string representation of the list of Tagged characters.
    """
    return "".join(c.c for c in block)


def match_symbol_kind_at(content, start):
    """Return the longest matching symbol token kind.

    content - List of Tagged objects in which to search for a match.
    start (int) - Index, inclusive, at which to start searching for a match.
    returns (TokenKind or None) - Symbol token kind found, or None if no match.
    """
    for symbol_kind in symbol_kinds:
        try:
            for i, c in enumerate(symbol_kind.text_repr):
                if content[start + i].c != c:
                    break
            else:
                return symbol_kind
        except IndexError:
            pass

    return None
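
# Note: the first kind in symbol_kinds that matches is returned, so this only
# yields the longest match if symbol_kinds is ordered with longer
# representations first (an assumption about the token_kinds module). For
# example (sketch), at the start of "+=1" this should return the '+=' kind
# rather than '+'.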


def read_string(line, start, delim, null):
    """Return the lexed string as a list of character values.

    Also returns the index of the string's closing quote.

    line[start] should be the first character after the opening quote of the
    string to be lexed. This function continues reading characters until an
    unescaped closing quote is reached. The index returned is that of the
    closing quote in `line`, not the length of the lexed string; the latter is
    the length of the returned list. The lexed string is a list of integers,
    where each integer is the character code (ASCII value) of the
    corresponding character in the string. The returned lexed string includes
    a null terminator when `null` is True.

    line - List of Tagged objects for each character in the line.
    start - Index at which to start reading the string.
    delim - Delimiter with which the string ends, like `"` or `'`
    null - Whether to add a null terminator to the returned character list
    """
    i = start
    chars = []

    escapes = {"'": 39,
               '"': 34,
               "?": 63,
               "\\": 92,
               "a": 7,
               "b": 8,
               "f": 12,
               "n": 10,
               "r": 13,
               "t": 9,
               "v": 11}

    octdigits = "01234567"
    hexdigits = "0123456789abcdefABCDEF"

    while True:
        if i >= len(line):
            descr = "missing terminating quote"
            raise CompilerError(descr, line[start - 1].r)
        elif line[i].c == delim:
            if null:
                chars.append(0)
            return chars, i
        elif i + 1 < len(line) and line[i].c == "\\" and line[i + 1].c in escapes:
            chars.append(escapes[line[i + 1].c])
            i += 2
        elif i + 1 < len(line) and line[i].c == "\\" and line[i + 1].c in octdigits:
            octal = line[i + 1].c
            i += 2
            while i < len(line) and len(octal) < 3 and line[i].c in octdigits:
                octal += line[i].c
                i += 1
            chars.append(int(octal, 8))
        elif i + 2 < len(line) and line[i].c == "\\" and line[i + 1].c == "x" and line[i + 2].c in hexdigits:
            hexa = line[i + 2].c
            i += 3
            while i < len(line) and line[i].c in hexdigits:
                hexa += line[i].c
                i += 1
            chars.append(int(hexa, 16))
        else:
            chars.append(ord(line[i].c))
            i += 1
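
# For example (sketch), with `line` holding the tagged characters of '"a\n"'
# (opening quote at index 0), read_string(line, 1, '"', True) returns
# ([97, 10, 0], 4): 'a' is 97, the escape '\n' maps to 10, the null terminator
# is appended because null is True, and 4 is the index of the closing quote.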


def add_block(block, tokens):
    """Convert block into a token if possible and add it to tokens.

    If the block is non-empty but cannot be made into a token, this function
    raises a compiler error. We don't need to check for symbol kind tokens
    here because they are converted before they are shifted into the block.

    block - block to convert into a token, as list of Tagged characters.
    tokens (List[Token]) - List of the tokens parsed so far.
    """
    if block:
        range_ = Range(block[0].p, block[-1].p)

        keyword_kind = match_keyword_kind(block)
        if keyword_kind:
            tokens.append(Token(keyword_kind, r=range_))
            return

        number_string = match_number_string(block)
        if number_string:
            tokens.append(Token(token_kinds.number, number_string, r=range_))
            return

        identifier_name = match_identifier_name(block)
        if identifier_name:
            tokens.append(Token(
                token_kinds.identifier, identifier_name, r=range_))
            return

        descr = f"unrecognized token at '{block_to_str(block)}'"
        raise CompilerError(descr, range_)


def isbinary(token):
    """Check whether the submitted token is a binary number literal.

    token (str) - the string to check for a binary representation.
    returns (int) - 1 if the string is a binary literal like '0b101', 0 if not.
    """
    return 1 if match(r'^0[bB][01]+$', token) else 0
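
# For example, isbinary("0b101") and isbinary("0B11") return 1, while
# isbinary("0b") and isbinary("123") return 0.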


def match_keyword_kind(token_repr):
    """Find the keyword token kind whose representation exactly matches token_repr.

    token_repr - Token representation to match exactly, as list of Tagged
    characters.
    returns (TokenKind, or None) - Keyword token kind that matched.
    """
    token_str = block_to_str(token_repr)
    for keyword_kind in keyword_kinds:
        if keyword_kind.text_repr == token_str:
            return keyword_kind
    return None


def match_number_string(token_repr):
    """Return a string that represents the given constant number.

    Binary literals (0b...) are converted to their decimal representation.

    token_repr - List of Tagged characters.
    returns (str, or None) - String representation of the number.
    """
    token_str = block_to_str(token_repr)
    if token_str.isdigit():
        return token_str
    elif isbinary(token_str):
        return str(int(token_str, 2))
    return None
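
# For example, a block spelling "42" produces "42", "0b101" produces "5", and
# "12ab" produces None (it is neither all digits nor a binary literal).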


def match_identifier_name(token_repr):
    """Return a string that represents the name of an identifier.

    token_repr - List of Tagged characters.
    returns (str, or None) - String name of the identifier.
    """
    token_str = block_to_str(token_repr)
    if match(r"[_a-zA-Z][_a-zA-Z0-9]*$", token_str):
        return token_str
    else:
        return None
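

# Minimal manual check (sketch): run this module directly to lex a small
# snippet. This assumes the project's errors, tokens, and token_kinds modules
# are importable; the exact output depends on how Token prints itself.
if __name__ == "__main__":
    demo_source = "int x = 0b101; // binary literal\nchar c = 'a';\n"
    for tok in tokenize(demo_source, "<demo>"):
        print(tok)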