1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
from dataclasses import dataclass
from typing import Optional, List
import regex as re # type: ignore
@dataclass
class Token:
type: str
data: Optional[str] = None
def __repr__(self) -> str:
if self.data is not None:
return "{}: {}".format(self.type, repr(self.data))
else:
return repr(self.type)
class GenerousTokenList(List[Token]):
def __getitem__(self, i):
try:
return super(GenerousTokenList, self).__getitem__(i)
except IndexError:
return Token('')
KEYWORD = re.compile(r"""
bool|break|
case|char|const|continue|
default|do|
else|enum|
false|float32|float64|for|fragile|function|
if|include|int8|int16|int32|int64|intmax|intsize|
opaque|
return|
sizeof|struct|switch|
true|
uint8|uint16|uint32|uint64|uintaddr|uintmax|uintsize|union|
void|
while""", re.VERBOSE)
IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*")
DECIMAL_CONSTANT = re.compile(r"[0-9_]+")
BINARY_CONSTANT = re.compile(r"0[bB][01_]+")
OCTAL_CONSTANT = re.compile(r"0o[0-7_]+")
HEX_CONSTANT = re.compile(r"0[xX][0-9a-fA-F]+")
FLOAT_CONSTANT = re.compile(r"[0-9_]+\.[0-9_]+([eE][+-]?[0-9_]+)?")
HEX_FLOAT_CONSTANT = re.compile(r"0(fx|FX)[0-9a-fA-F_]+\.[0-9a-fA-F_]+[pP][+-]?[0-9_]+")
_ESCAPE_SEQUENCE = r"""\\['"\\rnt0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}"""
CHAR_CONSTANT = re.compile(r"'([^'\\]|" + _ESCAPE_SEQUENCE + r")'")
STRING_LITERAL = re.compile(r'"([^"\\]|' + _ESCAPE_SEQUENCE + r')+"')
PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;:!&|^~><=]")
WHITESPACE = re.compile(r"[\p{Z}\p{Cc}]+")
COMMENT = re.compile(r"(//[^\n]*\n)|(/\*.*?\*/)", re.DOTALL)
def scan(code):
result = []
remaining = code
while len(remaining) > 0:
match = COMMENT.match(remaining)
if match:
remaining = remaining[match.end():]
continue
match = WHITESPACE.match(remaining)
if match:
remaining = remaining[match.end():]
continue
kw_match = KEYWORD.match(remaining)
id_match = IDENTIFIER.match(remaining)
if kw_match and ((not id_match) or len(kw_match.group()) == len(id_match.group())):
result.append(Token(kw_match.group()))
remaining = remaining[kw_match.end():]
continue
if id_match:
result.append(Token('identifier', id_match.group()))
remaining = remaining[id_match.end():]
continue
was_constant = False
for constant in [HEX_CONSTANT, BINARY_CONSTANT, OCTAL_CONSTANT, HEX_FLOAT_CONSTANT, FLOAT_CONSTANT, DECIMAL_CONSTANT, CHAR_CONSTANT]:
match = constant.match(remaining)
if match:
result.append(Token('constant', match.group()))
remaining = remaining[match.end():]
was_constant = True
break
if was_constant:
continue
match = STRING_LITERAL.match(remaining)
if match:
result.append(Token('string_literal', match.group()))
remaining = remaining[match.end():]
continue
match = PUNCTUATOR.match(remaining)
if match:
result.append(Token(match.group()))
remaining = remaining[match.end():]
continue
raise ValueError("unrecognized code in scanner: {}".format(repr(remaining[:20])))
return GenerousTokenList(result)
|