blob: fff8c3520fdabd6b1fd67c976ec82564ecef82b9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
from dataclasses import dataclass
from typing import Optional, overload, List, Union
import regex as re # type: ignore
@dataclass
class Token:
    """A lexical token: a ``type`` tag plus optional ``data`` text.

    Tokens built with only a type (``data`` left as ``None``) print as the
    quoted type; tokens that carry data print as ``type: 'data'``.
    """

    type: str
    data: Optional[str] = None

    def __repr__(self) -> str:
        """Return a compact, human-readable form of the token."""
        if self.data is None:
            return repr(self.type)
        return f"{self.type}: {self.data!r}"
class GenerousTokenList(List[Token]):
    """A token list that never raises ``IndexError`` on lookup.

    Indexing past either end yields a placeholder ``Token('')`` instead of
    raising, so callers can peek ahead without bounds checks.
    """

    def __getitem__(self, i):
        """Return the item at ``i``, or ``Token('')`` if ``i`` is out of range."""
        try:
            item = super().__getitem__(i)
        except IndexError:
            # Out-of-range access: hand back an empty-typed token.
            item = Token('')
        return item
# Lexer patterns.  Each one is applied with ``.match`` at the current scan
# position, so it must describe a token starting exactly there.  Regex
# alternation is ordered (first alternative that succeeds wins, not the
# longest), so longer alternatives must be listed before their prefixes or be
# protected by a lookahead.

# Reserved words.  The trailing negative lookahead rejects a keyword that is
# only the prefix of a longer identifier ("integer", "format") and forces
# backtracking from "do" to "double" when more identifier characters follow.
KEYWORD = re.compile(
    r"(?:bool|break|case|char|const|continue|default|do|double|else|enum"
    r"|extern|float|for|function|if|include|int|long|return|short|signed"
    r"|sizeof|struct|switch|typedef|unsigned|void|while)"
    r"(?![\p{L}\p{Pc}\p{Cf}\p{Sk}\p{Mn}\p{N}])"
)

# Identifiers: a letter-like start character followed by letter-like
# characters or digits.
IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Cf}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Cf}\p{Sk}\p{Mn}\p{N}]*")

# Numeric and character constants.  Prefixed forms (hex, binary) and floats
# come before the plain integer form; otherwise "[0-9_]+" would consume just
# the leading digits of "0x1F" or "1.5".  Inside the float group, the
# fraction-plus-exponent form precedes the bare fraction for the same reason.
CONSTANT = re.compile(
    r"(0[xX][0-9a-fA-F_]+)"
    r"|(0[bB][01_]+)"
    r"|([0-9_]+(\.[0-9_]+[eE][0-9_]+|\.[0-9_]+|[eE][0-9_]+))"
    r"|([0-9_]+)"
    r"""|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')"""
)

# Double-quoted string literals with the same escape set as character constants.
STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''')

# Operators and delimiters; multi-character operators are listed first.
PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]")

# Runs of space separators and control characters (includes \n, \r, \t).
WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+")

# "//" line comments and "/* ... */" block comments.  The line form stops
# before the newline (WHITESPACE consumes it), so a comment on the last line
# of input without a trailing newline is still recognized.
COMMENT = re.compile(r"(//[^\n]*)|(/\*.*?\*/)", re.DOTALL)
def scan(code):
    """Split ``code`` into a ``GenerousTokenList`` of ``Token`` objects.

    At each position the patterns are tried in a fixed priority order:
    comment, whitespace, keyword, identifier, constant, string literal,
    punctuator.  Comments and whitespace are discarded.  Keywords and
    punctuators become ``Token(text)``; identifiers, constants and string
    literals become ``Token(kind, text)``.

    Args:
        code: The source text to tokenize.

    Returns:
        A ``GenerousTokenList`` with the tokens in source order.

    Raises:
        ValueError: If no pattern matches at some position; the message
            shows up to 20 characters of the offending input.
    """
    skip = object()  # sentinel: matched text produces no token
    # (pattern, kind): kind None means the matched text itself is the type.
    rules = (
        (COMMENT, skip),
        (WHITESPACE, skip),
        (KEYWORD, None),
        (IDENTIFIER, 'identifier'),
        (CONSTANT, 'constant'),
        (STRING_LITERAL, 'string_literal'),
        (PUNCTUATOR, None),
    )

    tokens = []
    pos = 0
    end = len(code)
    while pos < end:
        for pattern, kind in rules:
            # Match in place at ``pos`` instead of re-slicing the remaining
            # text after every token, which copied the tail each time and
            # made scanning quadratic in the input length.
            match = pattern.match(code, pos)
            if match is None:
                continue
            if kind is None:
                tokens.append(Token(match.group()))
            elif kind is not skip:
                tokens.append(Token(kind, match.group()))
            pos = match.end()
            break
        else:
            # No rule matched at this position.
            raise ValueError("unrecognized code in scanner: {}".format(repr(code[pos:pos + 20])))
    return GenerousTokenList(tokens)
|