From 338049020a17831e68b6a437bb038d8f10bfc45e Mon Sep 17 00:00:00 2001 From: Melody Horn Date: Wed, 4 Nov 2020 02:07:00 -0700 Subject: bring in updates to spec --- crowbar_reference_compiler/scanner.py | 60 +++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 17 deletions(-) (limited to 'crowbar_reference_compiler/scanner.py') diff --git a/crowbar_reference_compiler/scanner.py b/crowbar_reference_compiler/scanner.py index ea34536..1d846f3 100644 --- a/crowbar_reference_compiler/scanner.py +++ b/crowbar_reference_compiler/scanner.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional, overload, List, Union +from typing import Optional, List import regex as re # type: ignore @@ -24,12 +24,33 @@ class GenerousTokenList(List[Token]): return Token('') -KEYWORD = re.compile("bool|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|fragile|function|if|include|int|long|return|short|signed|sizeof|struct|switch|typedef|unsigned|void|while") +KEYWORD = re.compile(r""" + bool|break| + case|char|const|continue| + default|do| + else|enum| + false|float32|float64|for|fragile|function| + if|include|int8|int16|int32|int64|intmax|intsize| + opaque| + return| + sizeof|struct|switch| + true| + uint8|uint16|uint32|uint64|uintaddr|uintmax|uintsize|union| + void| + while""", re.VERBOSE) IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*") -CONSTANT = re.compile(r"""([0-9_]+)|(0[bB][01_]+)|0o[0-7_]+|(0[xX][0-9a-fA-F_]+)|([0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+))|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')""") -STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''') +DECIMAL_CONSTANT = re.compile(r"[0-9_]+") +BINARY_CONSTANT = re.compile(r"0[bB][01_]+") +OCTAL_CONSTANT = re.compile(r"0o[0-7_]+") +HEX_CONSTANT = re.compile(r"0[xX][0-9a-fA-F]+") +FLOAT_CONSTANT = re.compile(r"[0-9_]+\.[0-9_]+([eE][+-]?[0-9_]+)?") +HEX_FLOAT_CONSTANT = re.compile(r"0(fx|FX)[0-9a-fA-F_]+\.[0-9a-fA-F_]+[pP][+-]?[0-9_]+") + +_ESCAPE_SEQUENCE = r"""\\['"\\rnt0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}""" +CHAR_CONSTANT = re.compile(r"'([^'\\]|" + _ESCAPE_SEQUENCE + r")'") +STRING_LITERAL = re.compile(r'"([^"\\]|' + _ESCAPE_SEQUENCE + r')+"') PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]") -WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+") +WHITESPACE = re.compile(r"[\p{Z}\p{Cc}]+") COMMENT = re.compile(r"(//[^\n]*\n)|(/\*.*?\*/)", re.DOTALL) @@ -46,20 +67,25 @@ def scan(code): if match: remaining = remaining[match.end():] continue - match = KEYWORD.match(remaining) - if match: - result.append(Token(match.group())) - remaining = remaining[match.end():] + kw_match = KEYWORD.match(remaining) + id_match = IDENTIFIER.match(remaining) + if kw_match and ((not id_match) or len(kw_match.group()) == len(id_match.group())): + result.append(Token(kw_match.group())) + remaining = remaining[kw_match.end():] continue - match = IDENTIFIER.match(remaining) - if match: - result.append(Token('identifier', match.group())) - remaining = remaining[match.end():] + if id_match: + result.append(Token('identifier', id_match.group())) + remaining = remaining[id_match.end():] continue - match = CONSTANT.match(remaining) - if match: - result.append(Token('constant', match.group())) - remaining = remaining[match.end():] + was_constant = False + for constant in [DECIMAL_CONSTANT, BINARY_CONSTANT, OCTAL_CONSTANT, HEX_CONSTANT, FLOAT_CONSTANT, HEX_FLOAT_CONSTANT, CHAR_CONSTANT]: + match = constant.match(remaining) + if match: + result.append(Token('constant', match.group())) + remaining = remaining[match.end():] + was_constant = True + break + if was_constant: continue match = STRING_LITERAL.match(remaining) if match: -- cgit v1.2.3