about | summary | refs | log | tree | commit | diff
path: root/crowbar_reference_compiler/scanner.py
diff options
context:
space:
mode:
Diffstat (limited to 'crowbar_reference_compiler/scanner.py')
-rw-r--r-- crowbar_reference_compiler/scanner.py | 60
1 files changed, 43 insertions, 17 deletions
diff --git a/crowbar_reference_compiler/scanner.py b/crowbar_reference_compiler/scanner.py
index ea34536..1d846f3 100644
--- a/crowbar_reference_compiler/scanner.py
+++ b/crowbar_reference_compiler/scanner.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
-from typing import Optional, overload, List, Union
+from typing import Optional, List
import regex as re # type: ignore
@@ -24,12 +24,33 @@ class GenerousTokenList(List[Token]):
return Token('')
-KEYWORD = re.compile("bool|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|fragile|function|if|include|int|long|return|short|signed|sizeof|struct|switch|typedef|unsigned|void|while")
+KEYWORD = re.compile(r"""
+ bool|break|
+ case|char|const|continue|
+ default|do|
+ else|enum|
+ false|float32|float64|for|fragile|function|
+ if|include|int8|int16|int32|int64|intmax|intsize|
+ opaque|
+ return|
+ sizeof|struct|switch|
+ true|
+ uint8|uint16|uint32|uint64|uintaddr|uintmax|uintsize|union|
+ void|
+ while""", re.VERBOSE)
IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*")
-CONSTANT = re.compile(r"""([0-9_]+)|(0[bB][01_]+)|0o[0-7_]+|(0[xX][0-9a-fA-F_]+)|([0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+))|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')""")
-STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''')
+DECIMAL_CONSTANT = re.compile(r"[0-9_]+")
+BINARY_CONSTANT = re.compile(r"0[bB][01_]+")
+OCTAL_CONSTANT = re.compile(r"0o[0-7_]+")
+HEX_CONSTANT = re.compile(r"0[xX][0-9a-fA-F]+")
+FLOAT_CONSTANT = re.compile(r"[0-9_]+\.[0-9_]+([eE][+-]?[0-9_]+)?")
+HEX_FLOAT_CONSTANT = re.compile(r"0(fx|FX)[0-9a-fA-F_]+\.[0-9a-fA-F_]+[pP][+-]?[0-9_]+")
+
+_ESCAPE_SEQUENCE = r"""\\['"\\rnt0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}"""
+CHAR_CONSTANT = re.compile(r"'([^'\\]|" + _ESCAPE_SEQUENCE + r")'")
+STRING_LITERAL = re.compile(r'"([^"\\]|' + _ESCAPE_SEQUENCE + r')+"')
PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]")
-WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+")
+WHITESPACE = re.compile(r"[\p{Z}\p{Cc}]+")
COMMENT = re.compile(r"(//[^\n]*\n)|(/\*.*?\*/)", re.DOTALL)
@@ -46,20 +67,25 @@ def scan(code):
if match:
remaining = remaining[match.end():]
continue
- match = KEYWORD.match(remaining)
- if match:
- result.append(Token(match.group()))
- remaining = remaining[match.end():]
+ kw_match = KEYWORD.match(remaining)
+ id_match = IDENTIFIER.match(remaining)
+ if kw_match and ((not id_match) or len(kw_match.group()) == len(id_match.group())):
+ result.append(Token(kw_match.group()))
+ remaining = remaining[kw_match.end():]
continue
- match = IDENTIFIER.match(remaining)
- if match:
- result.append(Token('identifier', match.group()))
- remaining = remaining[match.end():]
+ if id_match:
+ result.append(Token('identifier', id_match.group()))
+ remaining = remaining[id_match.end():]
continue
- match = CONSTANT.match(remaining)
- if match:
- result.append(Token('constant', match.group()))
- remaining = remaining[match.end():]
+ was_constant = False
+ for constant in [DECIMAL_CONSTANT, BINARY_CONSTANT, OCTAL_CONSTANT, HEX_CONSTANT, FLOAT_CONSTANT, HEX_FLOAT_CONSTANT, CHAR_CONSTANT]:
+ match = constant.match(remaining)
+ if match:
+ result.append(Token('constant', match.group()))
+ remaining = remaining[match.end():]
+ was_constant = True
+ break
+ if was_constant:
continue
match = STRING_LITERAL.match(remaining)
if match: