From 338049020a17831e68b6a437bb038d8f10bfc45e Mon Sep 17 00:00:00 2001 From: Melody Horn Date: Wed, 4 Nov 2020 02:07:00 -0700 Subject: bring in updates to spec --- crowbar_reference_compiler/parser.py | 301 ++++++++++++++++------------------ crowbar_reference_compiler/scanner.py | 60 +++++-- tests/test_hello_world.py | 2 +- tests/test_parsing.py | 34 ++-- 4 files changed, 203 insertions(+), 194 deletions(-) diff --git a/crowbar_reference_compiler/parser.py b/crowbar_reference_compiler/parser.py index 9bb3179..8400480 100644 --- a/crowbar_reference_compiler/parser.py +++ b/crowbar_reference_compiler/parser.py @@ -2,169 +2,154 @@ from parsimonious import TokenGrammar, ParseError, IncompleteParseError # type: grammar = TokenGrammar( r""" -HeaderFile = HeaderFileElement+ -HeaderFileElement = IncludeStatement / - TypeDeclaration / - FunctionDeclaration - -ImplementationFile = ImplementationFileElement+ -ImplementationFileElement = HeaderFileElement / - FunctionDefinition - -IncludeStatement = "include" string_literal ";" - -TypeDeclaration = StructDeclaration / - EnumDeclaration / - TypedefDeclaration -StructDeclaration = "struct" identifier "{" VariableDeclaration+ "}" ";" -EnumDeclaration = "enum" identifier "{" EnumBody "}" ";" -EnumBody = (identifier ("=" Expression)? "," EnumBody) / - (identifier ("=" Expression)? ","?) -TypedefDeclaration = "typedef" identifier "=" Type ";" - -FunctionDeclaration = FunctionSignature ";" -FunctionDefinition = FunctionSignature Block -FunctionSignature = Type identifier "(" SignatureArguments? ")" -SignatureArguments = (Type identifier "," SignatureArguments) / - (Type identifier ","?) - -Block = "{" Statement* "}" +HeaderFile <- IncludeStatement* HeaderFileElement+ +HeaderFileElement <- TypeDefinition / FunctionDeclaration / VariableDefinition / VariableDeclaration + +ImplementationFile <- IncludeStatement* ImplementationFileElement+ +ImplementationFileElement <- TypeDefinition / VariableDefinition / FunctionDefinition + +IncludeStatement <- 'include' string-literal ';' + +TypeDefinition <- StructDefinition / EnumDefinition / UnionDefinition +StructDefinition <- NormalStructDefinition / OpaqueStructDefinition +NormalStructDefinition <- 'struct' identifier '{' VariableDeclaration+ '}' +OpaqueStructDefinition <- 'opaque' 'struct' identifier ';' +EnumDefinition <- 'enum' identifier '{' EnumMember (',' EnumMember)* ','? '}' +EnumMember <- identifier ('=' Expression)? +UnionDefinition <- RobustUnionDefinition / FragileUnionDefinition +RobustUnionDefinition <- 'union' identifier '{' VariableDeclaration UnionBody '}' +UnionBody <- 'switch' '(' identifier ')' '{' UnionBodySet+ '}' +UnionBodySet <- CaseSpecifier+ (VariableDeclaration / ';') +FragileUnionDefinition <- 'fragile' 'union' identifier '{' VariableDeclaration+ '}' + +FunctionDeclaration <- FunctionSignature ';' +FunctionDefinition <- FunctionSignature Block +FunctionSignature <- Type identifier '(' SignatureArguments? ')' +SignatureArguments <- Type identifier (',' Type identifier)* ','? + +Block <- '{' Statement* '}' -Statement = VariableDefinition / - VariableDeclaration / - IfStatement / - SwitchStatement / - WhileStatement / - DoWhileStatement / - ForStatement / - FlowControlStatement / - AssignmentStatement / - ExpressionStatement - -VariableDefinition = Type identifier "=" Expression ";" -VariableDeclaration = Type identifier ";" - -IfStatement = ("if" Expression Block "else" Block) / - ("if" Expression Block) - -SwitchStatement = "switch" Expression "{" SwitchCase+ "}" -SwitchCase = (CaseSpecifier Block) / - ("default" Block) -CaseSpecifier = ("case" Expression "," CaseSpecifier) / - ("case" Expression ","?) - -WhileStatement = "while" Expression Block -DoWhileStatement = "do" Block "while" Expression ";" -ForStatement = "for" VariableDefinition? ";" Expression ";" AssignmentStatementBody? Block - -FlowControlStatement = ("continue" ";") / - ("break" ";") / - ("return" Expression? ";") - -AssignmentStatement = AssignmentStatementBody ";" -AssignmentStatementBody = (AssignmentTargetExpression "=" Expression) / - (AssignmentTargetExpression "+=" Expression) / - (AssignmentTargetExpression "-=" Expression) / - (AssignmentTargetExpression "*=" Expression) / - (AssignmentTargetExpression "/=" Expression) / - (AssignmentTargetExpression "%=" Expression) / - (AssignmentTargetExpression "&=" Expression) / - (AssignmentTargetExpression "^=" Expression) / - (AssignmentTargetExpression "|=" Expression) / - (AssignmentTargetExpression "++") / - (AssignmentTargetExpression "--") - -ExpressionStatement = Expression ";" - -Type = ("const" BasicType) / - (BasicType "*") / - (BasicType "[" Expression "]") / - (BasicType "function" "(" (BasicType ",")* ")") / - BasicType -BasicType = "void" / - IntegerType / - ("signed" IntegerType) / - ("unsigned" IntegerType) / - "float" / - "double" / - "bool" / - ("struct" identifier) / - ("enum" identifier) / - ("typedef" identifier) / - ("(" Type ")") -IntegerType = "char" / - "short" / - "int" / - "long" - -AssignmentTargetExpression = identifier ATEElementSuffix* -ATEElementSuffix = ("[" Expression "]") / - ("." identifier) / - ("->" identifier) - -AtomicExpression = identifier / - constant / - string_literal / - ("(" Expression ")") - -ObjectExpression = (AtomicExpression ObjectSuffix*) / - ArrayLiteralExpression / - StructLiteralExpression -ObjectSuffix = ("[" Expression "]") / - ("(" CommasExpressionList? ")") / - ("." identifier) / - ("->" identifier) -CommasExpressionList = (Expression "," CommasExpressionList) / - (Expression ","?) -ArrayLiteralExpression = "{" CommasExpressionList "}" -StructLiteralExpression = "{" StructLiteralBody "}" -StructLiteralBody = (StructLiteralElement "," StructLiteralBody?) / - (StructLiteralElement ","?) -StructLiteralElement = "." identifier "=" Expression - -FactorExpression = ("(" Type ")" FactorExpression) / - ("&" FactorExpression) / - ("*" FactorExpression) / - ("+" FactorExpression) / - ("-" FactorExpression) / - ("~" FactorExpression) / - ("!" FactorExpression) / - ("sizeof" FactorExpression) / - ("sizeof" Type) / - ObjectExpression - -TermExpression = FactorExpression TermSuffix* -TermSuffix = ("*" FactorExpression) / - ("/" FactorExpression) / - ("%" FactorExpression) - -ArithmeticExpression = TermExpression ArithmeticSuffix* -ArithmeticSuffix = ("+" TermExpression) / - ("-" TermExpression) - -BitwiseOpExpression = (ArithmeticExpression "<<" ArithmeticExpression) / - (ArithmeticExpression ">>" ArithmeticExpression) / - (ArithmeticExpression "^" ArithmeticExpression) / - (ArithmeticExpression ("&" ArithmeticExpression)+) / - (ArithmeticExpression ("|" ArithmeticExpression)+) / - ArithmeticExpression - -ComparisonExpression = (BitwiseOpExpression "==" BitwiseOpExpression) / - (BitwiseOpExpression "!=" BitwiseOpExpression) / - (BitwiseOpExpression "<=" BitwiseOpExpression) / - (BitwiseOpExpression ">=" BitwiseOpExpression) / - (BitwiseOpExpression "<" BitwiseOpExpression) / - (BitwiseOpExpression ">" BitwiseOpExpression) / - BitwiseOpExpression - -Expression = (ComparisonExpression ("&&" ComparisonExpression)+) / - (ComparisonExpression ("||" ComparisonExpression)+) / - ComparisonExpression +Statement <- VariableDefinition / StructureStatement / FlowControlStatement / AssignmentStatement / FragileStatement / ExpressionStatement / EmptyStatement +EmptyStatement <- ';' +FragileStatement <- 'fragile' Statement +ExpressionStatement <- Expression ';' + +VariableDeclaration <- Type identifier ';' +VariableDefinition <- Type identifier '=' Expression ';' + +StructureStatement <- IfStatement / SwitchStatement / WhileStatement / DoWhileStatement / ForStatement +IfStatement <- 'if' '(' Expression ')' Block ('else' Block)? +SwitchStatement <- 'switch' '(' Expression ')' '{' (CaseSpecifier / Statement)+ '}' +CaseSpecifier <- ('case' Expression ':') / ('default' ':') +WhileStatement <- 'while' '(' Expression ')' Block +DoWhileStatement <- 'do' Block 'while' '(' Expression ')' ';' +ForStatement <- 'for' '(' ForInit? ';' Expression ';' ForUpdate? ')' Block +ForInit <- ForInitializer (',' ForInitializer)* ','? +ForInitializer <- Type identifier '=' Expression +ForUpdate <- AssignmentBody (',' AssignmentBody)* ','? + +FlowControlStatement <- ContinueStatement / BreakStatement / ReturnStatement +ContinueStatement <- 'continue' ';' +BreakStatement <- 'break' ';' +ReturnStatement <- 'return' Expression? ';' + +AssignmentStatement <- AssignmentBody ';' +AssignmentBody <- DirectAssignmentBody / UpdateAssignmentBody / CrementAssignmentBody +DirectAssignmentBody <- Expression '=' Expression +UpdateAssignmentBody <- Expression ('+=' / '-=' / '*=' / '/=' / '%=' / '&=' / '^=' / '|=') Expression +CrementAssignmentBody <- Expression ('++' / '--') + +Type <- ConstType / PointerType / ArrayType / FunctionType / BasicType +ConstType <- 'const' BasicType +PointerType <- BasicType '*' +ArrayType <- BasicType '[' Expression ']' +FunctionType <- BasicType 'function' '(' FunctionTypeArgs? ')' +FunctionTypeArgs <- BasicType (',' BasicType)* ','? +BasicType <- 'void' / 'bool' / 'float32' / 'float64' / + 'int8' / 'int16' / 'int32' / 'int64' / 'intaddr' / 'intmax' / 'intsize' / + 'uint8' / 'uint16' / 'uint32' / 'uint64' / 'uintaddr' / 'uintmax' / 'uintsize' / + ('struct' identifier) / ('enum' identifier) / ('union' identifier) / ('(' Type ')') + + +AtomicExpression <- identifier / constant / 'true' / 'false' / string-literal / ('(' Expression ')') + +ObjectExpression <- (AtomicExpression ObjectSuffix*) / ArrayLiteral / StructLiteral +ObjectSuffix <- ArrayIndexSuffix / FunctionCallSuffix / StructElementSuffix / StructPointerElementSuffix + +ArrayIndexSuffix <- '[' Expression ']' + +FunctionCallSuffix <- '(' CommasExpressionList? ')' +CommasExpressionList <- Expression (',' Expression)* ','? + +StructElementSuffix <- '.' identifier + +StructPointerElementSuffix <- '->' identifier + +ArrayLiteral <- '{' CommasExpressionList '}' + +StructLiteral <- '{' StructLiteralElement (',' StructLiteralElement)* ','? '}' +StructLiteralElement <- '.' identifier '=' Expression + +FactorExpression <- CastExpression / AddressOfExpression / DerefExpression / PositiveExpression / NegativeExpression / BitwiseNotExpression / LogicalNotExpression / SizeofExpression / ObjectExpression + +CastExpression <- '(' Type ')' ObjectExpression + +AddressOfExpression <- '&' ObjectExpression + +DerefExpression <- '*' ObjectExpression + +PositiveExpression <- '+' ObjectExpression + +NegativeExpression <- '-' ObjectExpression + +BitwiseNotExpression <- '~' ObjectExpression + +LogicalNotExpression <- '!' ObjectExpression + +SizeofExpression <- ('sizeof' ObjectExpression) / ('sizeof' Type) + +TermExpression <- FactorExpression TermSuffix? +TermSuffix <- ('*' FactorExpression)+ / ('/' FactorExpression)+ / ('%' FactorExpression)+ + +ArithmeticExpression <- TermExpression ArithmeticSuffix? +ArithmeticSuffix <- ('+' TermExpression)+ / ('-' TermExpression)+ + +BitwiseOpExpression <- ShiftExpression / XorExpression / BitwiseAndExpression / BitwiseOrExpression / ArithmeticExpression + +ShiftExpression <- (ArithmeticExpression '<<' ArithmeticExpression) / (ArithmeticExpression '>>' ArithmeticExpression) + +XorExpression <- ArithmeticExpression '^' ArithmeticExpression + +BitwiseAndExpression <- ArithmeticExpression ('&' ArithmeticExpression)+ + +BitwiseOrExpression <- ArithmeticExpression ('|' ArithmeticExpression)+ + +ComparisonExpression <- EqualExpression / NotEqualExpression / LessEqExpression / GreaterEqExpression / LessThanExpression / GreaterThanExpression / BitwiseOpExpression + +EqualExpression <- BitwiseOpExpression '==' BitwiseOpExpression + +NotEqualExpression <- BitwiseOpExpression '!=' BitwiseOpExpression + +LessEqExpression <- BitwiseOpExpression '<=' BitwiseOpExpression + +GreaterEqExpression <- BitwiseOpExpression '>=' BitwiseOpExpression + +LessThanExpression <- BitwiseOpExpression '<' BitwiseOpExpression + +GreaterThanExpression <- BitwiseOpExpression '>' BitwiseOpExpression + +LogicalOpExpression <- LogicalAndExpression / LogicalOrExpression / ComparisonExpression + +LogicalAndExpression <- ComparisonExpression ('&&' ComparisonExpression)+ + +LogicalOrExpression <- ComparisonExpression ('||' ComparisonExpression)+ + +Expression <- LogicalOpExpression identifier = "identifier" constant = "constant" string_literal = "string_literal" -""") +""".replace(' <- ', ' = ').replace('string-literal', 'string_literal')) class LegibleParseError(ParseError): diff --git a/crowbar_reference_compiler/scanner.py b/crowbar_reference_compiler/scanner.py index ea34536..1d846f3 100644 --- a/crowbar_reference_compiler/scanner.py +++ b/crowbar_reference_compiler/scanner.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional, overload, List, Union +from typing import Optional, List import regex as re # type: ignore @@ -24,12 +24,33 @@ class GenerousTokenList(List[Token]): return Token('') -KEYWORD = re.compile("bool|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|fragile|function|if|include|int|long|return|short|signed|sizeof|struct|switch|typedef|unsigned|void|while") +KEYWORD = re.compile(r""" + bool|break| + case|char|const|continue| + default|do| + else|enum| + false|float32|float64|for|fragile|function| + if|include|int8|int16|int32|int64|intmax|intsize| + opaque| + return| + sizeof|struct|switch| + true| + uint8|uint16|uint32|uint64|uintaddr|uintmax|uintsize|union| + void| + while""", re.VERBOSE) IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*") -CONSTANT = re.compile(r"""([0-9_]+)|(0[bB][01_]+)|0o[0-7_]+|(0[xX][0-9a-fA-F_]+)|([0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+))|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')""") -STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''') +DECIMAL_CONSTANT = re.compile(r"[0-9_]+") +BINARY_CONSTANT = re.compile(r"0[bB][01_]+") +OCTAL_CONSTANT = re.compile(r"0o[0-7_]+") +HEX_CONSTANT = re.compile(r"0[xX][0-9a-fA-F]+") +FLOAT_CONSTANT = re.compile(r"[0-9_]+\.[0-9_]+([eE][+-]?[0-9_]+)?") +HEX_FLOAT_CONSTANT = re.compile(r"0(fx|FX)[0-9a-fA-F_]+\.[0-9a-fA-F_]+[pP][+-]?[0-9_]+") + +_ESCAPE_SEQUENCE = r"""\\['"\\rnt0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}""" +CHAR_CONSTANT = re.compile(r"'([^'\\]|" + _ESCAPE_SEQUENCE + r")'") +STRING_LITERAL = re.compile(r'"([^"\\]|' + _ESCAPE_SEQUENCE + r')+"') PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]") -WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+") +WHITESPACE = re.compile(r"[\p{Z}\p{Cc}]+") COMMENT = re.compile(r"(//[^\n]*\n)|(/\*.*?\*/)", re.DOTALL) @@ -46,20 +67,25 @@ def scan(code): if match: remaining = remaining[match.end():] continue - match = KEYWORD.match(remaining) - if match: - result.append(Token(match.group())) - remaining = remaining[match.end():] + kw_match = KEYWORD.match(remaining) + id_match = IDENTIFIER.match(remaining) + if kw_match and ((not id_match) or len(kw_match.group()) == len(id_match.group())): + result.append(Token(kw_match.group())) + remaining = remaining[kw_match.end():] continue - match = IDENTIFIER.match(remaining) - if match: - result.append(Token('identifier', match.group())) - remaining = remaining[match.end():] + if id_match: + result.append(Token('identifier', id_match.group())) + remaining = remaining[id_match.end():] continue - match = CONSTANT.match(remaining) - if match: - result.append(Token('constant', match.group())) - remaining = remaining[match.end():] + was_constant = False + for constant in [DECIMAL_CONSTANT, BINARY_CONSTANT, OCTAL_CONSTANT, HEX_CONSTANT, FLOAT_CONSTANT, HEX_FLOAT_CONSTANT, CHAR_CONSTANT]: + match = constant.match(remaining) + if match: + result.append(Token('constant', match.group())) + remaining = remaining[match.end():] + was_constant = True + break + if was_constant: continue match = STRING_LITERAL.match(remaining) if match: diff --git a/tests/test_hello_world.py b/tests/test_hello_world.py index ec1ccdd..594ab4f 100644 --- a/tests/test_hello_world.py +++ b/tests/test_hello_world.py @@ -8,7 +8,7 @@ class TestHelloWorld(unittest.TestCase): code = r""" include "stdio.hro"; -int main() { +int32 main() { printf("Hello, world!\n"); return 0; } diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 91787a6..7463fe7 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -5,41 +5,39 @@ from crowbar_reference_compiler import parse_header, parse_implementation, scan class TestParsing(unittest.TestCase): def test_basic(self): - print(parse_header(scan("int x();"))) + print(parse_header(scan("int8 x();"))) def test_scdoc_str(self): # adapted from https://git.sr.ht/~sircmpwn/scdoc/tree/master/include/str.h print(parse_header(scan(r""" -include "stdint.h"; - struct str { - char *str; - typedef size_t len; - typedef size_t size; -}; + (uint8[size])* str; + uintsize len; + uintsize size; +} struct str *str_create(); void str_free(struct str *str); void str_reset(struct str *str); -int str_append_ch(struct str *str, typedef uint32_t ch); +intsize str_append_ch(struct str *str, uint32 ch); """))) # adapted from https://git.sr.ht/~sircmpwn/scdoc/tree/master/src/string.c print(parse_implementation(scan(r""" -include "stdlib.h"; -include "stdint.h"; -include "str.h"; -include "unicode.h"; +include "stdlib.hro"; +include "stdint.hro"; +include "str.hro"; +include "unicode.hro"; -int ensure_capacity(struct str *str, typedef size_t len) { +bool ensure_capacity(struct str *str, intsize len) { if (len + 1 >= str->size) { - char *new = realloc(str->str, str->size * 2); + (uint8[str->size * 2])* new = realloc(str->str, str->size * 2); if (!new) { - return 0; + return false; } str->str = new; str->size *= 2; } - return 1; + return true; } struct str *str_create() { @@ -59,8 +57,8 @@ void str_free(struct str *str) { free(str); } -int str_append_ch(struct str *str, typedef uint32_t ch) { - int size = utf8_chsize(ch); +intsize str_append_ch(struct str *str, uint32 ch) { + intsize size = utf8_chsize(ch); if (size <= 0) { return -1; } -- cgit v1.2.3