diff options
| -rw-r--r-- | crowbar_reference_compiler/parser.py | 301 |
| -rw-r--r-- | crowbar_reference_compiler/scanner.py | 60 |
| -rw-r--r-- | tests/test_hello_world.py | 2 |
| -rw-r--r-- | tests/test_parsing.py | 34 |
4 files changed, 203 insertions, 194 deletions
diff --git a/crowbar_reference_compiler/parser.py b/crowbar_reference_compiler/parser.py index 9bb3179..8400480 100644 --- a/crowbar_reference_compiler/parser.py +++ b/crowbar_reference_compiler/parser.py @@ -2,169 +2,154 @@ from parsimonious import TokenGrammar, ParseError, IncompleteParseError  # type:  grammar = TokenGrammar(      r""" -HeaderFile                 = HeaderFileElement+ -HeaderFileElement          = IncludeStatement / -                             TypeDeclaration / -                             FunctionDeclaration - -ImplementationFile         = ImplementationFileElement+ -ImplementationFileElement  = HeaderFileElement / -                             FunctionDefinition - -IncludeStatement           = "include" string_literal ";" - -TypeDeclaration            = StructDeclaration / -                             EnumDeclaration / -                             TypedefDeclaration -StructDeclaration          = "struct" identifier "{" VariableDeclaration+ "}" ";" -EnumDeclaration            = "enum" identifier "{" EnumBody "}" ";" -EnumBody                   = (identifier ("=" Expression)? "," EnumBody) / -                             (identifier ("=" Expression)? ","?) -TypedefDeclaration         = "typedef" identifier "=" Type ";" - -FunctionDeclaration        = FunctionSignature ";" -FunctionDefinition         = FunctionSignature Block -FunctionSignature          = Type identifier "(" SignatureArguments? ")" -SignatureArguments         = (Type identifier "," SignatureArguments) / -                             (Type identifier ","?) 
- -Block                      = "{" Statement* "}" +HeaderFile <- IncludeStatement* HeaderFileElement+ +HeaderFileElement <- TypeDefinition / FunctionDeclaration / VariableDefinition / VariableDeclaration + +ImplementationFile <- IncludeStatement* ImplementationFileElement+ +ImplementationFileElement <- TypeDefinition / VariableDefinition / FunctionDefinition + +IncludeStatement <- 'include' string-literal ';' + +TypeDefinition <- StructDefinition / EnumDefinition / UnionDefinition +StructDefinition <- NormalStructDefinition / OpaqueStructDefinition +NormalStructDefinition <- 'struct' identifier '{' VariableDeclaration+ '}' +OpaqueStructDefinition <- 'opaque' 'struct' identifier ';' +EnumDefinition <- 'enum' identifier '{' EnumMember (',' EnumMember)* ','? '}' +EnumMember <- identifier ('=' Expression)? +UnionDefinition <- RobustUnionDefinition / FragileUnionDefinition +RobustUnionDefinition <- 'union' identifier '{' VariableDeclaration UnionBody '}' +UnionBody <- 'switch' '(' identifier ')' '{' UnionBodySet+ '}' +UnionBodySet <- CaseSpecifier+ (VariableDeclaration / ';') +FragileUnionDefinition <- 'fragile' 'union' identifier '{' VariableDeclaration+ '}' + +FunctionDeclaration <- FunctionSignature ';' +FunctionDefinition <- FunctionSignature Block +FunctionSignature <- Type identifier '(' SignatureArguments? ')' +SignatureArguments <- Type identifier (',' Type identifier)* ','? 
+ +Block <- '{' Statement* '}' -Statement                  = VariableDefinition / -                             VariableDeclaration / -                             IfStatement / -                             SwitchStatement / -                             WhileStatement / -                             DoWhileStatement / -                             ForStatement / -                             FlowControlStatement / -                             AssignmentStatement / -                             ExpressionStatement - -VariableDefinition         = Type identifier "=" Expression ";" -VariableDeclaration        = Type identifier ";" - -IfStatement                = ("if" Expression Block "else" Block) / -                             ("if" Expression Block) - -SwitchStatement            = "switch" Expression "{" SwitchCase+ "}" -SwitchCase                 = (CaseSpecifier Block) / -                             ("default" Block) -CaseSpecifier              = ("case" Expression "," CaseSpecifier) / -                             ("case" Expression ","?) - -WhileStatement             = "while" Expression Block -DoWhileStatement           = "do" Block "while" Expression ";" -ForStatement               = "for" VariableDefinition? ";" Expression ";" AssignmentStatementBody? Block -    -FlowControlStatement       = ("continue" ";") / -                             ("break" ";") / -                             ("return" Expression? 
";") -    -AssignmentStatement        = AssignmentStatementBody ";" -AssignmentStatementBody    = (AssignmentTargetExpression "=" Expression) / -                             (AssignmentTargetExpression "+=" Expression) / -                             (AssignmentTargetExpression "-=" Expression) / -                             (AssignmentTargetExpression "*=" Expression) / -                             (AssignmentTargetExpression "/=" Expression) / -                             (AssignmentTargetExpression "%=" Expression) / -                             (AssignmentTargetExpression "&=" Expression) / -                             (AssignmentTargetExpression "^=" Expression) / -                             (AssignmentTargetExpression "|=" Expression) / -                             (AssignmentTargetExpression "++") / -                             (AssignmentTargetExpression "--") - -ExpressionStatement        = Expression ";" -    -Type                       = ("const" BasicType) / -                             (BasicType "*") / -                             (BasicType "[" Expression "]") / -                             (BasicType "function" "(" (BasicType ",")* ")") / -                             BasicType -BasicType                  = "void" / -                             IntegerType / -                             ("signed" IntegerType) / -                             ("unsigned" IntegerType) / -                             "float" / -                             "double" / -                             "bool" / -                             ("struct" identifier) / -                             ("enum" identifier) / -                             ("typedef" identifier) / -                             ("(" Type ")") -IntegerType                = "char" / -                             "short" / -                             "int" / -                             "long" - -AssignmentTargetExpression = identifier ATEElementSuffix* -ATEElementSuffix           = ("[" 
Expression "]") / -                             ("." identifier) / -                             ("->" identifier) - -AtomicExpression           = identifier / -                             constant / -                             string_literal / -                             ("(" Expression ")") - -ObjectExpression           = (AtomicExpression ObjectSuffix*) / -                             ArrayLiteralExpression / -                             StructLiteralExpression -ObjectSuffix               = ("[" Expression "]") / -                             ("(" CommasExpressionList? ")") / -                             ("." identifier) / -                             ("->" identifier) -CommasExpressionList       = (Expression "," CommasExpressionList) / -                             (Expression ","?) -ArrayLiteralExpression     = "{" CommasExpressionList "}" -StructLiteralExpression    = "{" StructLiteralBody "}" -StructLiteralBody          = (StructLiteralElement "," StructLiteralBody?) / -                             (StructLiteralElement ","?) -StructLiteralElement       = "." identifier "=" Expression - -FactorExpression           = ("(" Type ")" FactorExpression) / -                             ("&" FactorExpression) / -                             ("*" FactorExpression) / -                             ("+" FactorExpression) / -                             ("-" FactorExpression) / -                             ("~" FactorExpression) / -                             ("!" 
FactorExpression) / -                             ("sizeof" FactorExpression) / -                             ("sizeof" Type) / -                             ObjectExpression - -TermExpression             = FactorExpression TermSuffix* -TermSuffix                 = ("*" FactorExpression) / -                             ("/" FactorExpression) / -                             ("%" FactorExpression) - -ArithmeticExpression       = TermExpression ArithmeticSuffix* -ArithmeticSuffix           = ("+" TermExpression) / -                             ("-" TermExpression) - -BitwiseOpExpression        = (ArithmeticExpression "<<" ArithmeticExpression) / -                             (ArithmeticExpression ">>" ArithmeticExpression) / -                             (ArithmeticExpression "^" ArithmeticExpression) / -                             (ArithmeticExpression ("&" ArithmeticExpression)+) / -                             (ArithmeticExpression ("|" ArithmeticExpression)+) / -                             ArithmeticExpression - -ComparisonExpression       = (BitwiseOpExpression "==" BitwiseOpExpression) / -                             (BitwiseOpExpression "!=" BitwiseOpExpression) / -                             (BitwiseOpExpression "<=" BitwiseOpExpression) / -                             (BitwiseOpExpression ">=" BitwiseOpExpression) / -                             (BitwiseOpExpression "<" BitwiseOpExpression) / -                             (BitwiseOpExpression ">" BitwiseOpExpression) / -                             BitwiseOpExpression - -Expression                 = (ComparisonExpression ("&&" ComparisonExpression)+) / -                             (ComparisonExpression ("||" ComparisonExpression)+) / -                             ComparisonExpression +Statement <- VariableDefinition / StructureStatement / FlowControlStatement / AssignmentStatement / FragileStatement / ExpressionStatement / EmptyStatement +EmptyStatement <- ';' +FragileStatement <- 'fragile' Statement 
+ExpressionStatement <- Expression ';' + +VariableDeclaration <- Type identifier ';' +VariableDefinition <- Type identifier '=' Expression ';' + +StructureStatement <- IfStatement / SwitchStatement / WhileStatement / DoWhileStatement / ForStatement +IfStatement <- 'if' '(' Expression ')' Block ('else' Block)? +SwitchStatement <- 'switch' '(' Expression ')' '{' (CaseSpecifier / Statement)+ '}' +CaseSpecifier <- ('case' Expression ':') / ('default' ':') +WhileStatement <- 'while' '(' Expression ')' Block +DoWhileStatement <- 'do' Block 'while' '(' Expression ')' ';' +ForStatement <- 'for' '(' ForInit? ';' Expression ';' ForUpdate? ')' Block +ForInit <- ForInitializer (',' ForInitializer)* ','? +ForInitializer <- Type identifier '=' Expression +ForUpdate <- AssignmentBody (',' AssignmentBody)* ','? + +FlowControlStatement <- ContinueStatement / BreakStatement / ReturnStatement +ContinueStatement <- 'continue' ';' +BreakStatement <- 'break' ';' +ReturnStatement <- 'return' Expression? ';' + +AssignmentStatement <- AssignmentBody ';' +AssignmentBody <- DirectAssignmentBody / UpdateAssignmentBody / CrementAssignmentBody +DirectAssignmentBody <- Expression '=' Expression +UpdateAssignmentBody <- Expression ('+=' / '-=' / '*=' / '/=' / '%=' / '&=' / '^=' / '|=') Expression +CrementAssignmentBody <- Expression ('++' / '--') + +Type <- ConstType / PointerType / ArrayType / FunctionType / BasicType +ConstType <- 'const' BasicType +PointerType <- BasicType '*' +ArrayType <- BasicType '[' Expression ']' +FunctionType <- BasicType 'function' '(' FunctionTypeArgs? ')' +FunctionTypeArgs <- BasicType (',' BasicType)* ','? 
+BasicType <- 'void' / 'bool' / 'float32' / 'float64' / +             'int8' / 'int16' / 'int32' / 'int64' / 'intaddr' / 'intmax' / 'intsize' / +             'uint8' / 'uint16' / 'uint32' / 'uint64' / 'uintaddr' / 'uintmax' / 'uintsize' / +             ('struct' identifier) / ('enum' identifier) / ('union' identifier) / ('(' Type ')') + + +AtomicExpression <- identifier / constant / 'true' / 'false' / string-literal / ('(' Expression ')') + +ObjectExpression <- (AtomicExpression ObjectSuffix*) / ArrayLiteral / StructLiteral +ObjectSuffix <- ArrayIndexSuffix / FunctionCallSuffix / StructElementSuffix / StructPointerElementSuffix + +ArrayIndexSuffix <- '[' Expression ']' + +FunctionCallSuffix <- '(' CommasExpressionList? ')' +CommasExpressionList <- Expression (',' Expression)* ','? + +StructElementSuffix <- '.' identifier + +StructPointerElementSuffix <- '->' identifier + +ArrayLiteral <- '{' CommasExpressionList '}' + +StructLiteral <- '{' StructLiteralElement (',' StructLiteralElement)* ','? '}' +StructLiteralElement <- '.' identifier '=' Expression + +FactorExpression <- CastExpression / AddressOfExpression / DerefExpression / PositiveExpression / NegativeExpression / BitwiseNotExpression / LogicalNotExpression / SizeofExpression / ObjectExpression + +CastExpression <- '(' Type ')' ObjectExpression + +AddressOfExpression <- '&' ObjectExpression + +DerefExpression <- '*' ObjectExpression + +PositiveExpression <- '+' ObjectExpression + +NegativeExpression <- '-' ObjectExpression + +BitwiseNotExpression <- '~' ObjectExpression + +LogicalNotExpression <- '!' ObjectExpression + +SizeofExpression <- ('sizeof' ObjectExpression) / ('sizeof' Type) + +TermExpression <- FactorExpression TermSuffix? +TermSuffix <- ('*' FactorExpression)+ / ('/' FactorExpression)+ / ('%' FactorExpression)+ + +ArithmeticExpression <- TermExpression ArithmeticSuffix? 
+ArithmeticSuffix <- ('+' TermExpression)+ / ('-' TermExpression)+ + +BitwiseOpExpression <- ShiftExpression / XorExpression / BitwiseAndExpression / BitwiseOrExpression / ArithmeticExpression + +ShiftExpression <- (ArithmeticExpression '<<' ArithmeticExpression) / (ArithmeticExpression '>>' ArithmeticExpression) + +XorExpression <- ArithmeticExpression '^' ArithmeticExpression + +BitwiseAndExpression <- ArithmeticExpression ('&' ArithmeticExpression)+ + +BitwiseOrExpression <- ArithmeticExpression ('|' ArithmeticExpression)+ + +ComparisonExpression <- EqualExpression / NotEqualExpression / LessEqExpression / GreaterEqExpression / LessThanExpression / GreaterThanExpression / BitwiseOpExpression + +EqualExpression <- BitwiseOpExpression '==' BitwiseOpExpression + +NotEqualExpression <- BitwiseOpExpression '!=' BitwiseOpExpression + +LessEqExpression <- BitwiseOpExpression '<=' BitwiseOpExpression + +GreaterEqExpression <- BitwiseOpExpression '>=' BitwiseOpExpression + +LessThanExpression <- BitwiseOpExpression '<' BitwiseOpExpression + +GreaterThanExpression <- BitwiseOpExpression '>' BitwiseOpExpression + +LogicalOpExpression <- LogicalAndExpression / LogicalOrExpression / ComparisonExpression + +LogicalAndExpression <- ComparisonExpression ('&&' ComparisonExpression)+ + +LogicalOrExpression <- ComparisonExpression ('||' ComparisonExpression)+ + +Expression <- LogicalOpExpression  identifier = "identifier"  constant = "constant"  string_literal = "string_literal" -""") +""".replace(' <- ', ' = ').replace('string-literal', 'string_literal'))  class LegibleParseError(ParseError): diff --git a/crowbar_reference_compiler/scanner.py b/crowbar_reference_compiler/scanner.py index ea34536..1d846f3 100644 --- a/crowbar_reference_compiler/scanner.py +++ b/crowbar_reference_compiler/scanner.py @@ -1,5 +1,5 @@  from dataclasses import dataclass -from typing import Optional, overload, List, Union +from typing import Optional, List  import regex as re  # type: ignore @@ -24,12 
+24,33 @@ class GenerousTokenList(List[Token]):              return Token('') -KEYWORD = re.compile("bool|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|fragile|function|if|include|int|long|return|short|signed|sizeof|struct|switch|typedef|unsigned|void|while") +KEYWORD = re.compile(r""" +    bool|break| +    case|char|const|continue| +    default|do| +    else|enum| +    false|float32|float64|for|fragile|function| +    if|include|int8|int16|int32|int64|intmax|intsize| +    opaque| +    return| +    sizeof|struct|switch| +    true| +    uint8|uint16|uint32|uint64|uintaddr|uintmax|uintsize|union| +    void| +    while""", re.VERBOSE)  IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*") -CONSTANT = re.compile(r"""([0-9_]+)|(0[bB][01_]+)|0o[0-7_]+|(0[xX][0-9a-fA-F_]+)|([0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+))|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')""") -STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''') +DECIMAL_CONSTANT = re.compile(r"[0-9_]+") +BINARY_CONSTANT = re.compile(r"0[bB][01_]+") +OCTAL_CONSTANT = re.compile(r"0o[0-7_]+") +HEX_CONSTANT = re.compile(r"0[xX][0-9a-fA-F]+") +FLOAT_CONSTANT = re.compile(r"[0-9_]+\.[0-9_]+([eE][+-]?[0-9_]+)?") +HEX_FLOAT_CONSTANT = re.compile(r"0(fx|FX)[0-9a-fA-F_]+\.[0-9a-fA-F_]+[pP][+-]?[0-9_]+") + +_ESCAPE_SEQUENCE = r"""\\['"\\rnt0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}""" +CHAR_CONSTANT = re.compile(r"'([^'\\]|" + _ESCAPE_SEQUENCE + r")'") +STRING_LITERAL = re.compile(r'"([^"\\]|' + _ESCAPE_SEQUENCE + r')+"')  PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]") -WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+") +WHITESPACE = re.compile(r"[\p{Z}\p{Cc}]+")  COMMENT = re.compile(r"(//[^\n]*\n)|(/\*.*?\*/)", re.DOTALL) @@ -46,20 +67,25 @@ def scan(code):          
if match:              remaining = remaining[match.end():]              continue -        match = KEYWORD.match(remaining) -        if match: -            result.append(Token(match.group())) -            remaining = remaining[match.end():] +        kw_match = KEYWORD.match(remaining) +        id_match = IDENTIFIER.match(remaining) +        if kw_match and ((not id_match) or len(kw_match.group()) == len(id_match.group())): +            result.append(Token(kw_match.group())) +            remaining = remaining[kw_match.end():]              continue -        match = IDENTIFIER.match(remaining) -        if match: -            result.append(Token('identifier', match.group())) -            remaining = remaining[match.end():] +        if id_match: +            result.append(Token('identifier', id_match.group())) +            remaining = remaining[id_match.end():]              continue -        match = CONSTANT.match(remaining) -        if match: -            result.append(Token('constant', match.group())) -            remaining = remaining[match.end():] +        was_constant = False +        for constant in [DECIMAL_CONSTANT, BINARY_CONSTANT, OCTAL_CONSTANT, HEX_CONSTANT, FLOAT_CONSTANT, HEX_FLOAT_CONSTANT, CHAR_CONSTANT]: +            match = constant.match(remaining) +            if match: +                result.append(Token('constant', match.group())) +                remaining = remaining[match.end():] +                was_constant = True +                break +        if was_constant:              continue          match = STRING_LITERAL.match(remaining)          if match: diff --git a/tests/test_hello_world.py b/tests/test_hello_world.py index ec1ccdd..594ab4f 100644 --- a/tests/test_hello_world.py +++ b/tests/test_hello_world.py @@ -8,7 +8,7 @@ class TestHelloWorld(unittest.TestCase):          code = r"""  include "stdio.hro"; -int main() { +int32 main() {      printf("Hello, world!\n");      return 0;  } diff --git a/tests/test_parsing.py b/tests/test_parsing.py 
index 91787a6..7463fe7 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -5,41 +5,39 @@ from crowbar_reference_compiler import parse_header, parse_implementation, scan  class TestParsing(unittest.TestCase):      def test_basic(self): -        print(parse_header(scan("int x();"))) +        print(parse_header(scan("int8 x();")))      def test_scdoc_str(self):          # adapted from https://git.sr.ht/~sircmpwn/scdoc/tree/master/include/str.h          print(parse_header(scan(r""" -include "stdint.h"; -  struct str { -    char *str; -    typedef size_t len; -    typedef size_t size; -}; +    (uint8[size])* str; +    uintsize len; +    uintsize size; +}  struct str *str_create();  void str_free(struct str *str);  void str_reset(struct str *str); -int str_append_ch(struct str *str, typedef uint32_t ch); +intsize str_append_ch(struct str *str, uint32 ch);  """)))          # adapted from https://git.sr.ht/~sircmpwn/scdoc/tree/master/src/string.c          print(parse_implementation(scan(r""" -include "stdlib.h"; -include "stdint.h"; -include "str.h"; -include "unicode.h"; +include "stdlib.hro"; +include "stdint.hro"; +include "str.hro"; +include "unicode.hro"; -int ensure_capacity(struct str *str, typedef size_t len) { +bool ensure_capacity(struct str *str, intsize len) {      if (len + 1 >= str->size) { -        char *new = realloc(str->str, str->size * 2); +        (uint8[str->size * 2])* new = realloc(str->str, str->size * 2);          if (!new) { -            return 0; +            return false;          }          str->str = new;          str->size *= 2;      } -    return 1; +    return true;  }  struct str *str_create() { @@ -59,8 +57,8 @@ void str_free(struct str *str) {      free(str);  } -int str_append_ch(struct str *str, typedef uint32_t ch) { -    int size = utf8_chsize(ch); +intsize str_append_ch(struct str *str, uint32 ch) { +    intsize size = utf8_chsize(ch);      if (size <= 0) {          return -1;      }  |