# Provenance (from the original patch header):
#   From 732ef5e9787e618ead3a18f9f6aa2ed1f03f1670 Mon Sep 17 00:00:00 2001
#   From: Melody Horn
#   Date: Wed, 14 Oct 2020 17:44:38 -0600
#   Subject: throw early draft of parser into the world
#
#   crowbar_reference_compiler/__init__.py |   2 +
#   crowbar_reference_compiler/parser.py   | 200 ++++++++++++++++++++++++++
#   crowbar_reference_compiler/scanner.py  |  76 ++++++++++
#   3 files changed, 278 insertions(+)
#
# ---- crowbar_reference_compiler/__init__.py (new file, 7c7fca7) ----
#   from .parser import parse_header, parse_implementation
#   from .scanner import scan
#
# ---- crowbar_reference_compiler/parser.py (new file, 9bb3179) ----
from parsimonious import TokenGrammar, ParseError, IncompleteParseError  # type: ignore

# PEG grammar over a *token* stream: every quoted literal below is compared
# against a token's ``type`` attribute, not against source characters.
#
# NOTE(review): ``ForStatement`` is
#     "for" VariableDefinition? ";" Expression ";" ...
# and ``VariableDefinition`` itself already ends in ";", so a loop with an
# initialiser needs two consecutive semicolons as written. Confirm whether
# that is intended before relying on it.
grammar = TokenGrammar(
    r"""
HeaderFile = HeaderFileElement+
HeaderFileElement = IncludeStatement /
                    TypeDeclaration /
                    FunctionDeclaration

ImplementationFile = ImplementationFileElement+
ImplementationFileElement = HeaderFileElement /
                            FunctionDefinition

IncludeStatement = "include" string_literal ";"

TypeDeclaration = StructDeclaration /
                  EnumDeclaration /
                  TypedefDeclaration
StructDeclaration = "struct" identifier "{" VariableDeclaration+ "}" ";"
EnumDeclaration = "enum" identifier "{" EnumBody "}" ";"
EnumBody = (identifier ("=" Expression)? "," EnumBody) /
           (identifier ("=" Expression)? ","?)
TypedefDeclaration = "typedef" identifier "=" Type ";"

FunctionDeclaration = FunctionSignature ";"
FunctionDefinition = FunctionSignature Block
FunctionSignature = Type identifier "(" SignatureArguments? ")"
SignatureArguments = (Type identifier "," SignatureArguments) /
                     (Type identifier ","?)

Block = "{" Statement* "}"

Statement = VariableDefinition /
            VariableDeclaration /
            IfStatement /
            SwitchStatement /
            WhileStatement /
            DoWhileStatement /
            ForStatement /
            FlowControlStatement /
            AssignmentStatement /
            ExpressionStatement

VariableDefinition = Type identifier "=" Expression ";"
VariableDeclaration = Type identifier ";"

IfStatement = ("if" Expression Block "else" Block) /
              ("if" Expression Block)

SwitchStatement = "switch" Expression "{" SwitchCase+ "}"
SwitchCase = (CaseSpecifier Block) /
             ("default" Block)
CaseSpecifier = ("case" Expression "," CaseSpecifier) /
                ("case" Expression ","?)

WhileStatement = "while" Expression Block
DoWhileStatement = "do" Block "while" Expression ";"
ForStatement = "for" VariableDefinition? ";" Expression ";" AssignmentStatementBody? Block

FlowControlStatement = ("continue" ";") /
                       ("break" ";") /
                       ("return" Expression? ";")

AssignmentStatement = AssignmentStatementBody ";"
AssignmentStatementBody = (AssignmentTargetExpression "=" Expression) /
                          (AssignmentTargetExpression "+=" Expression) /
                          (AssignmentTargetExpression "-=" Expression) /
                          (AssignmentTargetExpression "*=" Expression) /
                          (AssignmentTargetExpression "/=" Expression) /
                          (AssignmentTargetExpression "%=" Expression) /
                          (AssignmentTargetExpression "&=" Expression) /
                          (AssignmentTargetExpression "^=" Expression) /
                          (AssignmentTargetExpression "|=" Expression) /
                          (AssignmentTargetExpression "++") /
                          (AssignmentTargetExpression "--")

ExpressionStatement = Expression ";"

Type = ("const" BasicType) /
       (BasicType "*") /
       (BasicType "[" Expression "]") /
       (BasicType "function" "(" (BasicType ",")* ")") /
       BasicType
BasicType = "void" /
            IntegerType /
            ("signed" IntegerType) /
            ("unsigned" IntegerType) /
            "float" /
            "double" /
            "bool" /
            ("struct" identifier) /
            ("enum" identifier) /
            ("typedef" identifier) /
            ("(" Type ")")
IntegerType = "char" /
              "short" /
              "int" /
              "long"

AssignmentTargetExpression = identifier ATEElementSuffix*
ATEElementSuffix = ("[" Expression "]") /
                   ("." identifier) /
                   ("->" identifier)

AtomicExpression = identifier /
                   constant /
                   string_literal /
                   ("(" Expression ")")

ObjectExpression = (AtomicExpression ObjectSuffix*) /
                   ArrayLiteralExpression /
                   StructLiteralExpression
ObjectSuffix = ("[" Expression "]") /
               ("(" CommasExpressionList? ")") /
               ("." identifier) /
               ("->" identifier)
CommasExpressionList = (Expression "," CommasExpressionList) /
                       (Expression ","?)
ArrayLiteralExpression = "{" CommasExpressionList "}"
StructLiteralExpression = "{" StructLiteralBody "}"
StructLiteralBody = (StructLiteralElement "," StructLiteralBody?) /
                    (StructLiteralElement ","?)
StructLiteralElement = "." identifier "=" Expression

FactorExpression = ("(" Type ")" FactorExpression) /
                   ("&" FactorExpression) /
                   ("*" FactorExpression) /
                   ("+" FactorExpression) /
                   ("-" FactorExpression) /
                   ("~" FactorExpression) /
                   ("!" FactorExpression) /
                   ("sizeof" FactorExpression) /
                   ("sizeof" Type) /
                   ObjectExpression

TermExpression = FactorExpression TermSuffix*
TermSuffix = ("*" FactorExpression) /
             ("/" FactorExpression) /
             ("%" FactorExpression)

ArithmeticExpression = TermExpression ArithmeticSuffix*
ArithmeticSuffix = ("+" TermExpression) /
                   ("-" TermExpression)

BitwiseOpExpression = (ArithmeticExpression "<<" ArithmeticExpression) /
                      (ArithmeticExpression ">>" ArithmeticExpression) /
                      (ArithmeticExpression "^" ArithmeticExpression) /
                      (ArithmeticExpression ("&" ArithmeticExpression)+) /
                      (ArithmeticExpression ("|" ArithmeticExpression)+) /
                      ArithmeticExpression

ComparisonExpression = (BitwiseOpExpression "==" BitwiseOpExpression) /
                       (BitwiseOpExpression "!=" BitwiseOpExpression) /
                       (BitwiseOpExpression "<=" BitwiseOpExpression) /
                       (BitwiseOpExpression ">=" BitwiseOpExpression) /
                       (BitwiseOpExpression "<" BitwiseOpExpression) /
                       (BitwiseOpExpression ">" BitwiseOpExpression) /
                       BitwiseOpExpression

Expression = (ComparisonExpression ("&&" ComparisonExpression)+) /
             (ComparisonExpression ("||" ComparisonExpression)+) /
             ComparisonExpression

identifier = "identifier"
constant = "constant"
string_literal = "string_literal"
""")


class LegibleParseError(ParseError):
    """A ``ParseError`` whose position helpers return a placeholder.

    ``line()`` and ``column()`` both return a shrug emoji instead of a number.
    Presumably this is because the parser is fed a token list rather than
    text, so there is no line/column to compute -- confirm against
    parsimonious' own ``ParseError`` implementation.
    """

    def line(self):
        """Return a placeholder; no line number is available."""
        return "🤷"

    def column(self):
        """Return a placeholder; no column number is available."""
        return "🤷"


class LegibleIncompleteParseError(IncompleteParseError):
    """An ``IncompleteParseError`` with the same placeholder position helpers
    as ``LegibleParseError``."""

    def line(self):
        """Return a placeholder; no line number is available."""
        return "🤷"

    def column(self):
        """Return a placeholder; no column number is available."""
        return "🤷"


def parse_from_rule(rule, tokens):
    """Parse ``tokens`` with ``rule`` and translate parsimonious errors.

    Args:
        rule: A grammar rule exposing ``parse`` (e.g. ``grammar['HeaderFile']``).
        tokens: The token sequence to parse.

    Returns:
        Whatever ``rule.parse(tokens)`` returns.

    Raises:
        LegibleIncompleteParseError: If ``rule.parse`` raised
            ``IncompleteParseError``.
        LegibleParseError: If ``rule.parse`` raised any other ``ParseError``.
    """
    try:
        return rule.parse(tokens)
    # IncompleteParseError is handled first so it is not swallowed by the more
    # general ParseError handler below.
    except IncompleteParseError as error:
        # ``from error`` records the original exception as the explicit cause.
        raise LegibleIncompleteParseError(error.text, error.pos, error.expr) from error
    except ParseError as error:
        raise LegibleParseError(error.text, error.pos, error.expr) from error
# ---- crowbar_reference_compiler/parser.py (continued) ----


def parse_header(tokens):
    """Parse ``tokens`` using the grammar's ``HeaderFile`` rule.

    Delegates to ``parse_from_rule``; see that function for the exceptions
    that may be raised.
    """
    return parse_from_rule(grammar['HeaderFile'], tokens)


def parse_implementation(tokens):
    """Parse ``tokens`` using the grammar's ``ImplementationFile`` rule.

    Delegates to ``parse_from_rule``; see that function for the exceptions
    that may be raised.
    """
    return parse_from_rule(grammar['ImplementationFile'], tokens)


# ---- crowbar_reference_compiler/scanner.py (new file, fff8c35) ----
from dataclasses import dataclass
from typing import Optional, overload, List, Union

import regex as re  # type: ignore


@dataclass
class Token:
    """A single lexical token produced by ``scan``.

    For keywords and punctuators ``type`` is the matched text itself and
    ``data`` is ``None``; for identifiers, constants and string literals
    ``type`` is the category name and ``data`` is the matched text.
    """

    type: str
    data: Optional[str] = None

    def __repr__(self) -> str:
        if self.data is not None:
            return f"{self.type}: {self.data!r}"
        return repr(self.type)


class GenerousTokenList(List[Token]):
    """A token list that never raises ``IndexError`` on element access.

    Indexing out of range returns an empty-typed ``Token('')`` instead.
    Presumably this lets the parser look past the end of the input without
    special-casing it -- confirm against how the grammar consumes tokens.
    """

    def __getitem__(self, i):
        try:
            return super().__getitem__(i)
        except IndexError:
            return Token('')


# A keyword only counts when it is not immediately followed by another
# identifier character. Without the lookahead, ordered alternation turned
# ``double`` into ``do`` + ``uble`` and ``integer`` into ``int`` + ``eger``.
KEYWORD = re.compile(
    r"(?:bool|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|function|if|include|int|long|return|short|signed|sizeof|struct|switch|typedef|unsigned|void|while)"
    r"(?![\p{L}\p{Pc}\p{Cf}\p{Sk}\p{Mn}\p{N}])")
IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Cf}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Cf}\p{Sk}\p{Mn}\p{N}]*")
# Alternatives run from most to least specific. With the plain decimal form
# first, ``0x1F`` scanned as ``0`` followed by identifier ``x1F`` and ``1.5``
# as ``1`` ``.`` ``5``. The exponent-bearing float form is likewise tried
# before the bare fractional form so ``1.5e3`` is consumed whole.
CONSTANT = re.compile(r"""(0[bB][01_]+)|(0[xX][0-9a-fA-F_]+)|([0-9_]+(\.[0-9_]+[eE][0-9_]+|\.[0-9_]+|[eE][0-9_]+))|([0-9_]+)|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')""")
STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''')
PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]")
WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+")
# A line comment no longer has to end in "\n", so one on the last line of the
# input is still recognised. The newline itself is then skipped as whitespace.
COMMENT = re.compile(r"(//[^\n]*)|(/\*.*?\*/)", re.DOTALL)

# Patterns whose matches are discarded, in the order they are tried.
_SKIPPED_PATTERNS = (COMMENT, WHITESPACE)
# (pattern, token type) pairs in priority order. A type of ``None`` means the
# matched text itself is used as the token type (keywords and punctuators).
_TOKEN_PATTERNS = (
    (KEYWORD, None),
    (IDENTIFIER, 'identifier'),
    (CONSTANT, 'constant'),
    (STRING_LITERAL, 'string_literal'),
    (PUNCTUATOR, None),
)


def _skip_ignored(code, pos):
    """Return the offset after a comment/whitespace run at ``pos``, or ``pos``."""
    for pattern in _SKIPPED_PATTERNS:
        match = pattern.match(code, pos)
        if match is not None:
            return match.end()
    return pos


def scan(code):
    """Split source text into a ``GenerousTokenList`` of ``Token`` objects.

    Comments and whitespace are dropped. At each position the patterns are
    tried in a fixed order: keyword, identifier, constant, string literal,
    punctuator.

    Args:
        code: The source text to tokenise.

    Returns:
        A ``GenerousTokenList`` of the tokens in source order.

    Raises:
        ValueError: If no pattern matches at some position; the message
            includes up to 20 characters of the offending input.
    """
    tokens = []
    pos = 0
    length = len(code)

    # Match at an offset rather than re-slicing the remaining text after every
    # token, which copied the tail of the input each time.
    while pos < length:
        after_ignored = _skip_ignored(code, pos)
        if after_ignored != pos:
            pos = after_ignored
            continue

        for pattern, token_type in _TOKEN_PATTERNS:
            match = pattern.match(code, pos)
            if match is None:
                continue
            text = match.group()
            if token_type is None:
                tokens.append(Token(text))
            else:
                tokens.append(Token(token_type, text))
            pos = match.end()
            break
        else:
            raise ValueError("unrecognized code in scanner: {}".format(repr(code[pos:pos + 20])))

    return GenerousTokenList(tokens)

# -- cgit v1.2.3