From 5af481d62df80d8be3f5835042d30372ef9cbe04 Mon Sep 17 00:00:00 2001 From: Melody Horn Date: Sat, 31 Oct 2020 21:59:00 -0600 Subject: define and annotate some language elements --- _ext/crowbar_domain.py | 91 +++++++++++++++----- index.rst | 4 +- language/flow-control.rst | 6 -- language/include.rst | 7 ++ language/index.rst | 18 ++-- language/scanning.rst | 6 +- language/source-file.rst | 17 ++++ language/type-definition.rst | 23 +++++ syntax.md | 196 ------------------------------------------- 9 files changed, 127 insertions(+), 241 deletions(-) delete mode 100644 language/flow-control.rst create mode 100644 language/include.rst create mode 100644 language/source-file.rst create mode 100644 language/type-definition.rst diff --git a/_ext/crowbar_domain.py b/_ext/crowbar_domain.py index a5f2570..db1330e 100644 --- a/_ext/crowbar_domain.py +++ b/_ext/crowbar_domain.py @@ -1,5 +1,7 @@ from collections import defaultdict +import re +from docutils import nodes from docutils.parsers.rst import directives from sphinx import addnodes @@ -7,41 +9,74 @@ from sphinx.directives import ObjectDescription from sphinx.domains import Domain from sphinx.domains import Index from sphinx.roles import XRefRole +from sphinx.util import logging from sphinx.util.nodes import make_refnode +logger = logging.getLogger(__name__) -class KeywordDirective(ObjectDescription): +peg_token = re.compile(r"\s+|\+|\*|\?|\(|\)|'[^']+'|[\w\-]+") +def tokenize_peg(peg_def): + for token in peg_token.finditer(peg_def): + yield token[0] + +class ElementDirective(ObjectDescription): has_content = True required_arguments = 1 + option_spec = { + 'contains': directives.unchanged_required, + } def handle_signature(self, sig, signode): - signode += addnodes.desc_name(text=sig) - return sig - - def add_target_and_index(self, name_cls, sig, signode): - signode['ids'].append('keyword' + '-' + sig) + element, defs = sig.split(' <- ') + peg_rule = nodes.literal() + signode += peg_rule + peg_rule += 
addnodes.desc_name(text=element) + peg_rule += nodes.Text(' ') + peg_rule += addnodes.desc_sig_operator(text='<-') + defs = defs.split(' / ') + for d in defs: + if peg_rule.children[-1].astext() != '<-': + peg_rule += [nodes.Text(' '), addnodes.desc_sig_operator(text='/')] + peg_rule += nodes.Text(' ') + for token in tokenize_peg(d): + if re.fullmatch(r"[a-z][\w\-]+", token): + refattrs = dict(refdomain='std', refexplicit=False, reftarget=token.replace('-', ' '), reftype='term', refwarn=True) + reference = addnodes.pending_xref('', nodes.literal(text=token, classes=['xref', 'std', 'std-term']), **refattrs) + peg_rule += addnodes.desc_sig_name('', '', reference) + elif re.fullmatch(r'[A-Z]\w+', token): + refattrs = dict(refdomain='crowbar', refexplicit=False, reftarget=token, reftype='ref', refwarn=True) + reference = addnodes.pending_xref('', nodes.literal(text=token, classes=['xref', 'crowbar', 'crowbar-ref']), **refattrs) + peg_rule += addnodes.desc_sig_name('', '', reference) + elif len(token.strip()) == 0 or (token.startswith("'") and token.endswith("'")): + peg_rule += nodes.literal(text=token) + else: + peg_rule += addnodes.desc_sig_operator('', '', nodes.literal(text=token)) + return element + + def add_target_and_index(self, name, sig, signode): + signode['ids'].append('element' + '-' + name) if 'noindex' not in self.options: crowbar = self.env.get_domain('crowbar') - crowbar.add_keyword(sig) + crowbar.add_element(name) -class KeywordIndex(Index): - name = 'keyword' - localname = 'Keyword Index' - shortname = 'Keyword' +class ElementIndex(Index): + name = 'element' + localname = 'Element Index' + shortname = 'Element' def generate(self, docnames=None): content = defaultdict(list) - # sort the list of recipes in alphabetical order - recipes = self.domain.get_objects() - recipes = sorted(recipes, key=lambda recipe: recipe[0]) + # sort the list of elements in alphabetical order + elements = self.domain.get_elements() + elements = sorted(elements, key=lambda 
recipe: recipe[0]) # generate the expected output, shown below, from the above using the # first letter of the recipe as a key to group thing # # name, subtype, docname, anchor, extra, qualifier, description - for name, dispname, typ, docname, anchor, _ in recipes: + for name, dispname, typ, docname, anchor, _ in elements: content[dispname[0].lower()].append( (dispname, 0, docname, anchor, docname, '', typ)) @@ -55,25 +90,28 @@ class CrowbarDomain(Domain): name = 'crowbar' label = 'Crowbar' roles = { - 'ref': XRefRole() + 'ref': XRefRole(warn_dangling=True), } directives = { - 'keyword': KeywordDirective, + 'element': ElementDirective, } indices = { - KeywordIndex, + ElementIndex, } initial_data = { - 'keywords': [], + 'elements': [], } def get_full_qualified_name(self, node): - return '{}.{}'.format('keyword', node.arguments[0]) + return '{}.{}'.format('element', node.arguments[0]) - def get_objects(self): - for obj in self.data['keywords']: + def get_elements(self): + for obj in self.data['elements']: yield(obj) + def get_objects(self): + yield from self.get_elements() + def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode): match = [(docname, anchor) @@ -99,6 +137,15 @@ class CrowbarDomain(Domain): self.data['keywords'].append( (name, signature, 'Keyword', self.env.docname, anchor, 0)) + def add_element(self, signature): + """Add a new element to the domain.""" + name = '{}.{}'.format('element', signature) + anchor = 'element-{}'.format(signature) + + # name, dispname, type, docname, anchor, priority + self.data['elements'].append( + (name, signature, 'Element', self.env.docname, anchor, 0)) + def setup(app): app.add_domain(CrowbarDomain) diff --git a/index.rst b/index.rst index 99f5d44..8b310af 100644 --- a/index.rst +++ b/index.rst @@ -12,7 +12,7 @@ Crowbar is a language that is derived from (and, wherever possible, interoperabl Ideally, a typical C codebase should be straightforward to rewrite in Crowbar, and any atypical C 
constructions not supported by Crowbar can be left as C. -This site hosts the Crowbar specification. +This site hosts the Crowbar specification at https://crowbar-lang.org and at ``_. Additional resources you may be interested in: * `sr.ht project hub`_ @@ -67,5 +67,5 @@ Chapters ================== * :ref:`genindex` - * :ref:`crowbar-keyword` + * :ref:`crowbar-element` * :ref:`search` diff --git a/language/flow-control.rst b/language/flow-control.rst deleted file mode 100644 index 853b5bd..0000000 --- a/language/flow-control.rst +++ /dev/null @@ -1,6 +0,0 @@ -Flow Control -============ - -.. crowbar:keyword:: break - - This keyword exits the containing loop. diff --git a/language/include.rst b/language/include.rst new file mode 100644 index 0000000..cc2964a --- /dev/null +++ b/language/include.rst @@ -0,0 +1,7 @@ +Including Headers +----------------- + +.. crowbar:element:: IncludeStatement <- 'include' string-literal ';' + + When encountering this statement at the beginning of a file, the compiler should interpret the string literal as a relative file path, look up the corresponding file in an implementation-defined way, and load the definitions from the given :crowbar:ref:`HeaderFile`. + This statement has no runtime effect. diff --git a/language/index.rst b/language/index.rst index 79702c5..eb2d92c 100644 --- a/language/index.rst +++ b/language/index.rst @@ -4,23 +4,17 @@ Language The syntax of Crowbar is designed to be similar to the syntax of C. A Crowbar source file is UTF-8. +Unless otherwise specified, a *character* in this specification refers to a `Unicode scalar value <https://www.unicode.org/glossary/#unicode_scalar_value>`_. Crowbar source files can come in two varieties: -.. glossary:: - - header file - A Crowbar source file declaring types and functions. - Can be intended for internal use within a project, or to define the public API of a library. - Conventionally has the ``.hro`` file extension. 
- - implementation file - A Crowbar source file providing function definitions, and sometimes its own type declarations. - Conventionally has the ``.cro`` file extension. - A Crowbar source file is read into memory in two phases: *scanning* (which converts text into an unstructured sequence of tokens) and *parsing* (which converts an unstructured sequence of tokens into a parse tree). +Syntax elements in this document are given in the form of `parsing expression grammar <https://en.wikipedia.org/wiki/Parsing_expression_grammar>`_ rules. + .. toctree:: :maxdepth: 1 scanning - flow-control + source-file + include + type-definition diff --git a/language/scanning.rst b/language/scanning.rst index 86177ac..ed85ead 100644 --- a/language/scanning.rst +++ b/language/scanning.rst @@ -11,10 +11,10 @@ Scanning Punctuators, string literals, and character constants do not require explicit separation from adjacent tokens. keyword - One of the literal words ``bool``, :crowbar:ref:`break`, ``case``, + One of the literal words ``bool``, ``break``, ``case``, ``char``, ``const``, ``continue``, ``default``, ``do``, ``double``, ``else``, ``enum``, ``extern``, ``float``, ``for``, ``fragile``, - ``function``, ``if``, ``include``, ``int``, ``long``, ``return``, + ``function``, ``if``, :crowbar:ref:`include <IncludeStatement>`, ``int``, ``long``, ``return``, ``short``, ``signed``, ``sizeof``, ``struct``, ``switch``, ``unsigned``, ``void``, or ``while``. @@ -67,7 +67,7 @@ Scanning character constant A pair of single quotes ``'`` surrounding either a single character or an :term:`escape sequence`. The single character may not be a single quote or a backslash ``\``. - Denotes the Unicode code point number for either the single surrounded character or the character denoted by the escape sequence. + Denotes the Unicode scalar value for either the single surrounded character or the character denoted by the escape sequence. 
escape sequence One of the following pairs of characters: diff --git a/language/source-file.rst b/language/source-file.rst new file mode 100644 index 0000000..6522ea8 --- /dev/null +++ b/language/source-file.rst @@ -0,0 +1,17 @@ +Source Files +------------ + +.. crowbar:element:: HeaderFile <- IncludeStatement* HeaderFileElement+ +.. crowbar:element:: HeaderFileElement <- TypeDefinition / FunctionDeclaration / ConstantDefinition / UninitializedVariableDeclaration + + A Crowbar header file defines an API boundary, either at the surface of a library or between pieces of a library or application. + :crowbar:ref:`IncludeStatement`\ s can only appear at the beginning of the header file, and header files cannot define behavior directly. + Conventionally, a header file has a ``.hro`` file extension. + +.. crowbar:element:: ImplementationFile <- IncludeStatement* ImplementationFileElement+ +.. crowbar:element:: ImplementationFileElement <- TypeDefinition / VariableDefinition / FunctionDefinition + + A Crowbar implementation file defines the actual behavior of some piece of a library or application. + It can also define internal types, functions, and variables. + :crowbar:ref:`IncludeStatement`\ s can only appear at the beginning of the implementation file. + Conventionally, an implementation file has a ``.cro`` file extension. diff --git a/language/type-definition.rst b/language/type-definition.rst new file mode 100644 index 0000000..02616b8 --- /dev/null +++ b/language/type-definition.rst @@ -0,0 +1,23 @@ +Defining Types +-------------- + +.. crowbar:element:: TypeDefinition <- StructDefinition / EnumDefinition / UnionDefinition + + Crowbar has three different kinds of user-defined types. + +.. crowbar:element:: StructDefinition <- 'struct' identifier '{' VariableDeclaration+ '}' ';' + + A ``struct`` defines a composite type with several members. + + .. todo:: + + define struct layout in memory + +.. 
crowbar:element:: EnumDefinition <- 'enum' identifier '{' EnumMember (',' EnumMember)* ','? '}' ';' + EnumMember <- identifier ('=' Expression)? + + An ``enum`` defines a type which can take one of several specified values. + + .. todo:: + + define enum value assignment, type-related behavior diff --git a/syntax.md b/syntax.md index 80fa54b..96f0b88 100644 --- a/syntax.md +++ b/syntax.md @@ -1,204 +1,8 @@ # Syntax (old) -The syntax of Crowbar mostly matches the syntax of C, with fewer obscure/advanced/edge case features. - -## Source Files - -A Crowbar source file is UTF-8. -Crowbar source files can come in two varieties, an *implementation file* and a *header file*. -An implementation file conventionally has a `.cro` extension, and a header file conventionally has a `.hro` extension. - -A Crowbar source file is read into memory in two phases: *scanning* (which converts text into an unstructured sequence of tokens) and *parsing* (which converts an unstructured sequence of tokens into a parse tree). - -## Scanning - -A *token* is one of the following kinds of token: - -- a *keyword*, -- an *identifier*, -- a *constant*, -- a *string literal*, -- or a *punctuator*. - -Tokens are separated by either *whitespace* or a *comment*. - -### Keywords - -A *keyword* is one of the following literal words: - -- `bool` -- `break` -- `case` -- `char` -- `const` -- `continue` -- `default` -- `do` -- `double` -- `else` -- `enum` -- `extern` -- `float` -- `for` -- `fragile` -- `function` -- `if` -- `include` -- `int` -- `long` -- `return` -- `short` -- `signed` -- `sizeof` -- `struct` -- `switch` -- `unsigned` -- `void` -- `while` - -### Identifiers - -An *identifier* is a sequence of one or more characters having Unicode categories within a legal set. - -The first character in an identifier must have one of the following Unicode categories: - -- `Pc` Connector Punctuation (e.g. `_`) -- `Ll` Lowercase Letter (e.g. `h`) -- `Lm` Modifier Letter (e.g. 
`ʹ`, U+02B9 Modifier Letter Prime) -- `Lo` Other Letter (e.g. `א`, U+05D0 Hebrew Letter Alef) -- `Lt` Titlecase Letter (e.g. `Dž`, U+01C5 Latin Capital Letter D With Small Letter Z With Caron) -- `Lu` Uppercase Letter (e.g. `B`) -- `Mn` Nonspacing Mark (e.g. ` ̂`, U+0302 Combining Circumflex Accent) -- `Sk` Modifier Symbol (e.g. `^`, U+005E Circumflex Accent) - -Subsequent characters may have any of the above-listed Unicode categories, or one of the following: - -- `Nd` Decimal Digit Number (e.g. `0`) -- `Nl` Letter Number (e.g. `Ⅳ`, U+2163 Roman Numeral Four) -- `No` Other Number (e.g. `¼`, U+00BC Vulgar Fraction One Quarter) - -### Constants - -A *constant* can have one of six types: - -- a *decimal constant*, a sequence of characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `_`}; -- a *binary constant*, a prefix (either `0b` or `0B`) followed by a sequence of characters drawn from the set {`0`, `1`, `_`}; -- an *octal constant*, the prefix `0o` followed by a sequence of characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `_`}; -- a *hexadecimal constant*, a prefix (either `0x` or `0X`) followed by a sequence of characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`, `_`}; -- a *floating-point constant*, a decimal constant followed by one of - - `.` followed by a decimal constant, - - either `e` or `E` followed by a decimal constant, - - or a `.` followed by a decimal constant followed by either an `e` or `E` followed by a decimal constant; -- or a *character constant*, a `'` followed by either a single character or an *escape sequence* followed by another `'`. 
- -#### Escape Sequences - -The following sequences of characters are *escape sequences*: - -- `\'` -- `\"` -- `\\` -- `\r` -- `\n` -- `\t` -- `\0` -- `\x` followed by two characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`} -- `\u` followed by four characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`} -- `\U` followed by eight characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`} - -### String Literals - -A *string literal* begins with a `"`. -It then contains a sequence where each element is either an escape sequence or a character that is neither `"` nor `\`. -It then ends with a `"`. - -### Punctuators - -The following sequences of characters form *punctuators*: - -- `[` -- `]` -- `(` -- `)` -- `{` -- `}` -- `.` -- `,` -- `+` -- `-` -- `*` -- `/` -- `%` -- `;` -- `!` -- `&` -- `|` -- `^` -- `~` -- `>` -- `<` -- `=` -- `->` -- `++` -- `--` -- `>>` -- `<<` -- `<=` -- `>=` -- `==` -- `!=` -- `&&` -- `||` -- `+=` -- `-=` -- `*=` -- `/=` -- `%=` -- `&=` -- `|=` -- `^=` - -### Whitespace - -A nonempty sequence of characters is considered to be *whitespace* if each character in it has a Unicode class of either Space Separator or Control Other. - -### Comments - -A *comment* can be either a *line comment* or a *block comment*. - -A *line comment* begins with the characters `//` if they occur outside of a string literal or comment, and ends with a newline character U+000A. - -A *block comment* begins with the characters `/*` if they occur outside of a string literal or comment, and ends with the characters `*/`. 
- -## Parsing - -The syntax of Crowbar is given as a [parsing expression grammar](https://en.wikipedia.org/wiki/Parsing_expression_grammar): - -### Entry points - -```PEG -HeaderFile ← HeaderFileElement+ -HeaderFileElement ← IncludeStatement / - TypeDeclaration / - FunctionDeclaration - -ImplementationFile ← ImplementationFileElement+ -ImplementationFileElement ← HeaderFileElement / - FunctionDefinition -``` - ### Top-level elements ```PEG -IncludeStatement ← 'include' string-literal ';' - -TypeDeclaration ← StructDeclaration / - EnumDeclaration -StructDeclaration ← 'struct' identifier '{' VariableDeclaration+ '}' ';' -EnumDeclaration ← 'enum' identifier '{' EnumBody '}' ';' -EnumBody ← identifier ('=' Expression)? ',' EnumBody / - identifier ('=' Expression)? ','? - FunctionDeclaration ← FunctionSignature ';' FunctionDefinition ← FunctionSignature Block FunctionSignature ← Type identifier '(' SignatureArguments? ')' -- cgit v1.2.3