From 5af481d62df80d8be3f5835042d30372ef9cbe04 Mon Sep 17 00:00:00 2001
From: Melody Horn <melody@boringcactus.com>
Date: Sat, 31 Oct 2020 21:59:00 -0600
Subject: define and annotate some language elements

---
 _ext/crowbar_domain.py       |  91 +++++++++++++++-----
 index.rst                    |   4 +-
 language/flow-control.rst    |   6 --
 language/include.rst         |   7 ++
 language/index.rst           |  18 ++--
 language/scanning.rst        |   6 +-
 language/source-file.rst     |  17 ++++
 language/type-definition.rst |  23 +++++
 syntax.md                    | 196 -------------------------------------------
 9 files changed, 127 insertions(+), 241 deletions(-)
 delete mode 100644 language/flow-control.rst
 create mode 100644 language/include.rst
 create mode 100644 language/source-file.rst
 create mode 100644 language/type-definition.rst

diff --git a/_ext/crowbar_domain.py b/_ext/crowbar_domain.py
index a5f2570..db1330e 100644
--- a/_ext/crowbar_domain.py
+++ b/_ext/crowbar_domain.py
@@ -1,5 +1,7 @@
 from collections import defaultdict
+import re
 
+from docutils import nodes
 from docutils.parsers.rst import directives
 
 from sphinx import addnodes
@@ -7,41 +9,74 @@ from sphinx.directives import ObjectDescription
 from sphinx.domains import Domain
 from sphinx.domains import Index
 from sphinx.roles import XRefRole
+from sphinx.util import logging
 from sphinx.util.nodes import make_refnode
 
+logger = logging.getLogger(__name__)
 
-class KeywordDirective(ObjectDescription):
+peg_token = re.compile(r"\s+|\+|\*|\?|\(|\)|'[^']+'|[\w\-]+")
+def tokenize_peg(peg_def):
+    for token in peg_token.finditer(peg_def):
+        yield token[0]
+
+class ElementDirective(ObjectDescription):
     has_content = True
     required_arguments = 1
+    option_spec = {
+        'contains': directives.unchanged_required,
+    }
 
     def handle_signature(self, sig, signode):
-        signode += addnodes.desc_name(text=sig)
-        return sig
-
-    def add_target_and_index(self, name_cls, sig, signode):
-        signode['ids'].append('keyword' + '-' + sig)
+        element, defs = sig.split(' <- ')
+        peg_rule = nodes.literal()
+        signode += peg_rule
+        peg_rule += addnodes.desc_name(text=element)
+        peg_rule += nodes.Text(' ')
+        peg_rule += addnodes.desc_sig_operator(text='<-')
+        defs = defs.split(' / ')
+        for d in defs:
+            if peg_rule.children[-1].astext() != '<-':
+                peg_rule += [nodes.Text(' '), addnodes.desc_sig_operator(text='/')]
+            peg_rule += nodes.Text(' ')
+            for token in tokenize_peg(d):
+                if re.fullmatch(r"[a-z][\w\-]+", token):
+                    refattrs = dict(refdomain='std', refexplicit=False, reftarget=token.replace('-', ' '), reftype='term', refwarn=True)
+                    reference = addnodes.pending_xref('', nodes.literal(text=token, classes=['xref', 'std', 'std-term']), **refattrs)
+                    peg_rule += addnodes.desc_sig_name('', '', reference)
+                elif re.fullmatch(r'[A-Z]\w+', token):
+                    refattrs = dict(refdomain='crowbar', refexplicit=False, reftarget=token, reftype='ref', refwarn=True)
+                    reference = addnodes.pending_xref('', nodes.literal(text=token, classes=['xref', 'crowbar', 'crowbar-ref']), **refattrs)
+                    peg_rule += addnodes.desc_sig_name('', '', reference)
+                elif len(token.strip()) == 0 or (token.startswith("'") and token.endswith("'")):
+                    peg_rule += nodes.literal(text=token)
+                else:
+                    peg_rule += addnodes.desc_sig_operator('', '', nodes.literal(text=token))
+        return element
+    
+    def add_target_and_index(self, name, sig, signode):
+        signode['ids'].append('element' + '-' + name)
         if 'noindex' not in self.options:
             crowbar = self.env.get_domain('crowbar')
-            crowbar.add_keyword(sig)
+            crowbar.add_element(name)
 
 
-class KeywordIndex(Index):
-    name = 'keyword'
-    localname = 'Keyword Index'
-    shortname = 'Keyword'
+class ElementIndex(Index):
+    name = 'element'
+    localname = 'Element Index'
+    shortname = 'Element'
 
     def generate(self, docnames=None):
         content = defaultdict(list)
 
-        # sort the list of recipes in alphabetical order
-        recipes = self.domain.get_objects()
-        recipes = sorted(recipes, key=lambda recipe: recipe[0])
+        # sort the list of elements in alphabetical order
+        elements = self.domain.get_elements()
+        elements = sorted(elements, key=lambda recipe: recipe[0])
 
         # generate the expected output, shown below, from the above using the
         # first letter of the recipe as a key to group thing
         #
         # name, subtype, docname, anchor, extra, qualifier, description
-        for name, dispname, typ, docname, anchor, _ in recipes:
+        for name, dispname, typ, docname, anchor, _ in elements:
             content[dispname[0].lower()].append(
                 (dispname, 0, docname, anchor, docname, '', typ))
 
@@ -55,25 +90,28 @@ class CrowbarDomain(Domain):
     name = 'crowbar'
     label = 'Crowbar'
     roles = {
-        'ref': XRefRole()
+        'ref': XRefRole(warn_dangling=True),
     }
     directives = {
-        'keyword': KeywordDirective,
+        'element': ElementDirective,
     }
     indices = {
-        KeywordIndex,
+        ElementIndex,
     }
     initial_data = {
-        'keywords': [],
+        'elements': [],
     }
 
     def get_full_qualified_name(self, node):
-        return '{}.{}'.format('keyword', node.arguments[0])
+        return '{}.{}'.format('element', node.arguments[0])
 
-    def get_objects(self):
-        for obj in self.data['keywords']:
+    def get_elements(self):
+        for obj in self.data['elements']:
             yield(obj)
 
+    def get_objects(self):
+        yield from self.get_elements()
+
     def resolve_xref(self, env, fromdocname, builder, typ, target, node,
                      contnode):
         match = [(docname, anchor)
@@ -99,6 +137,15 @@ class CrowbarDomain(Domain):
         self.data['keywords'].append(
             (name, signature, 'Keyword', self.env.docname, anchor, 0))
 
+    def add_element(self, signature):
+        """Add a new element to the domain."""
+        name = '{}.{}'.format('element', signature)
+        anchor = 'element-{}'.format(signature)
+
+        # name, dispname, type, docname, anchor, priority
+        self.data['elements'].append(
+            (name, signature, 'Element', self.env.docname, anchor, 0))
+
 
 def setup(app):
     app.add_domain(CrowbarDomain)
diff --git a/index.rst b/index.rst
index 99f5d44..8b310af 100644
--- a/index.rst
+++ b/index.rst
@@ -12,7 +12,7 @@ Crowbar is a language that is derived from (and, wherever possible, interoperabl
 
 Ideally, a typical C codebase should be straightforward to rewrite in Crowbar, and any atypical C constructions not supported by Crowbar can be left as C.
 
-This site hosts the Crowbar specification.
+This site, available at https://crowbar-lang.org and at `<gemini://crowbar-lang.org>`_, hosts the Crowbar specification.
 Additional resources you may be interested in:
 
 * `sr.ht project hub`_
@@ -67,5 +67,5 @@ Chapters
     ==================
 
     * :ref:`genindex`
-    * :ref:`crowbar-keyword`
+    * :ref:`crowbar-element`
     * :ref:`search`
diff --git a/language/flow-control.rst b/language/flow-control.rst
deleted file mode 100644
index 853b5bd..0000000
--- a/language/flow-control.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Flow Control
-============
-
-.. crowbar:keyword:: break
-
-    This keyword exits the containing loop.
diff --git a/language/include.rst b/language/include.rst
new file mode 100644
index 0000000..cc2964a
--- /dev/null
+++ b/language/include.rst
@@ -0,0 +1,7 @@
+Including Headers
+-----------------
+
+.. crowbar:element:: IncludeStatement <- 'include' string-literal ';'
+
+    When encountering this statement at the beginning of a file, the compiler should interpret the string literal as a relative file path, look up the corresponding file in an implementation-defined way, and load the definitions from the given :crowbar:ref:`HeaderFile`.
+    This statement has no runtime effect.
diff --git a/language/index.rst b/language/index.rst
index 79702c5..eb2d92c 100644
--- a/language/index.rst
+++ b/language/index.rst
@@ -4,23 +4,17 @@ Language
 The syntax of Crowbar is designed to be similar to the syntax of C.
 
 A Crowbar source file is UTF-8.
+Unless otherwise specified, a *character* in this specification refers to a `Unicode scalar value <https://www.unicode.org/glossary/#unicode_scalar_value>`_.
 Crowbar source files can come in two varieties:
 
-.. glossary::
-
-    header file
-        A Crowbar source file declaring types and functions.
-        Can be intended for internal use within a project, or to define the public API of a library.
-        Conventionally has the ``.hro`` file extension.
-
-    implementation file
-        A Crowbar source file providing function definitions, and sometimes its own type declarations.
-        Conventionally has the ``.cro`` file extension.
-
 A Crowbar source file is read into memory in two phases: *scanning* (which converts text into an unstructured sequence of tokens) and *parsing* (which converts an unstructured sequence of tokens into a parse tree).
 
+Syntax elements in this document are given in the form of `parsing expression grammar <https://en.wikipedia.org/wiki/Parsing_expression_grammar>`_ rules.
+
 ..  toctree::
     :maxdepth: 1
     
     scanning
-    flow-control
+    source-file
+    include
+    type-definition
diff --git a/language/scanning.rst b/language/scanning.rst
index 86177ac..ed85ead 100644
--- a/language/scanning.rst
+++ b/language/scanning.rst
@@ -11,10 +11,10 @@ Scanning
         Punctuators, string literals, and character constants do not require explicit separation from adjacent tokens.
 
     keyword
-        One of the literal words ``bool``, :crowbar:ref:`break`, ``case``,
+        One of the literal words ``bool``, ``break``, ``case``,
         ``char``, ``const``, ``continue``, ``default``, ``do``, ``double``,
         ``else``, ``enum``, ``extern``, ``float``, ``for``, ``fragile``,
-        ``function``, ``if``, ``include``, ``int``, ``long``, ``return``,
+        ``function``, ``if``, :crowbar:ref:`include <IncludeStatement>`, ``int``, ``long``, ``return``,
         ``short``, ``signed``, ``sizeof``, ``struct``, ``switch``,
         ``unsigned``, ``void``, or ``while``.
     
@@ -67,7 +67,7 @@ Scanning
     character constant
         A pair of single quotes ``'`` surrounding either a single character or an :term:`escape sequence`.
         The single character may not be a single quote or a backslash ``\``.
-        Denotes the Unicode code point number for either the single surrounded character or the character denoted by the escape sequence.
+        Denotes the Unicode scalar value for either the single surrounded character or the character denoted by the escape sequence.
     
     escape sequence
         One of the following pairs of characters:
diff --git a/language/source-file.rst b/language/source-file.rst
new file mode 100644
index 0000000..6522ea8
--- /dev/null
+++ b/language/source-file.rst
@@ -0,0 +1,17 @@
+Source Files
+------------
+
+.. crowbar:element:: HeaderFile <- IncludeStatement* HeaderFileElement+
+.. crowbar:element:: HeaderFileElement <- TypeDefinition / FunctionDeclaration / ConstantDefinition / UninitializedVariableDeclaration
+
+    A Crowbar header file defines an API boundary, either at the surface of a library or between pieces of a library or application.
+    :crowbar:ref:`IncludeStatement`\ s can only appear at the beginning of the header file, and header files cannot define behavior directly.
+    Conventionally, a header file has a ``.hro`` file extension.
+
+.. crowbar:element:: ImplementationFile <- IncludeStatement* ImplementationFileElement+
+.. crowbar:element:: ImplementationFileElement <- TypeDefinition / VariableDefinition / FunctionDefinition
+
+    A Crowbar implementation file defines the actual behavior of some piece of a library or application.
+    It can also define internal types, functions, and variables.
+    :crowbar:ref:`IncludeStatement`\ s can only appear at the beginning of the implementation file.
+    Conventionally, an implementation file has a ``.cro`` file extension.
diff --git a/language/type-definition.rst b/language/type-definition.rst
new file mode 100644
index 0000000..02616b8
--- /dev/null
+++ b/language/type-definition.rst
@@ -0,0 +1,23 @@
+Defining Types
+--------------
+
+.. crowbar:element:: TypeDefinition <- StructDefinition / EnumDefinition / UnionDefinition
+
+    Crowbar has three different kinds of user-defined types.
+
+.. crowbar:element:: StructDefinition <- 'struct' identifier '{' VariableDeclaration+ '}' ';'
+
+    A ``struct`` defines a composite type with several members.
+
+    .. todo::
+
+        define struct layout in memory
+
+.. crowbar:element:: EnumDefinition <- 'enum' identifier '{' EnumMember (',' EnumMember)* ','? '}' ';'
+                     EnumMember <- identifier ('=' Expression)?
+
+    An ``enum`` defines a type which can take one of several specified values.
+
+    .. todo::
+        
+        define enum value assignment, type-related behavior
diff --git a/syntax.md b/syntax.md
index 80fa54b..96f0b88 100644
--- a/syntax.md
+++ b/syntax.md
@@ -1,204 +1,8 @@
 # Syntax (old)
 
-The syntax of Crowbar mostly matches the syntax of C, with fewer obscure/advanced/edge case features.
-
-## Source Files
-
-A Crowbar source file is UTF-8.
-Crowbar source files can come in two varieties, an *implementation file* and a *header file*.
-An implementation file conventionally has a `.cro` extension, and a header file conventionally has a `.hro` extension.
-
-A Crowbar source file is read into memory in two phases: *scanning* (which converts text into an unstructured sequence of tokens) and *parsing* (which converts an unstructured sequence of tokens into a parse tree).
-
-## Scanning
-
-A *token* is one of the following kinds of token:
-
-- a *keyword*,
-- an *identifier*,
-- a *constant*,
-- a *string literal*,
-- or a *punctuator*.
-
-Tokens are separated by either *whitespace* or a *comment*.
-
-### Keywords
-
-A *keyword* is one of the following literal words:
-
-- `bool`
-- `break`
-- `case`
-- `char`
-- `const`
-- `continue`
-- `default`
-- `do`
-- `double`
-- `else`
-- `enum`
-- `extern`
-- `float`
-- `for`
-- `fragile`
-- `function`
-- `if`
-- `include`
-- `int`
-- `long`
-- `return`
-- `short`
-- `signed`
-- `sizeof`
-- `struct`
-- `switch`
-- `unsigned`
-- `void`
-- `while`
-
-### Identifiers
-
-An *identifier* is a sequence of one or more characters having Unicode categories within a legal set.
-
-The first character in an identifier must have one of the following Unicode categories:
-
-- `Pc` Connector Punctuation (e.g. `_`)
-- `Ll` Lowercase Letter (e.g. `h`)
-- `Lm` Modifier Letter (e.g. `ʹ`, U+02B9 Modifier Letter Prime)
-- `Lo` Other Letter (e.g. `א`, U+05D0 Hebrew Letter Alef)
-- `Lt` Titlecase Letter (e.g. `ǅ`, U+01C5 Latin Capital Letter D With Small Letter Z With Caron)
-- `Lu` Uppercase Letter (e.g. `B`)
-- `Mn` Nonspacing Mark (e.g. ` ̂`, U+0302 Combining Circumflex Accent)
-- `Sk` Modifier Symbol (e.g. `^`, U+005E Circumflex Accent)
-
-Subsequent characters may have any of the above-listed Unicode categories, or one of the following:
-
-- `Nd` Decimal Digit Number (e.g. `0`)
-- `Nl` Letter Number (e.g. `Ⅳ`, U+2163 Roman Numeral Four)
-- `No` Other Number (e.g. `¼`, U+00BC Vulgar Fraction One Quarter)
-
-### Constants
-
-A *constant* can have one of six types:
-
-- a *decimal constant*, a sequence of characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `_`};
-- a *binary constant*, a prefix (either `0b` or `0B`) followed by a sequence of characters drawn from the set {`0`, `1`, `_`};
-- an *octal constant*, the prefix `0o` followed by a sequence of characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `_`};
-- a *hexadecimal constant*, a prefix (either `0x` or `0X`) followed by a sequence of characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`, `_`};
-- a *floating-point constant*, a decimal constant followed by one of
-    - `.` followed by a decimal constant,
-    - either `e` or `E` followed by a decimal constant,
-    - or a `.` followed by a decimal constant followed by either an `e` or `E` followed by a decimal constant;
-- or a *character constant*, a `'` followed by either a single character or an *escape sequence* followed by another `'`. 
-
-#### Escape Sequences
-
-The following sequences of characters are *escape sequences*:
-
-- `\'`
-- `\"`
-- `\\`
-- `\r`
-- `\n`
-- `\t`
-- `\0`
-- `\x` followed by two characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`}
-- `\u` followed by four characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`}
-- `\U` followed by eight characters drawn from the set {`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `A`, `a`, `B`, `b`, `C`, `c`, `D`, `d`, `E`, `e`, `F`, `f`}
-
-### String Literals
-
-A *string literal* begins with a `"`.
-It then contains a sequence where each element is either an escape sequence or a character that is neither `"` nor `\`.
-It then ends with a `"`.
-
-### Punctuators
-
-The following sequences of characters form *punctuators*:
-
-- `[`
-- `]`
-- `(`
-- `)`
-- `{`
-- `}`
-- `.`
-- `,`
-- `+`
-- `-`
-- `*`
-- `/`
-- `%`
-- `;`
-- `!`
-- `&`
-- `|`
-- `^`
-- `~`
-- `>`
-- `<`
-- `=`
-- `->`
-- `++`
-- `--`
-- `>>`
-- `<<`
-- `<=`
-- `>=`
-- `==`
-- `!=`
-- `&&`
-- `||`
-- `+=`
-- `-=`
-- `*=`
-- `/=`
-- `%=`
-- `&=`
-- `|=`
-- `^=`
-
-### Whitespace
-
-A nonempty sequence of characters is considered to be *whitespace* if each character in it has a Unicode class of either Space Separator or Control Other.
-
-### Comments
-
-A *comment* can be either a *line comment* or a *block comment*.
-
-A *line comment* begins with the characters `//` if they occur outside of a string literal or comment, and ends with a newline character U+000A.
-
-A *block comment* begins with the characters `/*` if they occur outside of a string literal or comment, and ends with the characters `*/`.
-
-## Parsing
-
-The syntax of Crowbar is given as a [parsing expression grammar](https://en.wikipedia.org/wiki/Parsing_expression_grammar):
-
-### Entry points
-
-```PEG
-HeaderFile                ← HeaderFileElement+
-HeaderFileElement         ← IncludeStatement /
-                            TypeDeclaration /
-                            FunctionDeclaration
-
-ImplementationFile        ← ImplementationFileElement+
-ImplementationFileElement ← HeaderFileElement /
-                            FunctionDefinition
-```
-
 ### Top-level elements
 
 ```PEG
-IncludeStatement    ← 'include' string-literal ';'
-
-TypeDeclaration     ← StructDeclaration /
-                      EnumDeclaration
-StructDeclaration   ← 'struct' identifier '{' VariableDeclaration+ '}' ';'
-EnumDeclaration     ← 'enum' identifier '{' EnumBody '}' ';'
-EnumBody            ← identifier ('=' Expression)? ',' EnumBody /
-                      identifier ('=' Expression)? ','?
-
 FunctionDeclaration ← FunctionSignature ';'
 FunctionDefinition  ← FunctionSignature Block
 FunctionSignature   ← Type identifier '(' SignatureArguments? ')'
-- 
cgit v1.2.3