From e558d912a7e3583d700aa42261ef1fa17e5d0d55 Mon Sep 17 00:00:00 2001
From: Melody Horn <melody@boringcactus.com>
Date: Wed, 28 Oct 2020 22:17:02 -0600
Subject: syntax highlight Crowbar code

---
 .build.yml            |   2 +-
 _ext/.gitignore       |   1 +
 _ext/crowbar_lexer.py | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++
 conf.py               |   9 +-
 safety.md             |  94 ----------------
 safety.rst            | 109 ++++++++++++++++++
 syntax.md             |  10 +-
 7 files changed, 420 insertions(+), 103 deletions(-)
 create mode 100644 _ext/.gitignore
 create mode 100644 _ext/crowbar_lexer.py
 delete mode 100644 safety.md
 create mode 100644 safety.rst

diff --git a/.build.yml b/.build.yml
index 5edf955..49d5db7 100644
--- a/.build.yml
+++ b/.build.yml
@@ -9,7 +9,7 @@ secrets:
     - b5cb9b2b-1461-4486-95e1-886451674a89
 tasks:
     - prep: |
-        sudo pip3 install --progress-bar off sphinx recommonmark rinohtype
+        sudo pip3 install --progress-bar off sphinx recommonmark rinohtype regex
     - build-and-test: |
         cd crowbar-spec
         make all check
diff --git a/_ext/.gitignore b/_ext/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/_ext/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/_ext/crowbar_lexer.py b/_ext/crowbar_lexer.py
new file mode 100644
index 0000000..6f0ec5e
--- /dev/null
+++ b/_ext/crowbar_lexer.py
@@ -0,0 +1,298 @@
+from pygments.lexer import Lexer, LexerMeta, _TokenType, inherit, combined, include, _inherit, default
+from pygments.token import *
+from pygments.util import Future
+import regex as re
+
+# we're overriding here so we can use a unicode-aware regex library
+class RegexLexerMeta(LexerMeta):
+    """
+    Metaclass for RegexLexer, creates the self._tokens attribute from
+    self.tokens on the first instantiation.
+    """
+
+    def _process_regex(cls, regex, rflags, state):
+        """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, Future):
+            regex = regex.get()
+        return re.compile(regex, rflags).match
+
+    def _process_token(cls, token):
+        """Preprocess the token component of a token definition."""
+        assert type(token) is _TokenType or callable(token), \
+            'token type must be simple type or callable, not %r' % (token,)
+        return token
+
+    def _process_new_state(cls, new_state, unprocessed, processed):
+        """Preprocess the state transition action of a token definition."""
+        if isinstance(new_state, str):
+            # an existing state
+            if new_state == '#pop':
+                return -1
+            elif new_state in unprocessed:
+                return (new_state,)
+            elif new_state == '#push':
+                return new_state
+            elif new_state[:5] == '#pop:':
+                return -int(new_state[5:])
+            else:
+                assert False, 'unknown new state %r' % new_state
+        elif isinstance(new_state, combined):
+            # combine a new state from existing ones
+            tmp_state = '_tmp_%d' % cls._tmpname
+            cls._tmpname += 1
+            itokens = []
+            for istate in new_state:
+                assert istate != new_state, 'circular state ref %r' % istate
+                itokens.extend(cls._process_state(unprocessed,
+                                                  processed, istate))
+            processed[tmp_state] = itokens
+            return (tmp_state,)
+        elif isinstance(new_state, tuple):
+            # push more than one state
+            for istate in new_state:
+                assert (istate in unprocessed or
+                        istate in ('#pop', '#push')), \
+                    'unknown new state ' + istate
+            return new_state
+        else:
+            assert False, 'unknown new state def %r' % new_state
+
+    def _process_state(cls, unprocessed, processed, state):
+        """Preprocess a single state definition."""
+        assert type(state) is str, "wrong state name %r" % state
+        assert state[0] != '#', "invalid state name %r" % state
+        if state in processed:
+            return processed[state]
+        tokens = processed[state] = []
+        rflags = cls.flags
+        for tdef in unprocessed[state]:
+            if isinstance(tdef, include):
+                # it's a state reference
+                assert tdef != state, "circular state reference %r" % state
+                tokens.extend(cls._process_state(unprocessed, processed,
+                                                 str(tdef)))
+                continue
+            if isinstance(tdef, _inherit):
+                # should be processed already, but may not in the case of:
+                # 1. the state has no counterpart in any parent
+                # 2. the state includes more than one 'inherit'
+                continue
+            if isinstance(tdef, default):
+                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+                tokens.append((re.compile('').match, None, new_state))
+                continue
+
+            assert type(tdef) is tuple, "wrong rule def %r" % tdef
+
+            try:
+                rex = cls._process_regex(tdef[0], rflags, state)
+            except Exception as err:
+                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
+                                 (tdef[0], state, cls, err)) from err
+
+            token = cls._process_token(tdef[1])
+
+            if len(tdef) == 2:
+                new_state = None
+            else:
+                new_state = cls._process_new_state(tdef[2],
+                                                   unprocessed, processed)
+
+            tokens.append((rex, token, new_state))
+        return tokens
+
+    def process_tokendef(cls, name, tokendefs=None):
+        """Preprocess a dictionary of token definitions."""
+        processed = cls._all_tokens[name] = {}
+        tokendefs = tokendefs or cls.tokens[name]
+        for state in list(tokendefs):
+            cls._process_state(tokendefs, processed, state)
+        return processed
+
+    def get_tokendefs(cls):
+        """
+        Merge tokens from superclasses in MRO order, returning a single tokendef
+        dictionary.
+        Any state that is not defined by a subclass will be inherited
+        automatically.  States that *are* defined by subclasses will, by
+        default, override that state in the superclass.  If a subclass wishes to
+        inherit definitions from a superclass, it can use the special value
+        "inherit", which will cause the superclass' state definition to be
+        included at that point in the state.
+        """
+        tokens = {}
+        inheritable = {}
+        for c in cls.__mro__:
+            toks = c.__dict__.get('tokens', {})
+
+            for state, items in toks.items():
+                curitems = tokens.get(state)
+                if curitems is None:
+                    # N.b. because this is assigned by reference, sufficiently
+                    # deep hierarchies are processed incrementally (e.g. for
+                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+                    # will not see any inherits in B).
+                    tokens[state] = items
+                    try:
+                        inherit_ndx = items.index(inherit)
+                    except ValueError:
+                        continue
+                    inheritable[state] = inherit_ndx
+                    continue
+
+                inherit_ndx = inheritable.pop(state, None)
+                if inherit_ndx is None:
+                    continue
+
+                # Replace the "inherit" value with the items
+                curitems[inherit_ndx:inherit_ndx+1] = items
+                try:
+                    # N.b. this is the index in items (that is, the superclass
+                    # copy), so offset required when storing below.
+                    new_inh_ndx = items.index(inherit)
+                except ValueError:
+                    pass
+                else:
+                    inheritable[state] = inherit_ndx + new_inh_ndx
+
+        return tokens
+
+    def __call__(cls, *args, **kwds):
+        """Instantiate cls after preprocessing its token definitions."""
+        if '_tokens' not in cls.__dict__:
+            cls._all_tokens = {}
+            cls._tmpname = 0
+            if hasattr(cls, 'token_variants') and cls.token_variants:
+                # don't process yet
+                pass
+            else:
+                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
+
+        return type.__call__(cls, *args, **kwds)
+
+class RegexLexer(Lexer, metaclass=RegexLexerMeta):
+    """
+    Base for simple stateful regular expression-based lexers.
+    Simplifies the lexing process so that you need only
+    provide a list of states and regular expressions.
+    """
+
+    #: Flags for compiling the regular expressions.
+    #: Defaults to MULTILINE.
+    flags = re.MULTILINE
+
+    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
+    #:
+    #: The initial state is 'root'.
+    #: ``new_state`` can be omitted to signify no state transition.
+    #: If it is a string, the state is pushed on the stack and changed.
+    #: If it is a tuple of strings, all states are pushed on the stack and
+    #: the current state will be the topmost.
+    #: It can also be ``combined('state1', 'state2', ...)``
+    #: to signify a new, anonymous state combined from the rules of two
+    #: or more existing ones.
+    #: Furthermore, it can be '#pop' to signify going back one step in
+    #: the state stack, or '#push' to push the current state on the stack
+    #: again.
+    #:
+    #: The tuple can also be replaced with ``include('state')``, in which
+    #: case the rules from the state named by the string are included in the
+    #: current one.
+    tokens = {}
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Split ``text`` into (tokentype, text) pairs.
+        ``stack`` is the inital stack (default: ``['root']``)
+        """
+        pos = 0
+        tokendefs = self._tokens
+        statestack = list(stack)
+        statetokens = tokendefs[statestack[-1]]
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                m = rexmatch(text, pos)
+                if m:
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield pos, action, m.group()
+                        else:
+                            yield from action(self, m)
+                    pos = m.end()
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    if len(statestack) > 1:
+                                        statestack.pop()
+                                elif state == '#push':
+                                    statestack.append(statestack[-1])
+                                else:
+                                    statestack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop, but keep at least one state on the stack
+                            # (random code leading to unexpected pops should
+                            # not allow exceptions)
+                            if abs(new_state) >= len(statestack):
+                                del statestack[1:]
+                            else:
+                                del statestack[new_state:]
+                        elif new_state == '#push':
+                            statestack.append(statestack[-1])
+                        else:
+                            assert False, "wrong state def: %r" % new_state
+                        statetokens = tokendefs[statestack[-1]]
+                    break
+            else:
+                # We are here only if all state tokens have been considered
+                # and there was not a match on any of them.
+                try:
+                    if text[pos] == '\n':
+                        # at EOL, reset state to "root"
+                        statestack = ['root']
+                        statetokens = tokendefs['root']
+                        yield pos, Text, '\n'
+                        pos += 1
+                        continue
+                    yield pos, Error, text[pos]
+                    pos += 1
+                except IndexError:
+                    break
+
+class CrowbarLexer(RegexLexer):
+    name = 'Crowbar'
+    aliases = ['crowbar']
+    filenames = ['*.cro', '*.hro']
+
+    tokens = {
+        'root': [
+            (r'bool|char|double|float|function|int|long|short|signed|unsigned|void', Keyword.Type),
+            (r'const|enum|extern|struct', Keyword.Declaration),
+            (r'include', Keyword.Namespace),
+            (r'break|case|continue|default|do|else|for|fragile|if|return|switch|while', Keyword),
+            (r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*", Name),
+            (r'0[bB][01_]+', Number.Bin),
+            (r'0o[0-7_]+', Number.Oct),
+            (r'0[xX][0-9a-fA-F_]+', Number.Hex),
+            (r'[0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+)', Number.Float),
+            (r'[0-9_]+', Number.Integer),
+            (r"""'([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})'""", String.Char),
+            (r'''([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''', String.Double),
+            (r'sizeof', Operator.Word),
+            (r'//[^\n]*', Comment.Single),
+            (r'/*.*?\*/', Comment.Multiline),
+            (r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[.+\-*/%!&|^~><=]", Operator),
+            (r"[\[\](){},;]", Punctuation),
+            (r".", Text),
+        ]
+    }
+
+def setup(app):
+    app.add_lexer('crowbar', CrowbarLexer)
+
+    return {
+        'version': '0.1',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }
diff --git a/conf.py b/conf.py
index aef57e6..58f2f66 100644
--- a/conf.py
+++ b/conf.py
@@ -10,9 +10,9 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+import os
+import sys
+sys.path.insert(0, os.path.abspath('_ext'))
 
 
 # -- Project information -----------------------------------------------------
@@ -29,6 +29,7 @@ author = 'boringcactus (Melody Horn)'
 # ones.
 extensions = [
     'recommonmark',
+    'crowbar_lexer',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -39,6 +40,8 @@ templates_path = ['_templates']
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 
+highlight_language = 'crowbar'
+
 
 # -- Options for HTML output -------------------------------------------------
 
diff --git a/safety.md b/safety.md
deleted file mode 100644
index c50a06a..0000000
--- a/safety.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Memory Safety
-
-Each item in Wikipedia's [list of types of memory errors](https://en.wikipedia.org/wiki/Memory_safety#Types_of_memory_errors) and what Crowbar does to prevent them.
-
-In general, Crowbar does its best to ensure that code will not exhibit any of the following memory errors.
-However, sometimes the compiler knows less than the programmer, and so code that looks dangerous is actually fine.
-Crowbar allows programmers to suspend the memory safety checks with the `fragile` keyword.
-
-## Access errors
-
-### Buffer overflow
-
-Crowbar addresses buffer overflow with bounds checking.
-In C, the type `char *` can point to a single character, a null-terminated string of unknown length, a buffer of fixed size, or nothing at all.
-In Crowbar, the type `char *` can only point to either a single character or nothing at all.
-If a buffer is declared as `char[50] name;` then it has type `char[50]`, and can be implicitly converted to `(char[50])*`, a pointer-to-50-chars.
-If memory is dynamically allocated, it works as follows:
-
-```crowbar
-void process(size_t bufferSize, char[bufferSize] buffer) {
-    // do some work with buffer, given that we know its size
-}
-
-int main(int argc, (char[1024?])[argc] argv) {
-    size_t bufferSize = getBufferSize();
-    (char[bufferSize])* buffer = malloc(bufferSize);
-    process(bufferSize, buffer);
-    free(buffer);
-}
-```
-
-Note that `malloc` as part of the Crowbar standard library has signature `(void[size])* malloc(size_t size);` and so no cast is needed above.
-In C, `buffer` in `main` would have type pointer-to-VLA-of-char, but `buffer` in `process` would have type VLA-of-char, and this conversion would emit a compiler warning.
-However, in Crowbar, a `(T[N])*` is always implicitly convertible to `T[N]`, so no warning exists.
-(This is translated into C by dereferencing `buffer` in `main`.)
-
-Note as well that the type of `argv` is complicated.
-This is because the elements of `argv` have unconstrained size.
-TODO figure out if that's the right way to handle that
-
-### Buffer over-read
-
-bounds checking again
-
-### Race condition
-
-uhhhhh idk
-
-### Page fault
-
-bounds checking, dubious-pointer checking
-
-### Use after free
-
-`free(x);` not followed by `x = NULL;` is a compiler error.
-`owned` and `borrowed` keywords 
-
-## Uninitialized variables
-
-forbid them in syntax
-
-### Null pointer dereference
-
-dubious-pointer checking
-
-### Wild pointers
-
-dubious-pointer checking
-
-## Memory leak
-
-### Stack exhaustion
-
-uhhhhhh idk
-
-### Heap exhaustion
-
-that counts as error handling, just the `malloc`-shaped kind
-
-### Double free
-
-this is just use-after-free but the use is calling free on it
-
-### Invalid free
-
-don't do that
-
-### Mismatched free
-
-how does that even happen
-
-### Unwanted aliasing
-
-uhhh don't do that?
diff --git a/safety.rst b/safety.rst
new file mode 100644
index 0000000..61cca97
--- /dev/null
+++ b/safety.rst
@@ -0,0 +1,109 @@
+*************
+Memory Safety
+*************
+
+In general, Crowbar does its best to ensure that code will not exhibit any of the following memory errors (pulled from `Wikipedia's list of memory errors`_.
+However, sometimes the compiler knows less than the programmer, and so code that looks dangerous is actually fine.
+Crowbar allows programmers to suspend the memory safety checks with the `fragile` keyword.
+
+.. _Wikipedia's list of memory errors: https://en.wikipedia.org/wiki/Memory_safety#Types_of_memory_errors
+
+Access errors
+=============
+
+Buffer overflow
+---------------
+
+Crowbar addresses buffer overflow with bounds checking.
+In C, the type ``char *`` can point to a single character, a null-terminated string of unknown length, a buffer of fixed size, or nothing at all.
+In Crowbar, the type ``char *`` can only point to either a single character or nothing at all.
+If a buffer is declared as ``char[50] name;`` then it has type ``char[50]``, and can be implicitly converted to ``(char[50])*``, a pointer-to-50-chars.
+If memory is dynamically allocated, it works as follows::
+
+    void process(size_t bufferSize, char[bufferSize] buffer) {
+        // do some work with buffer, given that we know its size
+    }
+
+    int main(int argc, (char[1024?])[argc] argv) {
+        size_t bufferSize = getBufferSize();
+        (char[bufferSize])* buffer = malloc(bufferSize);
+        process(bufferSize, buffer);
+        free(buffer);
+    }
+
+Note that ``malloc`` as part of the Crowbar standard library has signature ``(void[size])* malloc(size_t size);`` and so no cast is needed above.
+In C, ``buffer`` in ``main`` would have type pointer-to-VLA-of-char, but ``buffer`` in ``process`` would have type VLA-of-char, and this conversion would emit a compiler warning.
+However, in Crowbar, a ``(T[N])*`` is always implicitly convertible to ``T[N]``, so no warning exists.
+
+Note as well that the type of ``argv`` is complicated.
+This is because the elements of ``argv`` have unconstrained size.
+TODO figure out if that's the right way to handle that
+
+Buffer over-read
+----------------
+
+bounds checking again
+
+Race condition
+--------------
+
+uhhhhh idk
+
+Page fault
+----------
+
+bounds checking, dubious-pointer checking
+
+Use after free
+--------------
+
+``free(&x);`` will set ``x = NULL;``
+``owned`` and ``borrowed`` keywords 
+
+Uninitialized variables
+=======================
+
+forbid them in syntax
+
+Null pointer dereference
+------------------------
+
+dubious-pointer checking
+
+Wild pointers
+-------------
+
+dubious-pointer checking
+
+Memory leak
+===========
+
+Stack exhaustion
+----------------
+
+uhhhhhh idk
+
+Heap exhaustion
+---------------
+
+that counts as error handling, just the `malloc`-shaped kind
+
+Double free
+-----------
+
+this is just use-after-free but the use is calling free on it
+
+Invalid free
+------------
+
+don't do that
+
+Mismatched free
+---------------
+
+how does that even happen
+
+Unwanted aliasing
+-----------------
+
+uhhh don't do that?
diff --git a/syntax.md b/syntax.md
index 9320b1c..be62b47 100644
--- a/syntax.md
+++ b/syntax.md
@@ -176,7 +176,7 @@ The syntax of Crowbar is given as a [parsing expression grammar](https://en.wiki
 
 ### Entry points
 
-```
+```PEG
 HeaderFile                ← HeaderFileElement+
 HeaderFileElement         ← IncludeStatement /
                             TypeDeclaration /
@@ -189,7 +189,7 @@ ImplementationFileElement ← HeaderFileElement /
 
 ### Top-level elements
 
-```
+```PEG
 IncludeStatement    ← 'include' string-literal ';'
 
 TypeDeclaration     ← StructDeclaration /
@@ -208,7 +208,7 @@ SignatureArguments  ← Type identifier ',' SignatureArguments /
 
 ### Statements
 
-```
+```PEG
 Block                   ← '{' Statement* '}'
     
 Statement               ← VariableDefinition /
@@ -260,7 +260,7 @@ ExpressionStatement     ← Expression ';'
 
 ### Types
 
-```
+```PEG
 Type        ← 'const' BasicType /
               BasicType '*' /
               BasicType '[' Expression ']' /
@@ -285,7 +285,7 @@ IntegerType ← 'char' /
 
 ### Expressions
 
-```
+```PEG
 AssignmentTargetExpression ← identifier ATEElementSuffix*
 ATEElementSuffix           ← '[' Expression ']' /
                              '.' identifier /
-- 
cgit v1.2.3