diff options
| author | Melody Horn <melody@boringcactus.com> | 2020-10-28 22:17:02 -0600 | 
|---|---|---|
| committer | Melody Horn <melody@boringcactus.com> | 2020-10-28 22:17:02 -0600 | 
| commit | e558d912a7e3583d700aa42261ef1fa17e5d0d55 (patch) | |
| tree | 158a6572c24f09df957491e98e8c159fcc163ecc | |
| parent | cd6f0a93c69c622bfa8ca5f888da52e734903455 (diff) | |
| download | spec-e558d912a7e3583d700aa42261ef1fa17e5d0d55.tar.gz spec-e558d912a7e3583d700aa42261ef1fa17e5d0d55.zip | |
syntax highlight Crowbar code
| -rw-r--r-- | .build.yml | 2 | ||||
| -rw-r--r-- | _ext/.gitignore | 1 | ||||
| -rw-r--r-- | _ext/crowbar_lexer.py | 298 | ||||
| -rw-r--r-- | conf.py | 9 | ||||
| -rw-r--r-- | safety.md | 94 | ||||
| -rw-r--r-- | safety.rst | 109 | ||||
| -rw-r--r-- | syntax.md | 10 | 
7 files changed, 420 insertions, 103 deletions
| @@ -9,7 +9,7 @@ secrets:      - b5cb9b2b-1461-4486-95e1-886451674a89  tasks:      - prep: | -        sudo pip3 install --progress-bar off sphinx recommonmark rinohtype +        sudo pip3 install --progress-bar off sphinx recommonmark rinohtype regex      - build-and-test: |          cd crowbar-spec          make all check diff --git a/_ext/.gitignore b/_ext/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/_ext/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/_ext/crowbar_lexer.py b/_ext/crowbar_lexer.py new file mode 100644 index 0000000..6f0ec5e --- /dev/null +++ b/_ext/crowbar_lexer.py @@ -0,0 +1,298 @@ +from pygments.lexer import Lexer, LexerMeta, _TokenType, inherit, combined, include, _inherit, default +from pygments.token import * +from pygments.util import Future +import regex as re + +# we're overriding here so we can use a unicode-aware regex library +class RegexLexerMeta(LexerMeta): +    """ +    Metaclass for RegexLexer, creates the self._tokens attribute from +    self.tokens on the first instantiation. +    """ + +    def _process_regex(cls, regex, rflags, state): +        """Preprocess the regular expression component of a token definition.""" +        if isinstance(regex, Future): +            regex = regex.get() +        return re.compile(regex, rflags).match + +    def _process_token(cls, token): +        """Preprocess the token component of a token definition.""" +        assert type(token) is _TokenType or callable(token), \ +            'token type must be simple type or callable, not %r' % (token,) +        return token + +    def _process_new_state(cls, new_state, unprocessed, processed): +        """Preprocess the state transition action of a token definition.""" +        if isinstance(new_state, str): +            # an existing state +            if new_state == '#pop': +                return -1 +            elif new_state in unprocessed: +                return (new_state,) +            elif new_state == '#push': +                return new_state +            elif new_state[:5] == '#pop:': +                return -int(new_state[5:]) +            else: +                assert False, 'unknown new state %r' % new_state +        elif isinstance(new_state, combined): +            # combine a new state from existing ones +            tmp_state = '_tmp_%d' % cls._tmpname +            cls._tmpname += 1 +            itokens = [] +            for istate in new_state: +                assert istate != new_state, 'circular state ref %r' % istate +                itokens.extend(cls._process_state(unprocessed, +                                                  processed, istate)) +            processed[tmp_state] = itokens +            return (tmp_state,) +        elif isinstance(new_state, tuple): +            # push more than one state +            for istate in new_state: +                assert (istate in unprocessed or +                        istate in ('#pop', '#push')), \ +                    'unknown new state ' + istate +            return new_state +        else: +            assert False, 'unknown new state def %r' % new_state + +    def _process_state(cls, unprocessed, processed, state): +        """Preprocess a single state definition.""" +        assert type(state) is str, "wrong state name %r" % state +        assert state[0] != '#', "invalid state name %r" % state +        if state in processed: +            return processed[state] +        tokens = processed[state] = [] +        rflags = cls.flags +        for tdef in unprocessed[state]: +            if isinstance(tdef, include): +                # it's a state reference +                assert tdef != state, "circular state reference %r" % state +                tokens.extend(cls._process_state(unprocessed, processed, +                                                 str(tdef))) +                continue +            if isinstance(tdef, _inherit): +                # should be processed already, but may not in the case of: +                # 1. the state has no counterpart in any parent +                # 2. the state includes more than one 'inherit' +                continue +            if isinstance(tdef, default): +                new_state = cls._process_new_state(tdef.state, unprocessed, processed) +                tokens.append((re.compile('').match, None, new_state)) +                continue + +            assert type(tdef) is tuple, "wrong rule def %r" % tdef + +            try: +                rex = cls._process_regex(tdef[0], rflags, state) +            except Exception as err: +                raise ValueError("uncompilable regex %r in state %r of %r: %s" % +                                 (tdef[0], state, cls, err)) from err + +            token = cls._process_token(tdef[1]) + +            if len(tdef) == 2: +                new_state = None +            else: +                new_state = cls._process_new_state(tdef[2], +                                                   unprocessed, processed) + +            tokens.append((rex, token, new_state)) +        return tokens + +    def process_tokendef(cls, name, tokendefs=None): +        """Preprocess a dictionary of token definitions.""" +        processed = cls._all_tokens[name] = {} +        tokendefs = tokendefs or cls.tokens[name] +        for state in list(tokendefs): +            cls._process_state(tokendefs, processed, state) +        return processed + +    def get_tokendefs(cls): +        """ +        Merge tokens from superclasses in MRO order, returning a single tokendef +        dictionary. +        Any state that is not defined by a subclass will be inherited +        automatically.  States that *are* defined by subclasses will, by +        default, override that state in the superclass.  If a subclass wishes to +        inherit definitions from a superclass, it can use the special value +        "inherit", which will cause the superclass' state definition to be +        included at that point in the state. +        """ +        tokens = {} +        inheritable = {} +        for c in cls.__mro__: +            toks = c.__dict__.get('tokens', {}) + +            for state, items in toks.items(): +                curitems = tokens.get(state) +                if curitems is None: +                    # N.b. because this is assigned by reference, sufficiently +                    # deep hierarchies are processed incrementally (e.g. for +                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B) +                    # will not see any inherits in B). +                    tokens[state] = items +                    try: +                        inherit_ndx = items.index(inherit) +                    except ValueError: +                        continue +                    inheritable[state] = inherit_ndx +                    continue + +                inherit_ndx = inheritable.pop(state, None) +                if inherit_ndx is None: +                    continue + +                # Replace the "inherit" value with the items +                curitems[inherit_ndx:inherit_ndx+1] = items +                try: +                    # N.b. this is the index in items (that is, the superclass +                    # copy), so offset required when storing below. +                    new_inh_ndx = items.index(inherit) +                except ValueError: +                    pass +                else: +                    inheritable[state] = inherit_ndx + new_inh_ndx + +        return tokens + +    def __call__(cls, *args, **kwds): +        """Instantiate cls after preprocessing its token definitions.""" +        if '_tokens' not in cls.__dict__: +            cls._all_tokens = {} +            cls._tmpname = 0 +            if hasattr(cls, 'token_variants') and cls.token_variants: +                # don't process yet +                pass +            else: +                cls._tokens = cls.process_tokendef('', cls.get_tokendefs()) + +        return type.__call__(cls, *args, **kwds) + +class RegexLexer(Lexer, metaclass=RegexLexerMeta): +    """ +    Base for simple stateful regular expression-based lexers. +    Simplifies the lexing process so that you need only +    provide a list of states and regular expressions. +    """ + +    #: Flags for compiling the regular expressions. +    #: Defaults to MULTILINE. +    flags = re.MULTILINE + +    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}`` +    #: +    #: The initial state is 'root'. +    #: ``new_state`` can be omitted to signify no state transition. +    #: If it is a string, the state is pushed on the stack and changed. +    #: If it is a tuple of strings, all states are pushed on the stack and +    #: the current state will be the topmost. +    #: It can also be ``combined('state1', 'state2', ...)`` +    #: to signify a new, anonymous state combined from the rules of two +    #: or more existing ones. +    #: Furthermore, it can be '#pop' to signify going back one step in +    #: the state stack, or '#push' to push the current state on the stack +    #: again. +    #: +    #: The tuple can also be replaced with ``include('state')``, in which +    #: case the rules from the state named by the string are included in the +    #: current one. +    tokens = {} + +    def get_tokens_unprocessed(self, text, stack=('root',)): +        """ +        Split ``text`` into (tokentype, text) pairs. +        ``stack`` is the inital stack (default: ``['root']``) +        """ +        pos = 0 +        tokendefs = self._tokens +        statestack = list(stack) +        statetokens = tokendefs[statestack[-1]] +        while 1: +            for rexmatch, action, new_state in statetokens: +                m = rexmatch(text, pos) +                if m: +                    if action is not None: +                        if type(action) is _TokenType: +                            yield pos, action, m.group() +                        else: +                            yield from action(self, m) +                    pos = m.end() +                    if new_state is not None: +                        # state transition +                        if isinstance(new_state, tuple): +                            for state in new_state: +                                if state == '#pop': +                                    if len(statestack) > 1: +                                        statestack.pop() +                                elif state == '#push': +                                    statestack.append(statestack[-1]) +                                else: +                                    statestack.append(state) +                        elif isinstance(new_state, int): +                            # pop, but keep at least one state on the stack +                            # (random code leading to unexpected pops should +                            # not allow exceptions) +                            if abs(new_state) >= len(statestack): +                                del statestack[1:] +                            else: +                                del statestack[new_state:] +                        elif new_state == '#push': +                            statestack.append(statestack[-1]) +                        else: +                            assert False, "wrong state def: %r" % new_state +                        statetokens = tokendefs[statestack[-1]] +                    break +            else: +                # We are here only if all state tokens have been considered +                # and there was not a match on any of them. +                try: +                    if text[pos] == '\n': +                        # at EOL, reset state to "root" +                        statestack = ['root'] +                        statetokens = tokendefs['root'] +                        yield pos, Text, '\n' +                        pos += 1 +                        continue +                    yield pos, Error, text[pos] +                    pos += 1 +                except IndexError: +                    break + +class CrowbarLexer(RegexLexer): +    name = 'Crowbar' +    aliases = ['crowbar'] +    filenames = ['*.cro', '*.hro'] + +    tokens = { +        'root': [ +            (r'bool|char|double|float|function|int|long|short|signed|unsigned|void', Keyword.Type), +            (r'const|enum|extern|struct', Keyword.Declaration), +            (r'include', Keyword.Namespace), +            (r'break|case|continue|default|do|else|for|fragile|if|return|switch|while', Keyword), +            (r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*", Name), +            (r'0[bB][01_]+', Number.Bin), +            (r'0o[0-7_]+', Number.Oct), +            (r'0[xX][0-9a-fA-F_]+', Number.Hex), +            (r'[0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+)', Number.Float), +            (r'[0-9_]+', Number.Integer), +            (r"""'([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})'""", String.Char), +            (r'''([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''', String.Double), +            (r'sizeof', Operator.Word), +            (r'//[^\n]*', Comment.Single), +            (r'/*.*?\*/', Comment.Multiline), +            (r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[.+\-*/%!&|^~><=]", Operator), +            (r"[\[\](){},;]", Punctuation), +            (r".", Text), +        ] +    } + +def setup(app): +    app.add_lexer('crowbar', CrowbarLexer) + +    return { +        'version': '0.1', +        'parallel_read_safe': True, +        'parallel_write_safe': True, +    } @@ -10,9 +10,9 @@  # add these directories to sys.path here. If the directory is relative to the  # documentation root, use os.path.abspath to make it absolute, like shown here.  # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os +import sys +sys.path.insert(0, os.path.abspath('_ext'))  # -- Project information ----------------------------------------------------- @@ -29,6 +29,7 @@ author = 'boringcactus (Melody Horn)'  # ones.  extensions = [      'recommonmark', +    'crowbar_lexer',  ]  # Add any paths that contain templates here, relative to this directory. @@ -39,6 +40,8 @@ templates_path = ['_templates']  # This pattern also affects html_static_path and html_extra_path.  exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +highlight_language = 'crowbar' +  # -- Options for HTML output ------------------------------------------------- diff --git a/safety.md b/safety.md deleted file mode 100644 index c50a06a..0000000 --- a/safety.md +++ /dev/null @@ -1,94 +0,0 @@ -# Memory Safety - -Each item in Wikipedia's [list of types of memory errors](https://en.wikipedia.org/wiki/Memory_safety#Types_of_memory_errors) and what Crowbar does to prevent them. - -In general, Crowbar does its best to ensure that code will not exhibit any of the following memory errors. -However, sometimes the compiler knows less than the programmer, and so code that looks dangerous is actually fine. -Crowbar allows programmers to suspend the memory safety checks with the `fragile` keyword. - -## Access errors - -### Buffer overflow - -Crowbar addresses buffer overflow with bounds checking. -In C, the type `char *` can point to a single character, a null-terminated string of unknown length, a buffer of fixed size, or nothing at all. -In Crowbar, the type `char *` can only point to either a single character or nothing at all. -If a buffer is declared as `char[50] name;` then it has type `char[50]`, and can be implicitly converted to `(char[50])*`, a pointer-to-50-chars. -If memory is dynamically allocated, it works as follows: - -```crowbar -void process(size_t bufferSize, char[bufferSize] buffer) { -    // do some work with buffer, given that we know its size -} - -int main(int argc, (char[1024?])[argc] argv) { -    size_t bufferSize = getBufferSize(); -    (char[bufferSize])* buffer = malloc(bufferSize); -    process(bufferSize, buffer); -    free(buffer); -} -``` - -Note that `malloc` as part of the Crowbar standard library has signature `(void[size])* malloc(size_t size);` and so no cast is needed above. -In C, `buffer` in `main` would have type pointer-to-VLA-of-char, but `buffer` in `process` would have type VLA-of-char, and this conversion would emit a compiler warning. -However, in Crowbar, a `(T[N])*` is always implicitly convertible to `T[N]`, so no warning exists. -(This is translated into C by dereferencing `buffer` in `main`.) - -Note as well that the type of `argv` is complicated. -This is because the elements of `argv` have unconstrained size. -TODO figure out if that's the right way to handle that - -### Buffer over-read - -bounds checking again - -### Race condition - -uhhhhh idk - -### Page fault - -bounds checking, dubious-pointer checking - -### Use after free - -`free(x);` not followed by `x = NULL;` is a compiler error. -`owned` and `borrowed` keywords  - -## Uninitialized variables - -forbid them in syntax - -### Null pointer dereference - -dubious-pointer checking - -### Wild pointers - -dubious-pointer checking - -## Memory leak - -### Stack exhaustion - -uhhhhhh idk - -### Heap exhaustion - -that counts as error handling, just the `malloc`-shaped kind - -### Double free - -this is just use-after-free but the use is calling free on it - -### Invalid free - -don't do that - -### Mismatched free - -how does that even happen - -### Unwanted aliasing - -uhhh don't do that? diff --git a/safety.rst b/safety.rst new file mode 100644 index 0000000..61cca97 --- /dev/null +++ b/safety.rst @@ -0,0 +1,109 @@ +************* +Memory Safety +************* + +In general, Crowbar does its best to ensure that code will not exhibit any of the following memory errors (pulled from `Wikipedia's list of memory errors`_. +However, sometimes the compiler knows less than the programmer, and so code that looks dangerous is actually fine. +Crowbar allows programmers to suspend the memory safety checks with the `fragile` keyword. + +.. _Wikipedia's list of memory errors: https://en.wikipedia.org/wiki/Memory_safety#Types_of_memory_errors + +Access errors +============= + +Buffer overflow +--------------- + +Crowbar addresses buffer overflow with bounds checking. +In C, the type ``char *`` can point to a single character, a null-terminated string of unknown length, a buffer of fixed size, or nothing at all. +In Crowbar, the type ``char *`` can only point to either a single character or nothing at all. +If a buffer is declared as ``char[50] name;`` then it has type ``char[50]``, and can be implicitly converted to ``(char[50])*``, a pointer-to-50-chars. +If memory is dynamically allocated, it works as follows:: + +    void process(size_t bufferSize, char[bufferSize] buffer) { +        // do some work with buffer, given that we know its size +    } + +    int main(int argc, (char[1024?])[argc] argv) { +        size_t bufferSize = getBufferSize(); +        (char[bufferSize])* buffer = malloc(bufferSize); +        process(bufferSize, buffer); +        free(buffer); +    } + +Note that ``malloc`` as part of the Crowbar standard library has signature ``(void[size])* malloc(size_t size);`` and so no cast is needed above. +In C, ``buffer`` in ``main`` would have type pointer-to-VLA-of-char, but ``buffer`` in ``process`` would have type VLA-of-char, and this conversion would emit a compiler warning. +However, in Crowbar, a ``(T[N])*`` is always implicitly convertible to ``T[N]``, so no warning exists. + +Note as well that the type of ``argv`` is complicated. +This is because the elements of ``argv`` have unconstrained size. +TODO figure out if that's the right way to handle that + +Buffer over-read +---------------- + +bounds checking again + +Race condition +-------------- + +uhhhhh idk + +Page fault +---------- + +bounds checking, dubious-pointer checking + +Use after free +-------------- + +``free(&x);`` will set ``x = NULL;`` +``owned`` and ``borrowed`` keywords  + +Uninitialized variables +======================= + +forbid them in syntax + +Null pointer dereference +------------------------ + +dubious-pointer checking + +Wild pointers +------------- + +dubious-pointer checking + +Memory leak +=========== + +Stack exhaustion +---------------- + +uhhhhhh idk + +Heap exhaustion +--------------- + +that counts as error handling, just the `malloc`-shaped kind + +Double free +----------- + +this is just use-after-free but the use is calling free on it + +Invalid free +------------ + +don't do that + +Mismatched free +--------------- + +how does that even happen + +Unwanted aliasing +----------------- + +uhhh don't do that? @@ -176,7 +176,7 @@ The syntax of Crowbar is given as a [parsing expression grammar](https://en.wiki  ### Entry points -``` +```PEG  HeaderFile                ← HeaderFileElement+  HeaderFileElement         ← IncludeStatement /                              TypeDeclaration / @@ -189,7 +189,7 @@ ImplementationFileElement ← HeaderFileElement /  ### Top-level elements -``` +```PEG  IncludeStatement    ← 'include' string-literal ';'  TypeDeclaration     ← StructDeclaration / @@ -208,7 +208,7 @@ SignatureArguments  ← Type identifier ',' SignatureArguments /  ### Statements -``` +```PEG  Block                   ← '{' Statement* '}'  Statement               ← VariableDefinition / @@ -260,7 +260,7 @@ ExpressionStatement     ← Expression ';'  ### Types -``` +```PEG  Type        ← 'const' BasicType /                BasicType '*' /                BasicType '[' Expression ']' / @@ -285,7 +285,7 @@ IntegerType ← 'char' /  ### Expressions -``` +```PEG  AssignmentTargetExpression ← identifier ATEElementSuffix*  ATEElementSuffix           ← '[' Expression ']' /                               '.' identifier / |