Diffstat (limited to '_ext')
-rw-r--r--   _ext/.gitignore       |   1
-rw-r--r--   _ext/crowbar_lexer.py | 298
2 files changed, 299 insertions, 0 deletions
diff --git a/_ext/.gitignore b/_ext/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/_ext/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/_ext/crowbar_lexer.py b/_ext/crowbar_lexer.py
new file mode 100644
index 0000000..6f0ec5e
--- /dev/null
+++ b/_ext/crowbar_lexer.py
@@ -0,0 +1,298 @@
+from pygments.lexer import Lexer, LexerMeta, _TokenType, inherit, combined, include, _inherit, default
+from pygments.token import *
+from pygments.util import Future
+import regex as re
+
+# we're overriding here so we can use a unicode-aware regex library
+class RegexLexerMeta(LexerMeta):
+    """
+    Metaclass for RegexLexer, creates the self._tokens attribute from
+    self.tokens on the first instantiation.
+    """
+
+    def _process_regex(cls, regex, rflags, state):
+        """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, Future):
+            regex = regex.get()
+        return re.compile(regex, rflags).match
+
+    def _process_token(cls, token):
+        """Preprocess the token component of a token definition."""
+        assert type(token) is _TokenType or callable(token), \
+            'token type must be simple type or callable, not %r' % (token,)
+        return token
+
+    def _process_new_state(cls, new_state, unprocessed, processed):
+        """Preprocess the state transition action of a token definition."""
+        if isinstance(new_state, str):
+            # an existing state
+            if new_state == '#pop':
+                return -1
+            elif new_state in unprocessed:
+                return (new_state,)
+            elif new_state == '#push':
+                return new_state
+            elif new_state[:5] == '#pop:':
+                return -int(new_state[5:])
+            else:
+                assert False, 'unknown new state %r' % new_state
+        elif isinstance(new_state, combined):
+            # combine a new state from existing ones
+            tmp_state = '_tmp_%d' % cls._tmpname
+            cls._tmpname += 1
+            itokens = []
+            for istate in new_state:
+                assert istate != new_state, 'circular state ref %r' % istate
+                itokens.extend(cls._process_state(unprocessed,
+                                                  processed, istate))
+            processed[tmp_state] = itokens
+            return (tmp_state,)
+        elif isinstance(new_state, tuple):
+            # push more than one state
+            for istate in new_state:
+                assert (istate in unprocessed or
+                        istate in ('#pop', '#push')), \
+                    'unknown new state ' + istate
+            return new_state
+        else:
+            assert False, 'unknown new state def %r' % new_state
+
+    def _process_state(cls, unprocessed, processed, state):
+        """Preprocess a single state definition."""
+        assert type(state) is str, "wrong state name %r" % state
+        assert state[0] != '#', "invalid state name %r" % state
+        if state in processed:
+            return processed[state]
+        tokens = processed[state] = []
+        rflags = cls.flags
+        for tdef in unprocessed[state]:
+            if isinstance(tdef, include):
+                # it's a state reference
+                assert tdef != state, "circular state reference %r" % state
+                tokens.extend(cls._process_state(unprocessed, processed,
+                                                 str(tdef)))
+                continue
+            if isinstance(tdef, _inherit):
+                # should be processed already, but may not in the case of:
+                # 1. the state has no counterpart in any parent
+                # 2. the state includes more than one 'inherit'
+                continue
+            if isinstance(tdef, default):
+                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+                tokens.append((re.compile('').match, None, new_state))
+                continue
+
+            assert type(tdef) is tuple, "wrong rule def %r" % tdef
+
+            try:
+                rex = cls._process_regex(tdef[0], rflags, state)
+            except Exception as err:
+                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
+                                 (tdef[0], state, cls, err)) from err
+
+            token = cls._process_token(tdef[1])
+
+            if len(tdef) == 2:
+                new_state = None
+            else:
+                new_state = cls._process_new_state(tdef[2],
+                                                   unprocessed, processed)
+
+            tokens.append((rex, token, new_state))
+        return tokens
+
+    def process_tokendef(cls, name, tokendefs=None):
+        """Preprocess a dictionary of token definitions."""
+        processed = cls._all_tokens[name] = {}
+        tokendefs = tokendefs or cls.tokens[name]
+        for state in list(tokendefs):
+            cls._process_state(tokendefs, processed, state)
+        return processed
+
+    def get_tokendefs(cls):
+        """
+        Merge tokens from superclasses in MRO order, returning a single tokendef
+        dictionary.
+        Any state that is not defined by a subclass will be inherited
+        automatically. States that *are* defined by subclasses will, by
+        default, override that state in the superclass. If a subclass wishes to
+        inherit definitions from a superclass, it can use the special value
+        "inherit", which will cause the superclass' state definition to be
+        included at that point in the state.
+        """
+        tokens = {}
+        inheritable = {}
+        for c in cls.__mro__:
+            toks = c.__dict__.get('tokens', {})
+
+            for state, items in toks.items():
+                curitems = tokens.get(state)
+                if curitems is None:
+                    # N.b. because this is assigned by reference, sufficiently
+                    # deep hierarchies are processed incrementally (e.g. for
+                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+                    # will not see any inherits in B).
+                    tokens[state] = items
+                    try:
+                        inherit_ndx = items.index(inherit)
+                    except ValueError:
+                        continue
+                    inheritable[state] = inherit_ndx
+                    continue
+
+                inherit_ndx = inheritable.pop(state, None)
+                if inherit_ndx is None:
+                    continue
+
+                # Replace the "inherit" value with the items
+                curitems[inherit_ndx:inherit_ndx+1] = items
+                try:
+                    # N.b. this is the index in items (that is, the superclass
+                    # copy), so offset required when storing below.
+                    new_inh_ndx = items.index(inherit)
+                except ValueError:
+                    pass
+                else:
+                    inheritable[state] = inherit_ndx + new_inh_ndx
+
+        return tokens
+
+    def __call__(cls, *args, **kwds):
+        """Instantiate cls after preprocessing its token definitions."""
+        if '_tokens' not in cls.__dict__:
+            cls._all_tokens = {}
+            cls._tmpname = 0
+            if hasattr(cls, 'token_variants') and cls.token_variants:
+                # don't process yet
+                pass
+            else:
+                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
+
+        return type.__call__(cls, *args, **kwds)
+
+class RegexLexer(Lexer, metaclass=RegexLexerMeta):
+    """
+    Base for simple stateful regular expression-based lexers.
+    Simplifies the lexing process so that you need only
+    provide a list of states and regular expressions.
+    """
+
+    #: Flags for compiling the regular expressions.
+    #: Defaults to MULTILINE.
+    flags = re.MULTILINE
+
+    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
+    #:
+    #: The initial state is 'root'.
+    #: ``new_state`` can be omitted to signify no state transition.
+    #: If it is a string, the state is pushed on the stack and changed.
+    #: If it is a tuple of strings, all states are pushed on the stack and
+    #: the current state will be the topmost.
+    #: It can also be ``combined('state1', 'state2', ...)``
+    #: to signify a new, anonymous state combined from the rules of two
+    #: or more existing ones.
+    #: Furthermore, it can be '#pop' to signify going back one step in
+    #: the state stack, or '#push' to push the current state on the stack
+    #: again.
+    #:
+    #: The tuple can also be replaced with ``include('state')``, in which
+    #: case the rules from the state named by the string are included in the
+    #: current one.
+    tokens = {}
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Split ``text`` into (tokentype, text) pairs.
+        ``stack`` is the initial stack (default: ``['root']``)
+        """
+        pos = 0
+        tokendefs = self._tokens
+        statestack = list(stack)
+        statetokens = tokendefs[statestack[-1]]
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                m = rexmatch(text, pos)
+                if m:
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield pos, action, m.group()
+                        else:
+                            yield from action(self, m)
+                    pos = m.end()
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    if len(statestack) > 1:
+                                        statestack.pop()
+                                elif state == '#push':
+                                    statestack.append(statestack[-1])
+                                else:
+                                    statestack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop, but keep at least one state on the stack
+                            # (random code leading to unexpected pops should
+                            # not allow exceptions)
+                            if abs(new_state) >= len(statestack):
+                                del statestack[1:]
+                            else:
+                                del statestack[new_state:]
+                        elif new_state == '#push':
+                            statestack.append(statestack[-1])
+                        else:
+                            assert False, "wrong state def: %r" % new_state
+                        statetokens = tokendefs[statestack[-1]]
+                    break
+            else:
+                # We are here only if all state tokens have been considered
+                # and there was not a match on any of them.
+                try:
+                    if text[pos] == '\n':
+                        # at EOL, reset state to "root"
+                        statestack = ['root']
+                        statetokens = tokendefs['root']
+                        yield pos, Text, '\n'
+                        pos += 1
+                        continue
+                    yield pos, Error, text[pos]
+                    pos += 1
+                except IndexError:
+                    break
+
+class CrowbarLexer(RegexLexer):
+    name = 'Crowbar'
+    aliases = ['crowbar']
+    filenames = ['*.cro', '*.hro']
+
+    tokens = {
+        'root': [
+            (r'bool|char|double|float|function|int|long|short|signed|unsigned|void', Keyword.Type),
+            (r'const|enum|extern|struct', Keyword.Declaration),
+            (r'include', Keyword.Namespace),
+            (r'break|case|continue|default|do|else|for|fragile|if|return|switch|while', Keyword),
+            (r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*", Name),
+            (r'0[bB][01_]+', Number.Bin),
+            (r'0o[0-7_]+', Number.Oct),
+            (r'0[xX][0-9a-fA-F_]+', Number.Hex),
+            (r'[0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+)', Number.Float),
+            (r'[0-9_]+', Number.Integer),
+            (r"""'([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})'""", String.Char),
+            (r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''', String.Double),
+            (r'sizeof', Operator.Word),
+            (r'//[^\n]*', Comment.Single),
+            (r'/\*.*?\*/', Comment.Multiline),
+            (r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[.+\-*/%!&|^~><=]", Operator),
+            (r"[\[\](){},;]", Punctuation),
+            (r".", Text),
+        ]
+    }
+
+def setup(app):
+    app.add_lexer('crowbar', CrowbarLexer)
+
+    return {
+        'version': '0.1',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }
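
For this local extension to load, the Sphinx build has to be able to import crowbar_lexer from the _ext directory. A minimal sketch of the conf.py wiring, assuming _ext/ sits next to conf.py (the path handling and extension list below are illustrative, not taken from this repository):

    # conf.py (hypothetical excerpt): make _ext/ importable, then load the
    # extension so Sphinx calls crowbar_lexer.setup() at build time.
    import os
    import sys

    sys.path.insert(0, os.path.abspath('_ext'))

    extensions = [
        'crowbar_lexer',
    ]

Once setup() has run, app.add_lexer('crowbar', CrowbarLexer) makes the 'crowbar' alias available to highlighting directives such as .. code-block:: crowbar. The lexer can also be tried outside Sphinx, e.g. pygments.highlight(source, CrowbarLexer(), TerminalFormatter()).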