aboutsummaryrefslogtreecommitdiff
path: root/_ext/crowbar_lexer.py
diff options
context:
space:
mode:
Diffstat (limited to '_ext/crowbar_lexer.py')
-rw-r--r--_ext/crowbar_lexer.py298
1 files changed, 298 insertions, 0 deletions
diff --git a/_ext/crowbar_lexer.py b/_ext/crowbar_lexer.py
new file mode 100644
index 0000000..6f0ec5e
--- /dev/null
+++ b/_ext/crowbar_lexer.py
@@ -0,0 +1,298 @@
+from pygments.lexer import Lexer, LexerMeta, _TokenType, inherit, combined, include, _inherit, default
+from pygments.token import *
+from pygments.util import Future
+import regex as re
+
+# we're overriding here so we can use a unicode-aware regex library
+class RegexLexerMeta(LexerMeta):
+ """
+ Metaclass for RegexLexer, creates the self._tokens attribute from
+ self.tokens on the first instantiation.
+ """
+
+ def _process_regex(cls, regex, rflags, state):
+ """Preprocess the regular expression component of a token definition."""
+ if isinstance(regex, Future):
+ regex = regex.get()
+ return re.compile(regex, rflags).match
+
+ def _process_token(cls, token):
+ """Preprocess the token component of a token definition."""
+ assert type(token) is _TokenType or callable(token), \
+ 'token type must be simple type or callable, not %r' % (token,)
+ return token
+
+ def _process_new_state(cls, new_state, unprocessed, processed):
+ """Preprocess the state transition action of a token definition."""
+ if isinstance(new_state, str):
+ # an existing state
+ if new_state == '#pop':
+ return -1
+ elif new_state in unprocessed:
+ return (new_state,)
+ elif new_state == '#push':
+ return new_state
+ elif new_state[:5] == '#pop:':
+ return -int(new_state[5:])
+ else:
+ assert False, 'unknown new state %r' % new_state
+ elif isinstance(new_state, combined):
+ # combine a new state from existing ones
+ tmp_state = '_tmp_%d' % cls._tmpname
+ cls._tmpname += 1
+ itokens = []
+ for istate in new_state:
+ assert istate != new_state, 'circular state ref %r' % istate
+ itokens.extend(cls._process_state(unprocessed,
+ processed, istate))
+ processed[tmp_state] = itokens
+ return (tmp_state,)
+ elif isinstance(new_state, tuple):
+ # push more than one state
+ for istate in new_state:
+ assert (istate in unprocessed or
+ istate in ('#pop', '#push')), \
+ 'unknown new state ' + istate
+ return new_state
+ else:
+ assert False, 'unknown new state def %r' % new_state
+
+ def _process_state(cls, unprocessed, processed, state):
+ """Preprocess a single state definition."""
+ assert type(state) is str, "wrong state name %r" % state
+ assert state[0] != '#', "invalid state name %r" % state
+ if state in processed:
+ return processed[state]
+ tokens = processed[state] = []
+ rflags = cls.flags
+ for tdef in unprocessed[state]:
+ if isinstance(tdef, include):
+ # it's a state reference
+ assert tdef != state, "circular state reference %r" % state
+ tokens.extend(cls._process_state(unprocessed, processed,
+ str(tdef)))
+ continue
+ if isinstance(tdef, _inherit):
+ # should be processed already, but may not in the case of:
+ # 1. the state has no counterpart in any parent
+ # 2. the state includes more than one 'inherit'
+ continue
+ if isinstance(tdef, default):
+ new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+ tokens.append((re.compile('').match, None, new_state))
+ continue
+
+ assert type(tdef) is tuple, "wrong rule def %r" % tdef
+
+ try:
+ rex = cls._process_regex(tdef[0], rflags, state)
+ except Exception as err:
+ raise ValueError("uncompilable regex %r in state %r of %r: %s" %
+ (tdef[0], state, cls, err)) from err
+
+ token = cls._process_token(tdef[1])
+
+ if len(tdef) == 2:
+ new_state = None
+ else:
+ new_state = cls._process_new_state(tdef[2],
+ unprocessed, processed)
+
+ tokens.append((rex, token, new_state))
+ return tokens
+
+ def process_tokendef(cls, name, tokendefs=None):
+ """Preprocess a dictionary of token definitions."""
+ processed = cls._all_tokens[name] = {}
+ tokendefs = tokendefs or cls.tokens[name]
+ for state in list(tokendefs):
+ cls._process_state(tokendefs, processed, state)
+ return processed
+
+ def get_tokendefs(cls):
+ """
+ Merge tokens from superclasses in MRO order, returning a single tokendef
+ dictionary.
+ Any state that is not defined by a subclass will be inherited
+ automatically. States that *are* defined by subclasses will, by
+ default, override that state in the superclass. If a subclass wishes to
+ inherit definitions from a superclass, it can use the special value
+ "inherit", which will cause the superclass' state definition to be
+ included at that point in the state.
+ """
+ tokens = {}
+ inheritable = {}
+ for c in cls.__mro__:
+ toks = c.__dict__.get('tokens', {})
+
+ for state, items in toks.items():
+ curitems = tokens.get(state)
+ if curitems is None:
+ # N.b. because this is assigned by reference, sufficiently
+ # deep hierarchies are processed incrementally (e.g. for
+ # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+ # will not see any inherits in B).
+ tokens[state] = items
+ try:
+ inherit_ndx = items.index(inherit)
+ except ValueError:
+ continue
+ inheritable[state] = inherit_ndx
+ continue
+
+ inherit_ndx = inheritable.pop(state, None)
+ if inherit_ndx is None:
+ continue
+
+ # Replace the "inherit" value with the items
+ curitems[inherit_ndx:inherit_ndx+1] = items
+ try:
+ # N.b. this is the index in items (that is, the superclass
+ # copy), so offset required when storing below.
+ new_inh_ndx = items.index(inherit)
+ except ValueError:
+ pass
+ else:
+ inheritable[state] = inherit_ndx + new_inh_ndx
+
+ return tokens
+
+ def __call__(cls, *args, **kwds):
+ """Instantiate cls after preprocessing its token definitions."""
+ if '_tokens' not in cls.__dict__:
+ cls._all_tokens = {}
+ cls._tmpname = 0
+ if hasattr(cls, 'token_variants') and cls.token_variants:
+ # don't process yet
+ pass
+ else:
+ cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
+
+ return type.__call__(cls, *args, **kwds)
+
+class RegexLexer(Lexer, metaclass=RegexLexerMeta):
+ """
+ Base for simple stateful regular expression-based lexers.
+ Simplifies the lexing process so that you need only
+ provide a list of states and regular expressions.
+ """
+
+ #: Flags for compiling the regular expressions.
+ #: Defaults to MULTILINE.
+ flags = re.MULTILINE
+
+ #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
+ #:
+ #: The initial state is 'root'.
+ #: ``new_state`` can be omitted to signify no state transition.
+ #: If it is a string, the state is pushed on the stack and changed.
+ #: If it is a tuple of strings, all states are pushed on the stack and
+ #: the current state will be the topmost.
+ #: It can also be ``combined('state1', 'state2', ...)``
+ #: to signify a new, anonymous state combined from the rules of two
+ #: or more existing ones.
+ #: Furthermore, it can be '#pop' to signify going back one step in
+ #: the state stack, or '#push' to push the current state on the stack
+ #: again.
+ #:
+ #: The tuple can also be replaced with ``include('state')``, in which
+ #: case the rules from the state named by the string are included in the
+ #: current one.
+ tokens = {}
+
+ def get_tokens_unprocessed(self, text, stack=('root',)):
+ """
+ Split ``text`` into (tokentype, text) pairs.
+ ``stack`` is the inital stack (default: ``['root']``)
+ """
+ pos = 0
+ tokendefs = self._tokens
+ statestack = list(stack)
+ statetokens = tokendefs[statestack[-1]]
+ while 1:
+ for rexmatch, action, new_state in statetokens:
+ m = rexmatch(text, pos)
+ if m:
+ if action is not None:
+ if type(action) is _TokenType:
+ yield pos, action, m.group()
+ else:
+ yield from action(self, m)
+ pos = m.end()
+ if new_state is not None:
+ # state transition
+ if isinstance(new_state, tuple):
+ for state in new_state:
+ if state == '#pop':
+ if len(statestack) > 1:
+ statestack.pop()
+ elif state == '#push':
+ statestack.append(statestack[-1])
+ else:
+ statestack.append(state)
+ elif isinstance(new_state, int):
+ # pop, but keep at least one state on the stack
+ # (random code leading to unexpected pops should
+ # not allow exceptions)
+ if abs(new_state) >= len(statestack):
+ del statestack[1:]
+ else:
+ del statestack[new_state:]
+ elif new_state == '#push':
+ statestack.append(statestack[-1])
+ else:
+ assert False, "wrong state def: %r" % new_state
+ statetokens = tokendefs[statestack[-1]]
+ break
+ else:
+ # We are here only if all state tokens have been considered
+ # and there was not a match on any of them.
+ try:
+ if text[pos] == '\n':
+ # at EOL, reset state to "root"
+ statestack = ['root']
+ statetokens = tokendefs['root']
+ yield pos, Text, '\n'
+ pos += 1
+ continue
+ yield pos, Error, text[pos]
+ pos += 1
+ except IndexError:
+ break
+
+class CrowbarLexer(RegexLexer):
+ name = 'Crowbar'
+ aliases = ['crowbar']
+ filenames = ['*.cro', '*.hro']
+
+ tokens = {
+ 'root': [
+ (r'bool|char|double|float|function|int|long|short|signed|unsigned|void', Keyword.Type),
+ (r'const|enum|extern|struct', Keyword.Declaration),
+ (r'include', Keyword.Namespace),
+ (r'break|case|continue|default|do|else|for|fragile|if|return|switch|while', Keyword),
+ (r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*", Name),
+ (r'0[bB][01_]+', Number.Bin),
+ (r'0o[0-7_]+', Number.Oct),
+ (r'0[xX][0-9a-fA-F_]+', Number.Hex),
+ (r'[0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+)', Number.Float),
+ (r'[0-9_]+', Number.Integer),
+ (r"""'([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})'""", String.Char),
+ (r'''([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''', String.Double),
+ (r'sizeof', Operator.Word),
+ (r'//[^\n]*', Comment.Single),
+ (r'/*.*?\*/', Comment.Multiline),
+ (r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[.+\-*/%!&|^~><=]", Operator),
+ (r"[\[\](){},;]", Punctuation),
+ (r".", Text),
+ ]
+ }
+
+def setup(app):
+ app.add_lexer('crowbar', CrowbarLexer)
+
+ return {
+ 'version': '0.1',
+ 'parallel_read_safe': True,
+ 'parallel_write_safe': True,
+ }