from pygments.lexer import Lexer, LexerMeta, _TokenType, inherit, combined, include, _inherit, default from pygments.token import * from pygments.util import Future import regex as re # we're overriding here so we can use a unicode-aware regex library class RegexLexerMeta(LexerMeta): """ Metaclass for RegexLexer, creates the self._tokens attribute from self.tokens on the first instantiation. """ def _process_regex(cls, regex, rflags, state): """Preprocess the regular expression component of a token definition.""" if isinstance(regex, Future): regex = regex.get() return re.compile(regex, rflags).match def _process_token(cls, token): """Preprocess the token component of a token definition.""" assert type(token) is _TokenType or callable(token), \ 'token type must be simple type or callable, not %r' % (token,) return token def _process_new_state(cls, new_state, unprocessed, processed): """Preprocess the state transition action of a token definition.""" if isinstance(new_state, str): # an existing state if new_state == '#pop': return -1 elif new_state in unprocessed: return (new_state,) elif new_state == '#push': return new_state elif new_state[:5] == '#pop:': return -int(new_state[5:]) else: assert False, 'unknown new state %r' % new_state elif isinstance(new_state, combined): # combine a new state from existing ones tmp_state = '_tmp_%d' % cls._tmpname cls._tmpname += 1 itokens = [] for istate in new_state: assert istate != new_state, 'circular state ref %r' % istate itokens.extend(cls._process_state(unprocessed, processed, istate)) processed[tmp_state] = itokens return (tmp_state,) elif isinstance(new_state, tuple): # push more than one state for istate in new_state: assert (istate in unprocessed or istate in ('#pop', '#push')), \ 'unknown new state ' + istate return new_state else: assert False, 'unknown new state def %r' % new_state def _process_state(cls, unprocessed, processed, state): """Preprocess a single state definition.""" assert type(state) is str, "wrong state name %r" % state assert state[0] != '#', "invalid state name %r" % state if state in processed: return processed[state] tokens = processed[state] = [] rflags = cls.flags for tdef in unprocessed[state]: if isinstance(tdef, include): # it's a state reference assert tdef != state, "circular state reference %r" % state tokens.extend(cls._process_state(unprocessed, processed, str(tdef))) continue if isinstance(tdef, _inherit): # should be processed already, but may not in the case of: # 1. the state has no counterpart in any parent # 2. the state includes more than one 'inherit' continue if isinstance(tdef, default): new_state = cls._process_new_state(tdef.state, unprocessed, processed) tokens.append((re.compile('').match, None, new_state)) continue assert type(tdef) is tuple, "wrong rule def %r" % tdef try: rex = cls._process_regex(tdef[0], rflags, state) except Exception as err: raise ValueError("uncompilable regex %r in state %r of %r: %s" % (tdef[0], state, cls, err)) from err token = cls._process_token(tdef[1]) if len(tdef) == 2: new_state = None else: new_state = cls._process_new_state(tdef[2], unprocessed, processed) tokens.append((rex, token, new_state)) return tokens def process_tokendef(cls, name, tokendefs=None): """Preprocess a dictionary of token definitions.""" processed = cls._all_tokens[name] = {} tokendefs = tokendefs or cls.tokens[name] for state in list(tokendefs): cls._process_state(tokendefs, processed, state) return processed def get_tokendefs(cls): """ Merge tokens from superclasses in MRO order, returning a single tokendef dictionary. Any state that is not defined by a subclass will be inherited automatically. States that *are* defined by subclasses will, by default, override that state in the superclass. If a subclass wishes to inherit definitions from a superclass, it can use the special value "inherit", which will cause the superclass' state definition to be included at that point in the state. """ tokens = {} inheritable = {} for c in cls.__mro__: toks = c.__dict__.get('tokens', {}) for state, items in toks.items(): curitems = tokens.get(state) if curitems is None: # N.b. because this is assigned by reference, sufficiently # deep hierarchies are processed incrementally (e.g. for # A(B), B(C), C(RegexLexer), B will be premodified so X(B) # will not see any inherits in B). tokens[state] = items try: inherit_ndx = items.index(inherit) except ValueError: continue inheritable[state] = inherit_ndx continue inherit_ndx = inheritable.pop(state, None) if inherit_ndx is None: continue # Replace the "inherit" value with the items curitems[inherit_ndx:inherit_ndx+1] = items try: # N.b. this is the index in items (that is, the superclass # copy), so offset required when storing below. new_inh_ndx = items.index(inherit) except ValueError: pass else: inheritable[state] = inherit_ndx + new_inh_ndx return tokens def __call__(cls, *args, **kwds): """Instantiate cls after preprocessing its token definitions.""" if '_tokens' not in cls.__dict__: cls._all_tokens = {} cls._tmpname = 0 if hasattr(cls, 'token_variants') and cls.token_variants: # don't process yet pass else: cls._tokens = cls.process_tokendef('', cls.get_tokendefs()) return type.__call__(cls, *args, **kwds) class RegexLexer(Lexer, metaclass=RegexLexerMeta): """ Base for simple stateful regular expression-based lexers. Simplifies the lexing process so that you need only provide a list of states and regular expressions. """ #: Flags for compiling the regular expressions. #: Defaults to MULTILINE. flags = re.MULTILINE #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}`` #: #: The initial state is 'root'. #: ``new_state`` can be omitted to signify no state transition. #: If it is a string, the state is pushed on the stack and changed. #: If it is a tuple of strings, all states are pushed on the stack and #: the current state will be the topmost. #: It can also be ``combined('state1', 'state2', ...)`` #: to signify a new, anonymous state combined from the rules of two #: or more existing ones. #: Furthermore, it can be '#pop' to signify going back one step in #: the state stack, or '#push' to push the current state on the stack #: again. #: #: The tuple can also be replaced with ``include('state')``, in which #: case the rules from the state named by the string are included in the #: current one. tokens = {} def get_tokens_unprocessed(self, text, stack=('root',)): """ Split ``text`` into (tokentype, text) pairs. ``stack`` is the inital stack (default: ``['root']``) """ pos = 0 tokendefs = self._tokens statestack = list(stack) statetokens = tokendefs[statestack[-1]] while 1: for rexmatch, action, new_state in statetokens: m = rexmatch(text, pos) if m: if action is not None: if type(action) is _TokenType: yield pos, action, m.group() else: yield from action(self, m) pos = m.end() if new_state is not None: # state transition if isinstance(new_state, tuple): for state in new_state: if state == '#pop': if len(statestack) > 1: statestack.pop() elif state == '#push': statestack.append(statestack[-1]) else: statestack.append(state) elif isinstance(new_state, int): # pop, but keep at least one state on the stack # (random code leading to unexpected pops should # not allow exceptions) if abs(new_state) >= len(statestack): del statestack[1:] else: del statestack[new_state:] elif new_state == '#push': statestack.append(statestack[-1]) else: assert False, "wrong state def: %r" % new_state statetokens = tokendefs[statestack[-1]] break else: # We are here only if all state tokens have been considered # and there was not a match on any of them. try: if text[pos] == '\n': # at EOL, reset state to "root" statestack = ['root'] statetokens = tokendefs['root'] yield pos, Text, '\n' pos += 1 continue yield pos, Error, text[pos] pos += 1 except IndexError: break class CrowbarLexer(RegexLexer): name = 'Crowbar' aliases = ['crowbar'] filenames = ['*.cro', '*.hro'] tokens = { 'root': [ (r'bool|char|double|float|function|int|long|short|signed|unsigned|void', Keyword.Type), (r'const|enum|extern|struct|union', Keyword.Declaration), (r'include', Keyword.Namespace), (r'break|case|continue|default|do|else|for|fragile|if|return|switch|while', Keyword), (r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*", Name), (r'0[bB][01_]+', Number.Bin), (r'0o[0-7_]+', Number.Oct), (r'0[xX][0-9a-fA-F_]+', Number.Hex), (r'[0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+)', Number.Float), (r'[0-9_]+', Number.Integer), (r"""'([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})'""", String.Char), (r'''([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''', String.Double), (r'sizeof', Operator.Word), (r'//[^\n]*', Comment.Single), (r'/*.*?\*/', Comment.Multiline), (r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[.+\-*/%!&|^~><=]", Operator), (r"[\[\](){},;]", Punctuation), (r".", Text), ] } def setup(app): app.add_lexer('crowbar', CrowbarLexer) return { 'version': '0.1', 'parallel_read_safe': True, 'parallel_write_safe': True, }