Keyword processing
import re


def lexer(s, rules, skip_whitespace=True, errors="ignore"):
    """
    Parameters:
        s : (str) string to tokenize
        rules : (dict, list of tuples, or tuple of tuples) regex pattern -> token type
        skip_whitespace : (bool) if True (default), whitespace is skipped;
            if False, each run of whitespace is emitted as a WHITESPACE token.
        errors : (str) if "ignore", characters no rule matches are emitted as
            UNKNOWN tokens; any other value raises ValueError at the first
            unmatched character.
    Example:
        >>> RULES = {
        ...     "[0-9]+" : "NUMBERS",
        ...     "[a-z]+" : "IDENTIFIERS",
        ...     "\\*|\\+": "OPERATORS"
        ... }
        >>> text = " hello how are 2 * 3 you? 123 4567867*98"
        >>> list(lexer(text, RULES))
        [('IDENTIFIERS', 'hello'), ('IDENTIFIERS', 'how'),
         ('IDENTIFIERS', 'are'), ('NUMBERS', '2'), ('OPERATORS', '*'),
         ('NUMBERS', '3'), ('IDENTIFIERS', 'you'), ('UNKNOWN', '?'),
         ('NUMBERS', '123'), ('NUMBERS', '4567867'), ('OPERATORS', '*'),
         ('NUMBERS', '98')]
    """
    if isinstance(rules, dict):
        rules = rules.items()
    try:
        # Precompile the rules and sort them so that longer regex patterns are
        # tried first (a rough longest-match heuristic based on pattern length).
        uniqr = set((len(k), re.compile(k).match, v) for k, v in rules)
        regexes = sorted(uniqr, key=lambda x: x[0], reverse=True)
    except TypeError:
        raise TypeError("rules must be a dict, a list of tuples, or a tuple of tuples")
    re_ws = re.compile(r"\s+").match
    pos = 0
    length = len(s)
    while pos < length:
        # Whitespace: skip it, or emit it as a token when skip_whitespace is False.
        n = re_ws(s, pos)
        if n:
            pos = n.end()
            if not skip_whitespace:
                yield "WHITESPACE", n.group(0)
            continue
        # Try each rule at the current position, longest pattern first.
        m = None
        for _, reg, typ in regexes:
            m = reg(s, pos)
            if m:
                pos = m.end()
                yield typ, m.group(0)
                break
        if m:
            continue
        # No rule matched: emit UNKNOWN or fail, depending on `errors`.
        val = s[pos]
        pos += 1
        if errors == "ignore":
            yield "UNKNOWN", val
        else:
            raise ValueError(f"no rule matched: {val!r} at position {pos - 1}")
A simpler approach
If you don't need to detect undefined words (UNKNOWN above), the implementation itself gets much simpler, but the complexity becomes O(NM) because every rule runs re.finditer over the entire string, which is inefficient.
The third function is shelved material: riding the momentum I even got it down to a one-liner, but it was actually slower, so there was no point.
def lexer_lazy(s, rules):
    if isinstance(rules, dict):
        rules = rules.items()
    try:
        # Run every rule over the whole string, then sort all matches by start position.
        mt = [(it.start(), v, it.group()) for k, v in rules for it in re.finditer(k, s)]
    except TypeError:
        raise TypeError("rules must be a dict, a list of tuples, or a tuple of tuples")
    mt.sort()
    return ((v, g) for _, v, g in mt)


def lexer_okura(s, rules):
    # Honestly, this is going too far...
    return ((v, g) for _, v, g in sorted((it.start(), v, it.group()) for k, v in rules.items() for it in re.finditer(k, s)))
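To make that trade-off concrete (a small hypothetical check, reusing RULES from above): lexer_lazy only collects what the rules match, so the stray '?' that lexer reported as UNKNOWN silently disappears from the output.

>>> list(lexer_lazy(" 2 * 3 you?", RULES))
[('NUMBERS', '2'), ('OPERATORS', '*'), ('NUMBERS', '3'), ('IDENTIFIERS', 'you')]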
Throughput measurement
>>> RULES = {
... "[0-9]+" : "NUMBERS" ,
... "[a-z]+" : "IDENTIFIERS" ,
... "\\*|\\+": "OPERATORS"
... }
>>>
>>> text = " hello how are 2 * 3 you? 123 4567867*98"
>>>
>>> for token in lexer(text, RULES):
... print(token)
('IDENTIFIERS', 'hello')
('IDENTIFIERS', 'how')
('IDENTIFIERS', 'are')
('NUMBERS', '2')
('OPERATORS', '*')
('NUMBERS', '3')
('IDENTIFIERS', 'you')
('UNKNOWN', '?')
('NUMBERS', '123')
('NUMBERS', '4567867')
('OPERATORS', '*')
('NUMBERS', '98')
>>>
>>> from timeit import timeit
>>> loop = 10000
>>> unitsec = 1000000
>>> print(timeit("list(lexer(text, RULES))", globals=globals(), number=loop) / loop * unitsec, "microsec/1time")
172.33013000000003 microsec/1time
>>> print(timeit("list(lexer_lazy(text, RULES))", globals=globals(), number=loop) / loop * unitsec, "microsec/1time")
69.40176999999998 microsec/1time
>>> print(timeit("list(lexer_okura(text, RULES))", globals=globals(), number=loop) / loop * unitsec, "microsec/1time")
118.08776 microsec/1time
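For reference (not part of the original measurements), a common way to narrow this gap while keeping UNKNOWN detection is the named-group trick from the re module documentation's tokenizer example: join all rules into a single pattern so that each position costs one C-level match instead of one match attempt per rule. A sketch under that assumption, with hypothetical group names g0, g1, ...:

def lexer_combined(s, rules, skip_whitespace=True, errors="ignore"):
    # Sketch only: wrap each rule in a named group and join them with "|".
    # Caveats: branches are tried in rule order (not longest-pattern-first),
    # and rules containing their own named groups would need extra care.
    if isinstance(rules, dict):
        rules = rules.items()
    rules = list(rules)
    match_all = re.compile("|".join(f"(?P<g{i}>{k})" for i, (k, _) in enumerate(rules))).match
    types = {f"g{i}": v for i, (_, v) in enumerate(rules)}
    re_ws = re.compile(r"\s+").match
    pos, length = 0, len(s)
    while pos < length:
        n = re_ws(s, pos)
        if n:
            if not skip_whitespace:
                yield "WHITESPACE", n.group(0)
            pos = n.end()
            continue
        m = match_all(s, pos)
        if m:
            # lastgroup names the branch that matched, which maps back to its token type.
            yield types[m.lastgroup], m.group(0)
            pos = m.end()
            continue
        if errors == "ignore":
            yield "UNKNOWN", s[pos]
            pos += 1
        else:
            raise ValueError(f"no rule matched: {s[pos]!r} at position {pos}")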