Keyword processing
import re


def lexer(s, rules, skip_whitespace=True, errors="ignore"):
    """
    Parameters:
        s : (str) string to tokenize
        rules : (dict, list of tuples, or tuple of tuples) regex pattern -> token type
        skip_whitespace : (bool) if True (default), whitespace is skipped;
            if False, each run of whitespace is emitted as a WHITESPACE token.
        errors : (str) if "ignore", characters no rule matches are emitted as
            UNKNOWN tokens; any other value raises ValueError at the first
            unmatched character.
    Example:
        >>> RULES = {
        ...     "[0-9]+" : "NUMBERS",
        ...     "[a-z]+" : "IDENTIFIERS",
        ...     "\\*|\\+": "OPERATORS"
        ... }
        >>> text = " hello how are 2 * 3 you? 123 4567867*98"
        >>> list(lexer(text, RULES))
        [('IDENTIFIERS', 'hello'), ('IDENTIFIERS', 'how'),
         ('IDENTIFIERS', 'are'), ('NUMBERS', '2'), ('OPERATORS', '*'),
         ('NUMBERS', '3'), ('IDENTIFIERS', 'you'), ('UNKNOWN', '?'),
         ('NUMBERS', '123'), ('NUMBERS', '4567867'), ('OPERATORS', '*'),
         ('NUMBERS', '98')]
    """
    if isinstance(rules, dict):
        rules = rules.items()
    try:
        # Precompile the rules and sort them so that longer regex patterns are
        # tried first (a rough longest-match heuristic based on pattern length).
        uniqr = set((len(k), re.compile(k).match, v) for k, v in rules)
        regexes = sorted(uniqr, key=lambda x: x[0], reverse=True)
    except TypeError:
        raise TypeError("rules must be a dict, a list of tuples, or a tuple of tuples")
    re_ws = re.compile(r"\s+").match
    pos = 0
    length = len(s)
    while pos < length:
        # Whitespace: skip it, or emit it as a token when skip_whitespace is False.
        n = re_ws(s, pos)
        if n:
            pos = n.end()
            if not skip_whitespace:
                yield "WHITESPACE", n.group(0)
            continue
        # Try each rule at the current position, longest pattern first.
        m = None
        for _, reg, typ in regexes:
            m = reg(s, pos)
            if m:
                pos = m.end()
                yield typ, m.group(0)
                break
        if m:
            continue
        # No rule matched: emit UNKNOWN or fail, depending on `errors`.
        val = s[pos]
        pos += 1
        if errors == "ignore":
            yield "UNKNOWN", val
        else:
            raise ValueError(f"no rule matched: {val!r} at position {pos - 1}")
A simpler approach
If you don't need to detect undefined words (UNKNOWN above), the implementation itself gets much simpler, but the complexity becomes O(NM) because every rule runs re.finditer over the entire string, which is inefficient.
The third function is shelved material: riding the momentum I even got it down to a one-liner, but it was actually slower, so there was no point.
def lexer_lazy(s, rules):
    if isinstance(rules, dict):
        rules = rules.items()
    try:
        # Run every rule over the whole string, then sort all matches by start position.
        mt = [(it.start(), v, it.group()) for k, v in rules for it in re.finditer(k, s)]
    except TypeError:
        raise TypeError("rules must be a dict, a list of tuples, or a tuple of tuples")
    mt.sort()
    return ((v, g) for _, v, g in mt)


def lexer_okura(s, rules):
    # Honestly, this is going too far...
    return ((v, g) for _, v, g in sorted((it.start(), v, it.group()) for k, v in rules.items() for it in re.finditer(k, s)))
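To make that trade-off concrete (a small hypothetical check, reusing RULES from above): lexer_lazy only collects what the rules match, so the stray '?' that lexer reported as UNKNOWN silently disappears from the output.

>>> list(lexer_lazy(" 2 * 3 you?", RULES))
[('NUMBERS', '2'), ('OPERATORS', '*'), ('NUMBERS', '3'), ('IDENTIFIERS', 'you')]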
Throughput measurement
>>> RULES = {
... "[0-9]+" : "NUMBERS" ,
... "[a-z]+" : "IDENTIFIERS" ,
... "\\*|\\+": "OPERATORS"
... }
>>>
>>> text = " hello how are 2 * 3 you? 123 4567867*98"
>>>
>>> for token in lexer(text, RULES):
... print(token)
('IDENTIFIERS', 'hello')
('IDENTIFIERS', 'how')
('IDENTIFIERS', 'are')
('NUMBERS', '2')
('OPERATORS', '*')
('NUMBERS', '3')
('IDENTIFIERS', 'you')
('UNKNOWN', '?')
('NUMBERS', '123')
('NUMBERS', '4567867')
('OPERATORS', '*')
('NUMBERS', '98')
>>>
>>> from timeit import timeit
>>> loop = 10000
>>> unitsec = 1000000
>>> print(timeit("list(lexer(text, RULES))", globals=globals(), number=loop) / loop * unitsec, "microsec/1time")
172.33013000000003 microsec/1time
>>> print(timeit("list(lexer_lazy(text, RULES))", globals=globals(), number=loop) / loop * unitsec, "microsec/1time")
69.40176999999998 microsec/1time
>>> print(timeit("list(lexer_okura(text, RULES))", globals=globals(), number=loop) / loop * unitsec, "microsec/1time")
118.08776 microsec/1time
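For reference (not part of the original measurements), a common way to narrow this gap while keeping UNKNOWN detection is the named-group trick from the re module documentation's tokenizer example: join all rules into a single pattern so that each position costs one C-level match instead of one match attempt per rule. A sketch under that assumption, with hypothetical group names g0, g1, ...:

def lexer_combined(s, rules, skip_whitespace=True, errors="ignore"):
    # Sketch only: wrap each rule in a named group and join them with "|".
    # Caveats: branches are tried in rule order (not longest-pattern-first),
    # and rules containing their own named groups would need extra care.
    if isinstance(rules, dict):
        rules = rules.items()
    rules = list(rules)
    match_all = re.compile("|".join(f"(?P<g{i}>{k})" for i, (k, _) in enumerate(rules))).match
    types = {f"g{i}": v for i, (_, v) in enumerate(rules)}
    re_ws = re.compile(r"\s+").match
    pos, length = 0, len(s)
    while pos < length:
        n = re_ws(s, pos)
        if n:
            if not skip_whitespace:
                yield "WHITESPACE", n.group(0)
            pos = n.end()
            continue
        m = match_all(s, pos)
        if m:
            # lastgroup names the branch that matched, which maps back to its token type.
            yield types[m.lastgroup], m.group(0)
            pos = m.end()
            continue
        if errors == "ignore":
            yield "UNKNOWN", s[pos]
            pos += 1
        else:
            raise ValueError(f"no rule matched: {s[pos]!r} at position {pos}")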