# SudachiPyと正規化処理を揃えたかったので、該当処理だけ抜き出しました。
from sudachipy import dictionary
from sudachipy.utf8inputtextbuilder import UTF8InputTextBuilder
# Build the SudachiPy dictionary once, at import time, so every call to
# normalize() reuses the same grammar and plugin list.
# NOTE: this rebinds the name `dictionary` from the imported module to the
# Dictionary instance; the module itself is no longer reachable by that name.
# Dictionary() is called with no arguments (presumably the default
# configuration -- confirm against the installed SudachiPy version).
dictionary = dictionary.Dictionary()
grammar, input_text_plugins = (
    dictionary.grammar,
    dictionary.input_text_plugins,
)
def normalize(text):
    """Return *text* after running it through the dictionary's input-text plugins.

    A ``UTF8InputTextBuilder`` is created from ``text`` and the module-level
    ``grammar``; every plugin in the module-level ``input_text_plugins`` is
    then given the builder, in order, via ``rewrite``. The builder's resulting
    text is returned.

    Args:
        text: The string to normalize.

    Returns:
        The value of ``builder.get_text()`` after all plugins have run
        (presumably the normalized string -- e.g. the demo in this file
        expects ``'ABC123'`` to come back as ``'abc123'``).
    """
    text_builder = UTF8InputTextBuilder(text, grammar)
    # Plugins mutate the builder in place; their order is the dictionary's order.
    for rewriter in input_text_plugins:
        rewriter.rewrite(text_builder)
    normalized = text_builder.get_text()
    return normalized
if __name__ == '__main__':
    # Small demo when run as a script: print the normalized form of a sample.
    normalized_sample = normalize('ABC123')
    print(normalized_sample)  # -> abc123