LLMのlogit_biasのテストで必要になったものの、情報が少なかったので作ってみました。
import unicodedata
# pip install unidata-blocks
from unidata_blocks import get_block_by_name
def in_unicode_char_name(char: str, unicode_name_subword: str) -> bool:
    """Return whether *char* has a Unicode name containing the given substring.

    Args:
        char: A single-character string to inspect.
        unicode_name_subword: Substring to look for in the character's
            Unicode name (e.g. ``"HIRAGANA"``).

    Returns:
        ``True`` if the character is neither a control character (``Cc``)
        nor unassigned (``Cn``) and its Unicode name contains the substring;
        ``False`` otherwise, including for characters that have no name.
    """
    # Control and unassigned code points are rejected up front.
    if unicodedata.category(char) in ("Cc", "Cn"):
        return False
    # Private-use and surrogate code points have no name either; supplying a
    # default keeps unicodedata.name() from raising ValueError for them.
    name = unicodedata.name(char, None)
    return name is not None and unicode_name_subword in name
def chars_from_unicodde_block_and_name(
    unicode_block_name: str, unicode_char_name_subword: str
) -> list[str]:
    """Collect the characters of a Unicode block whose names contain a substring.

    Args:
        unicode_block_name: Block name passed to ``get_block_by_name``
            (e.g. ``"Hiragana"``).
        unicode_char_name_subword: Substring each character's Unicode name
            must contain (e.g. ``"HIRAGANA"``).

    Returns:
        The matching characters in code-point order.

    Raises:
        ValueError: If no block is found for *unicode_block_name*.
    """
    uniblock = get_block_by_name(unicode_block_name)
    # NOTE(review): get_block_by_name appears to return None for an unknown
    # name (confirm against the unidata-blocks docs); fail with a clear
    # message instead of an AttributeError on the attribute access below.
    if uniblock is None:
        raise ValueError(f"Unknown Unicode block name: {unicode_block_name!r}")
    # +1 so that the code point at code_end itself is included in the scan.
    candidates = map(chr, range(uniblock.code_start, uniblock.code_end + 1))
    return [
        c for c in candidates
        if in_unicode_char_name(c, unicode_char_name_subword)
    ]
# Build one character list per script. Each call pairs a Unicode block name
# with the substring that the characters' Unicode names must contain.
hiragana_list = chars_from_unicodde_block_and_name("Hiragana", "HIRAGANA")
katakana_list = chars_from_unicodde_block_and_name("Katakana", "KATAKANA")
kanji_list = chars_from_unicodde_block_and_name(
    "CJK Unified Ideographs", "IDEOGRAPH"
)

# Print the three lists, one per line.
print(hiragana_list, katakana_list, kanji_list, sep="\n")