PythonでUnicodeブロック名とUnicode文字名の一部から、該当する文字の一覧を返す

Last updated at 2025-02-27 / Posted at 2025-02-27

LLMのlogit_biasのテストで必要になりましたが、情報が少なかったので作ってみました。

import unicodedata

# pip install unidata-blocks
from unidata_blocks import get_block_by_name

def in_unicode_char_name(char: str, unicode_name_subword: str) -> bool:
	"""Return whether ``char`` has a Unicode name containing the given subword.

	Args:
		char: A single-character string.
		unicode_name_subword: Substring to look for in the character's
			Unicode name (e.g. ``"HIRAGANA"``). The match is case-sensitive.

	Returns:
		``False`` for control (``Cc``) and unassigned (``Cn``) characters and
		for any other character that has no Unicode name; otherwise whether
		``unicode_name_subword`` occurs in the character's name.
	"""
	if unicodedata.category(char) in ("Cc", "Cn"):
		return False
	# unicodedata.name() raises ValueError for characters without a name
	# (e.g. private-use and surrogate code points, which are neither Cc nor
	# Cn). Passing a default turns that into "no match" instead of a crash.
	char_name = unicodedata.name(char, None)
	return char_name is not None and unicode_name_subword in char_name

def chars_from_unicodde_block_and_name(unicode_block_name, unicode_char_name_subword):
	"""List the characters of a Unicode block whose names contain a subword.

	Args:
		unicode_block_name: Block name handed to ``get_block_by_name``
			(e.g. ``"Hiragana"``).
		unicode_char_name_subword: Substring each character's Unicode name
			must contain, as checked by ``in_unicode_char_name``.

	Returns:
		A list of single-character strings in ascending code point order.
	"""
	block = get_block_by_name(unicode_block_name)
	matched = []
	# code_end is treated as inclusive, hence the +1 on the range bound.
	for code_point in range(block.code_start, block.code_end + 1):
		candidate = chr(code_point)
		if in_unicode_char_name(candidate, unicode_char_name_subword):
			matched.append(candidate)
	return matched

# Collect, for each block, the characters whose Unicode name contains the keyword.
hiragana_list = chars_from_unicodde_block_and_name("Hiragana", "HIRAGANA")
katakana_list = chars_from_unicodde_block_and_name("Katakana", "KATAKANA")
kanji_list = chars_from_unicodde_block_and_name("CJK Unified Ideographs", "IDEOGRAPH")

# Print the three lists, one per line.
print(hiragana_list, katakana_list, kanji_list, sep="\n")

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up