やりたいこと
文字列を正規化した際、正規化後の文字列と正規化前の文字列がどう対応するかのマッピング情報を作成する。
プログラム例
import unicodedata
#
# NFKD に変換(ガ --> カ")
# nfkd_idx --> org_idx
def org_to_nfkd(text):
    """Decompose ``text`` to NFKD one character at a time, tracking origins.

    Args:
        text: The original (un-normalized) string.

    Returns:
        A ``(nfkd_text, nfkd_to_org_map)`` tuple. ``nfkd_text`` is the
        concatenation of the NFKD form of every character of ``text``.
        ``nfkd_to_org_map`` maps each index of ``nfkd_text`` to the index in
        ``text`` of the character it was decomposed from.
    """
    # NOTE(review): normalizing per character means combining marks are not
    # reordered across character boundaries, so the result may differ from
    # unicodedata.normalize("NFKD", text) for unusual mark orderings - confirm
    # whether such input needs to be supported.
    nfkd_chars = []
    nfkd_to_org_map = {}
    for org_idx, org_char in enumerate(text):
        for nfkd_char in unicodedata.normalize("NFKD", org_char):
            # The next free output position is the current output length.
            nfkd_to_org_map[len(nfkd_chars)] = org_idx
            nfkd_chars.append(nfkd_char)
    return "".join(nfkd_chars), nfkd_to_org_map
#
# NFKD を NFKC に変換(カ" --> ガ)
# nfkc_idx --> {start_idx: , end_idx: }
def nfkd_to_nfkc(nfkd_text):
    """Compose ``nfkd_text`` to NFKC and map each result character back.

    Args:
        nfkd_text: A string in (per-character) NFKD form.

    Returns:
        A ``(nfkc_text, nfkc_to_nfkd_map)`` tuple. ``nfkc_to_nfkd_map`` maps
        each index of ``nfkc_text`` to ``{"start_idx": s, "end_idx": e}``, the
        inclusive range of ``nfkd_text`` indices attributed to that character.
    """
    nfkc_text = unicodedata.normalize("NFKC", nfkd_text)
    nfkc_to_nfkd_map = {}
    nfkd_idx = 0
    for nfkc_idx, nfkc_char in enumerate(nfkc_text):
        # The span an NFKC character covers is taken to be the length of its
        # own NFKD decomposition; spans are laid out back to back.
        span_len = len(unicodedata.normalize("NFKD", nfkc_char))
        nfkc_to_nfkd_map[nfkc_idx] = {
            "start_idx": nfkd_idx,
            "end_idx": nfkd_idx + span_len - 1,
        }
        nfkd_idx += span_len
    return nfkc_text, nfkc_to_nfkd_map
#
# nfkc_idx --> {start_idx: , end_idx: }
def normalize(org_text):
    """NFKC-normalize ``org_text`` and map each result character to its source.

    The mapping is built by chaining two steps: original -> NFKD (via
    ``org_to_nfkd``) and NFKD -> NFKC (via ``nfkd_to_nfkc``).

    Args:
        org_text: The original (un-normalized) string.

    Returns:
        A ``(nfkc_text, nfkc_to_org_map)`` tuple. ``nfkc_to_org_map`` maps
        each index of ``nfkc_text`` to ``{"start_idx": s, "end_idx": e}``, the
        inclusive range of ``org_text`` indices it was derived from.
    """
    nfkd_text, nfkd_to_org_map = org_to_nfkd(org_text)
    nfkc_text, nfkc_to_nfkd_map = nfkd_to_nfkc(nfkd_text)
    # nfkc_to_nfkd_map is filled in ascending index order, so iterating it
    # directly yields the same ordering as sorting its keys.
    nfkc_to_org_map = {
        nfkc_idx: {
            "start_idx": nfkd_to_org_map[nfkd_span["start_idx"]],
            "end_idx": nfkd_to_org_map[nfkd_span["end_idx"]],
        }
        for nfkc_idx, nfkd_span in nfkc_to_nfkd_map.items()
    }
    return nfkc_text, nfkc_to_org_map
# Demo input. The final character is the half-width katakana "GA", which is
# two code points (U+FF76 HALFWIDTH KATAKANA LETTER KA + U+FF9E HALFWIDTH
# VOICED SOUND MARK). Escapes are used so the sample cannot be silently
# normalized by an editor or a re-encoding step; with a pre-normalized literal
# every output index would map to exactly one input index and the demo would
# show nothing.
text = "a b C \u30ac \uff76\uff9e"
norm_text, norm_to_org_map = normalize(text)
print(f"text: {text}")
print(f"norm_text: {norm_text}")
print(f"map: {norm_to_org_map}")
実行結果
実行結果
text: a b C ガ ガ
norm_text: a b C ガ ガ
map: {0: {'start_idx': 0, 'end_idx': 0}, 1: {'start_idx': 1, 'end_idx': 1}, 2: {'start_idx': 2, 'end_idx': 2}, 3: {'start_idx': 3, 'end_idx': 3}, 4: {'start_idx': 4, 'end_idx': 4}, 5: {'start_idx': 5, 'end_idx': 5}, 6: {'start_idx': 6, 'end_idx': 6}, 7: {'start_idx': 7, 'end_idx': 7}, 8: {'start_idx': 8, 'end_idx': 9}}
半角の "ガ"(U+FF76 U+FF9E の 2 文字)は全角の "ガ"(U+30AC の 1 文字)に変換され、正規化後の "ガ"(インデックス 8)は元の文字列のインデックス 8-9 に該当していることがわかる。