Python で文字列を正規化した際の正規化前後の文字マッピング

Posted at 2026-03-21

やりたいこと

文字列を正規化した際、正規化後の文字列と正規化前の文字列がどう対応するかのマッピング情報を作成する。

プログラム例

import unicodedata

#
# Convert to NFKD (e.g. a precomposed voiced kana becomes base kana + mark)
# nfkd_idx --> org_idx
def org_to_nfkd(text):
    """Decompose ``text`` character by character into NFKD form.

    Each character of ``text`` is normalized on its own, so every character
    of the result can be traced back to the single input character that
    produced it.

    Args:
        text: The original string.

    Returns:
        A ``(nfkd_text, nfkd_to_org_map)`` tuple. ``nfkd_text`` is the
        concatenation of the per-character decompositions, and
        ``nfkd_to_org_map`` maps every index of ``nfkd_text`` to the index of
        the character in ``text`` it came from.
    """
    # One decomposed fragment per original character, in input order.
    fragments = [unicodedata.normalize("NFKD", ch) for ch in text]

    # Repeat each original index once per character of its fragment.
    origins = [
        org_idx
        for org_idx, fragment in enumerate(fragments)
        for _ in fragment
    ]

    return "".join(fragments), dict(enumerate(origins))

#
# Convert NFKD to NFKC (e.g. base kana + mark becomes a precomposed kana)
# nfkc_idx --> {start_idx: , end_idx: }
def nfkd_to_nfkc(nfkd_text):
    """Compose ``nfkd_text`` into NFKC and record which NFKD span each
    resulting character covers.

    Args:
        nfkd_text: A string in decomposed (NFKD) form.

    Returns:
        A ``(nfkc_text, nfkc_to_nfkd_map)`` tuple. ``nfkc_to_nfkd_map`` maps
        every index of ``nfkc_text`` to a dict with the inclusive
        ``"start_idx"`` and ``"end_idx"`` of the matching run in
        ``nfkd_text``.
    """
    composed = unicodedata.normalize("NFKC", nfkd_text)

    # Number of NFKD characters each composed character expands back into.
    widths = [len(unicodedata.normalize("NFKD", ch)) for ch in composed]

    # NOTE(review): spans are assigned purely by walking these widths left to
    # right, which assumes the re-decomposed characters line up positionally
    # with nfkd_text -- confirm for inputs where normalization reorders
    # combining marks.
    spans = {}
    first = 0
    for nfkc_idx, width in enumerate(widths):
        past_end = first + width
        spans[nfkc_idx] = {"start_idx": first, "end_idx": past_end - 1}
        first = past_end

    return composed, spans

#
# nfkc_idx --> {start_idx: , end_idx: }
def normalize(org_text):
    """Normalize ``org_text`` to NFKC and map each result character back to
    the original string.

    The mapping is built in two hops: NFKC index -> NFKD span (from
    ``nfkd_to_nfkc``) and NFKD index -> original index (from
    ``org_to_nfkd``).

    Args:
        org_text: The original, un-normalized string.

    Returns:
        A ``(nfkc_text, nfkc_to_org_map)`` tuple. ``nfkc_to_org_map`` maps
        every index of ``nfkc_text`` to a dict with the inclusive
        ``"start_idx"`` and ``"end_idx"`` of the corresponding range in
        ``org_text``.
    """
    nfkd_text, nfkd_to_org_map = org_to_nfkd(org_text)
    nfkc_text, nfkc_to_nfkd_map = nfkd_to_nfkc(nfkd_text)

    nfkc_to_org_map = {}
    for nfkc_idx in sorted(nfkc_to_nfkd_map):
        nfkd_span = nfkc_to_nfkd_map[nfkc_idx]
        # Translate both ends of the NFKD span into original-string indices.
        nfkc_to_org_map[nfkc_idx] = {
            "start_idx": nfkd_to_org_map[nfkd_span["start_idx"]],
            "end_idx": nfkd_to_org_map[nfkd_span["end_idx"]],
        }

    return nfkc_text, nfkc_to_org_map


# Demo input: characters the normalization leaves as-is next to ones it
# rewrites, so the printed map shows both one-to-one and merged entries.
text = "a ｂ Ｃ ガ ｶﾞ"
norm_text, norm_to_org_map = normalize(text)
# Print the original text, the normalized text, and the mapping from each
# normalized index to its inclusive index range in the original text.
print(f"text:      {text}")
print(f"norm_text: {norm_text}")
print(f"map:       {norm_to_org_map}")

実行結果

text:      a ｂ Ｃ ガ ｶﾞ
norm_text: a b C ガ ガ
map:       {0: {'start_idx': 0, 'end_idx': 0}, 1: {'start_idx': 1, 'end_idx': 1}, 2: {'start_idx': 2, 'end_idx': 2}, 3: {'start_idx': 3, 'end_idx': 3}, 4: {'start_idx': 4, 'end_idx': 4}, 5: {'start_idx': 5, 'end_idx': 5}, 6: {'start_idx': 6, 'end_idx': 6}, 7: {'start_idx': 7, 'end_idx': 7}, 8: {'start_idx': 8, 'end_idx': 9}}

元の文字列の末尾にある "ｶﾞ"（半角カナ＋半角濁点の 2 文字）は "ガ" 1 文字に変換され、マッピングの最後のエントリ（正規化後のインデックス 8）から、この "ガ" が元の文字列のインデックス 8-9 に該当していることがわかる。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up