0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

anonym

Last updated at Posted at 2025-06-15
# Python は 3.11 以下でないと、インストール時に Rust 関連のエラーが出るので注意
pip install pandas
pip install ja_ginza_electra
  • 以下は不要
# pip install -U spacy
# pip install -U spacy-transformers
# pip install -U transformers
# pip install ginza
import pandas as pd
import spacy
import re
import logging

# Logger configuration: set the root logging level to INFO and create the
# module-level logger used by the functions in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ================================================================
# GiNZAモデルの読み込み
# ================================================================
def load_nlp_model():
    """Load the ``ja_ginza_electra`` spaCy pipeline.

    Returns:
        The pipeline object returned by ``spacy.load``, or ``None`` when
        ``spacy.load`` raises ``OSError``. In that case a warning and an
        installation hint are logged.
    """
    try:
        pipeline = spacy.load("ja_ginza_electra")
    except OSError:
        logger.warning("GiNZAモデル 'ja_ginza_electra' が見つかりません。")
        logger.info("インストールしてください: pip install -U ginza ja_ginza_electra")
        pipeline = None
    return pipeline

# Module-level pipeline, loaded once at import time. It is None when
# load_nlp_model() could not load the model.
nlp = load_nlp_model()

# ================================================================
# 1. 正規表現による定形情報の匿名化
# ================================================================
# Number boundaries for hyphenated digit groups.
# ``\b`` is NOT usable here: in Python 3 ``\w`` is Unicode-aware, so kana and
# kanji count as word characters and there is no word boundary between e.g.
# "は" and "1". These lookarounds reject only an adjacent digit or ASCII
# word character, so numbers embedded in Japanese sentences are matched.
_NUM_START = r'(?<![\dA-Za-z_])'
_NUM_END = r'(?![\dA-Za-z_])'

# (compiled pattern, placeholder) pairs, applied in this order.
# Order matters: longer or more specific rules must run before shorter or
# more general ones, otherwise the general rule consumes part of the value.
_PII_PATTERNS = [
    # 20 digits in groups of four (rare, kept as a precaution). Must precede
    # the 16-digit rule, which would otherwise match its first four groups.
    (re.compile(_NUM_START + r'\d{4}(?:-\d{4}){4}' + _NUM_END), '[CARD]'),
    # Credit card number (16 digits).
    (re.compile(_NUM_START + r'\d{4}(?:-\d{4}){3}' + _NUM_END), '[CARD]'),
    # My Number (12 digits).
    (re.compile(_NUM_START + r'\d{4}(?:-\d{4}){2}' + _NUM_END), '[MYNUMBER]'),
    (re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'), '[EMAIL]'),
    # Phone number that is already masked, e.g. 03-XXXX-XXXX. The full-width
    # asterisk is accepted in addition to the ASCII one.
    (re.compile(r'\d{2,4}-[Xx*＊]{4}-[Xx*＊]{4}'), '[PHONE]'),
    # Postal code. Runs before the phone rule: a code that starts with 0
    # (e.g. 〒060-0001) would otherwise be consumed as a phone number.
    (re.compile(r'〒\s*\d{3}-\d{4}'), '[POSTAL_CODE]'),
    # Ordinary phone number starting with 0.
    (re.compile(r'\(?0\d{1,4}\)?[- ]?\d{1,4}[- ]?\d{3,4}'), '[PHONE]'),
    (re.compile(r'\d{4}年\d{1,2}月\d{1,2}日'), '[DATE]'),
    (re.compile(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}(日)?'), '[DATE]'),
]


def anonymize_pii_by_regex(text: str) -> str:
    """Replace fixed-format personal data in *text* with placeholders.

    Handles hyphenated card numbers and My Number values, e-mail addresses,
    phone numbers (plain and pre-masked), postal codes prefixed with ``〒``,
    and dates.

    Args:
        text: The text to anonymize.

    Returns:
        The text with every match replaced by its placeholder
        (``[CARD]``, ``[MYNUMBER]``, ``[EMAIL]``, ``[PHONE]``,
        ``[POSTAL_CODE]``, ``[DATE]``). Non-string input is returned
        unchanged.
    """
    if not isinstance(text, str):
        return text
    for pattern, placeholder in _PII_PATTERNS:
        text = pattern.sub(placeholder, text)
    return text
# ================================================================
# 2. GiNZAとルールベースの匿名化
# ================================================================
# Named-entity label -> placeholder written into the output.
# NOTE(review): the keys mix two label styles ("Person"/"Postal_Address" vs.
# "GPE"/"ORG"/...); confirm which label set the loaded model actually emits.
_PLACEHOLDER_BY_LABEL = {
    "Person": "[PERSON]",
    "Postal_Address": "[ADDRESS]",
    "GPE": "[LOCATION]",
    "LOC": "[LOCATION]",
    "ORG": "[ORGANIZATION]",
    "DATE": "[DATE]",
    "TIME": "[TIME]",
    "MONEY": "[MONEY]",
    "NORP": "[GROUP]",
    "EVENT": "[EVENT]",
    "PRODUCT": "[PRODUCT]",
}

# Labels whose entity may be followed by a street/block number to absorb.
_LOCATION_LABELS = frozenset({"GPE", "LOC", "Postal_Address"})

# Optional separators, then up to three chunks that each end in a digit,
# a kanji numeral, a hyphen, or one of the characters 丁 目 番 地 号 条.
_ADDRESS_SUFFIX_RE = re.compile(
    r'[\s、:：\-‐ー]*(?:[^\s、。]{1,10}[0-9一二三四五六七八九十\-丁目番地号条]){1,3}'
)


def anonymize_by_ginza_and_rules(text: str) -> str:
    """Replace named entities found by the ``nlp`` pipeline with placeholders.

    Entities whose label is in the placeholder table are replaced. For
    location-like labels the replaced span is extended over a trailing
    street/block-number suffix.

    Args:
        text: The text to anonymize.

    Returns:
        The anonymized text. The input is returned unchanged when no
        pipeline is loaded or when it is not a string.
    """
    if not nlp or not isinstance(text, str):
        return text

    doc = nlp(text)

    # Walk the entities right-to-left and assemble the output from slices of
    # the ORIGINAL text, so every offset refers to the original string.
    # ``cursor`` is the start of the region already emitted. The suffix match
    # is clamped to it so an extended address span can never run into a
    # later entity that has already been replaced.
    pieces = []
    cursor = len(text)
    for ent in reversed(doc.ents):
        placeholder = _PLACEHOLDER_BY_LABEL.get(ent.label_)
        if placeholder is None:
            continue
        start, end = ent.start_char, ent.end_char

        if ent.label_ in _LOCATION_LABELS:
            # Absorb a street/block number that follows the place name.
            suffix_match = _ADDRESS_SUFFIX_RE.match(text, end, cursor)
            if suffix_match:
                end = suffix_match.end()

        pieces.append(text[end:cursor])
        pieces.append(placeholder)
        cursor = start

    pieces.append(text[:cursor])
    return ''.join(reversed(pieces))

# ================================================================
# 3. カスタム辞書による置換処理
# ================================================================
# Default word -> replacement table used by the custom-dictionary stage.
# An empty replacement deletes the word.
#
# NOTE(review): three keys had been lost and were empty strings. An empty key
# makes str.replace() insert the replacement between every character. They
# are restored here as "様", "君" and "彼"; "彼" is used in the sample data,
# the two honorifics are a best guess. Confirm against the intended list.
DEFAULT_CUSTOM_DICT = {
    # Job titles.
    "代表取締役": "[ROLE]", "取締役": "[ROLE]", "部長": "[ROLE]", "課長": "[ROLE]",
    "係長": "[ROLE]", "主任": "[ROLE]", "チームリーダー": "[ROLE]", "マネージャー": "[ROLE]",
    # Honorifics are removed.
    "さん": "", "様": "", "君": "", "ちゃん": "",
    # Academic degrees.
    "博士": "[DEGREE]", "修士": "[DEGREE]", "学士": "[DEGREE]", "PhD": "[DEGREE]",
    # Birth-date related words.
    "生年月日": "***", "生まれ": "***", "誕生日": "***",
    # Pronouns (the stray extra "]" in the "彼女" placeholder is removed).
    "彼女": "[PERSON]", "彼": "[PERSON]", "彼ら": "[PERSON]", "彼女たち": "[PERSON]",
    # "マイナンバー" is listed with its long-vowel mark so that no dangling
    # "ー" is left behind; the shorter spelling is kept as a fallback.
    "マイナンバー": "[NUMBER]", "マイナンバ": "[NUMBER]", "カード": "***",
    "クレジットカード": "****", "電話番号": "[PHONE]", "メールアドレス": "[EMAIL]",
}

def anonymize_by_custom_dict(text: str, custom_dict: dict) -> str:
    """Replace every dictionary word found in *text* with its replacement.

    Words are applied longest first, so a long entry such as
    "クレジットカード" wins over a shorter entry it contains ("カード").

    Args:
        text: The text to process.
        custom_dict: Mapping of word -> replacement string.

    Returns:
        The text after all replacements. Non-string input is returned
        unchanged.
    """
    if not isinstance(text, str):
        return text
    for word in sorted(custom_dict, key=len, reverse=True):
        if not word:
            # str.replace("", x) inserts x between every character, so an
            # empty key would wreck the whole text; ignore it.
            continue
        text = text.replace(word, custom_dict[word])
    return text

# ================================================================
# 統括匿名化関数
# ================================================================
def anonymize_text(text: str) -> str:
    """Run the full anonymization pipeline on *text*.

    Stages, in order: regex rules, named-entity replacement, custom
    dictionary, then whitespace normalization (runs of whitespace collapse
    to one space and the ends are stripped).

    Args:
        text: The text to anonymize.

    Returns:
        The anonymized text. Non-string input (e.g. ``None`` or a NaN cell
        from a DataFrame) is returned unchanged, matching what the
        individual stages do, instead of failing in ``re.sub``.
    """
    if not isinstance(text, str):
        return text
    text = anonymize_pii_by_regex(text)
    text = anonymize_by_ginza_and_rules(text)
    text = anonymize_by_custom_dict(text, DEFAULT_CUSTOM_DICT)
    return re.sub(r'\s+', ' ', text).strip()

# ================================================================
# テストデータの処理
# ================================================================
# Sample records: ten rows, each with a numeric ID and a free-text sentence
# that mixes names, e-mail addresses, postal addresses, phone numbers,
# birth dates and card / My Number values.
data = {
    "ID": list(range(1, 11)),
    "内容": [
        "山田太郎さんのメールは yamada.taro@example.com です。住所は東京都新宿区西新宿2-8-1。",
        "佐藤花子(sato.hanako@sample.jp)は大阪府大阪市北区梅田1丁目に住んでいます。電話番号は090-1234-5678です。",
        "田中一郎の住所:愛知県名古屋市中区栄3-5-7、連絡先:ichiro.tanaka@mail.ne.jp。生年月日は1985年5月10日。",
        "メール:kana.kawasaki@gmail.com。川崎佳奈の住所は福岡県福岡市中央区天神1丁目。彼女の電話番号は080-9876-5432です。",
        "鈴木次郎(メール: jiro_suzuki@foo.co.jp)に連絡を。住所:北海道札幌市北区北7条西5丁目。彼の生年月日は1992年11月22日。",
        "高橋美咲のメールアドレスは misaki.takahashi@web.co.jp です。東京都渋谷区神南1-2-3にお住まいです。電話番号は03-XXXX-XXXX(秘匿)。",
        "小林健太の連絡先はkenta.kobayashi@example.org、マイナンバーは1234-5678-9012です。彼は神奈川県横浜市中区日本大通1にいます。",
        "渡辺直美は1978年7月15日生まれ。彼女のクレジットカード番号は1234-5678-9012-3456です。住所は京都府京都市下京区四条通烏丸東入。",
        "伊藤大輔のメールはdaisuke.ito@mymail.com。大阪府吹田市千里万博公園1-1に住んでいます。彼の電話は070-1111-2222です。",
        "山本裕子の住所は埼玉県さいたま市大宮区桜木町1-1-1です。彼女のメールアドレスはyuko.yamamoto@domain.jpで、生年月日は1990年3月3日です。"
    ]
}
df = pd.DataFrame(data)


# ================================================================
# Run anonymization pass 1
# ================================================================
# Apply the full pipeline to every sentence and store the result in a new
# column next to the original text.
df["匿名化後"] = df["内容"].apply(anonymize_text)


# ================================================================
# 匿名化2の実行
# ================================================================
import re
import pandas as pd

def anonymize_personal_info(text):
    """Mask leftover hyphenated digit sequences in *text*.

    Sequences of 2-4 digit groups joined by hyphens (phone numbers,
    My Number values, ...) become ``[PHONE]``. Shorter 1-3 digit groups
    such as ``2-8-1`` (street/block numbers) become ``[ADDRESS]``.

    Args:
        text: The text to process.

    Returns:
        The masked text. Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # ``\b`` cannot be used as the boundary: ``\w`` is Unicode-aware, so there
    # is no word boundary between kana/kanji and a digit, and numbers inside
    # Japanese sentences would never match. Reject only an adjacent digit or
    # ASCII word character instead.
    phone_pattern = r'(?<![\dA-Za-z_])(?:\d{2,4}-){1,3}\d{2,4}(?![\dA-Za-z_])'
    text = re.sub(phone_pattern, '[PHONE]', text)
    # Digit lookarounds keep this from matching in the middle of a longer
    # number, which would leave fragments like "1[ADDRESS]8-9012".
    address_pattern = r'(?<!\d)\d{1,3}(?:-\d{1,3}){1,2}(?!\d)'
    text = re.sub(address_pattern, '[ADDRESS]', text)
    return text

# Second pass: apply it to the output of the first pass (column '匿名化後'),
# not to the raw '内容' column, and store the result in '匿名化後2'.
df['匿名化後2'] = df['匿名化後'].apply(anonymize_personal_info)


# ================================================================
# 表示
# ================================================================
# Bare expression: shows the DataFrame when run as a notebook cell and has
# no effect when run as a plain script.
df

# Write all columns (original and anonymized) to CSV without the index.
df.to_csv("anonymized_data.csv", index=False, encoding="utf-8")

# Debug aid: dump the tokens and entity types the pipeline assigns to one
# sample sentence.
text = '田中一郎の住所:愛知県名古屋市中区栄3-5-7、連絡先:ichiro.tanaka@mail.ne.jp。生年月日は1985年5月10日。'
# load_nlp_model() returns None when the model is missing; calling it
# unconditionally would raise TypeError, so skip the dump in that case.
if nlp is not None:
    doc = nlp(text)
    print("--- Tokens ---")
    for token in doc:
        print(f"'{token.text}' (POS: {token.pos_}, Dep: {token.dep_}, Entity: {token.ent_type_})")

0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?