LoginSignup
1
0

郵便番号辞書 Mozc形式作成手順

Posted at

日本郵便が公開している郵便番号データ(KEN_ALL、JIGYOSYO)をMozcの辞書形式で出力する手順とスクリプトです。
スクリプトはMITライセンスです。
phoepsilonix/japanese-zip-code-dictionary

# Fetch Japan Post's postal-code archives (-nc: keep files already downloaded)
# and extract them (-o: overwrite earlier extractions without prompting).
wget -nc https://www.post.japanpost.jp/zipcode/dl/kogaki/zip/ken_all.zip
wget -nc https://www.post.japanpost.jp/zipcode/dl/jigyosyo/zip/jigyosyo.zip
unzip -o ken_all.zip
unzip -o jigyosyo.zip

# ICU transliteration rule applied while re-encoding CP932 -> UTF-8.
# Fullwidth-Halfwidth is applied to everything except kana, Han characters and
# the punctuation listed in the last set. The fullwidth comma and fullwidth
# parentheses MUST be in that set: a fullwidth comma folded to ASCII "," would
# become an extra field separator for the awk scripts (they use FS=","), and
# the fullwidth parentheses are kept as-is for the later conversion step.
translit_rule='::[ [:^Katakana:] & [:^Hiragana:] & [:^Han:] & [^ー・「」、，（）]]  Fullwidth-Halfwidth; ::[\p{Nl}] Latin-ASCII;'

uconv -x "$translit_rule" -f cp932 -t UTF-8 KEN_ALL.CSV > KEN_ALL_UTF8.CSV
uconv -x "$translit_rule" -f cp932 -t UTF-8 JIGYOSYO.CSV > JIGYOSYO_UTF8.CSV
ken_all-convert-mozc-dictionary.awk
# Convert KEN_ALL rows (UTF-8, comma-separated) into Mozc dictionary lines:
#     NNN-NNNN <TAB> address <TAB> 地名 <TAB>
# Fields read:
#     $3      7-digit postal code (printed as NNN-NNNN)
#     $7 $8   concatenated in front of the town name (presumably prefecture
#             and municipality -- confirm against Japan Post's column layout)
#     $9      town-area name
# Requires gawk: gensub() is a gawk extension.
# NOTE(review): the [0-9０-９] bracket expression assumes a UTF-8 locale.
BEGIN {
    FS = ","
    f = 0    # postal code whose follow-up rows are being skipped; 0 = none
}

# Rule 1: clean up the town-area name ($9) and leave the result in c[0].
{
    gsub("\"", "", $0)    # strip CSV quotes; editing $0 re-splits the fields
    gsub(/以下に掲載がない場合|.*くる場合|.*村一円/, "", $9)

    # Rows that repeat the postal code remembered below are dropped.
    if (f == $3)
        next
    f = 0

    # An opening parenthesis (ASCII or fullwidth) followed by "、" or "・":
    # remember the postal code so later rows carrying the same code are skipped.
    if ($9 ~ /(\(|（).+、/ || $9 ~ /(\(|（).*・/)
        f = $3

    # Drop everything from the first opening parenthesis to the end.
    $9 = gensub(/(\(|（).*/, "", "G", $9)

    # NOTE(review): "\\3" names a group the pattern does not have; gawk
    # appears to substitute nothing for it, so the match ("第N地割" through
    # the end of the name) is removed. Left unchanged -- confirm the intent.
    c[0] = gensub(/第?([0-9０-９]+)地割(.*)/, "\\3", "G", $9)
}

# Rule 2: print each (code, $7, $8, name) combination once.
{
    if (!a[$3, $7, $8, c[0]]++) {
        zip = substr($3, 1, 3) "-" substr($3, 4, 4)
        if (c[0] ~ /、/) {
            # A "、"-separated list yields one line per item.
            if (c[0] ~ /町/) {
                # Text up to the first "町" is a shared prefix; only the piece
                # between the first and second "町" is split into items.
                split(c[0], chou, "町")
                n = split(chou[2], array, "、")
                chou[1] = chou[1] "町"
            } else {
                n = split(c[0], array, "、")
                chou[1] = ""
            }
            # Indexed loop: items are printed in their original order
            # ("for (x in array)" has no guaranteed order).
            for (i = 1; i <= n; i++) {
                print zip "\t" $7 $8 chou[1] array[i] "\t" "地名" "\t"
            }
        } else {
            print zip "\t" $7 $8 c[0] "\t" "地名" "\t"
        }
    }
}
jigyosyo-convert-mozc-dictionary.awk
# Convert JIGYOSYO rows (UTF-8, comma-separated) into Mozc dictionary lines:
#     NNN-NNNN <TAB> $4 $5 $6 " " $3 <TAB> 組織 <TAB>
# Fields read:
#     $3         name printed after the address (presumably the business
#                name -- confirm against Japan Post's column layout)
#     $4 $5 $6   concatenated address parts
#     $7         only inspected for a parenthesised list
#     $8         7-digit postal code (printed as NNN-NNNN)
BEGIN {
    FS = ","
    f = 0    # postal code whose follow-up rows are being skipped; 0 = none
}

# Rule 1: strip quotes and skip rows that repeat a remembered postal code.
{
    gsub("\"", "", $0)    # strip CSV quotes; editing $0 re-splits the fields
    # NOTE(review): this substitution mirrors the KEN_ALL script; whether $9
    # can contain these phrases in JIGYOSYO data is unverified.
    gsub(/以下に掲載がない場合|.*くる場合|.*村一円/, "", $9)

    if (f == $8)
        next
    f = 0

    # An opening parenthesis (ASCII or fullwidth) followed by "、" or "・" in
    # $7: remember the postal code so later rows with the same code are skipped.
    if ($7 ~ /(\(|（).+、/ || $7 ~ /(\(|（).*・/)
        f = $8
}

# Rule 2: fold fullwidth punctuation in the name to ASCII, then print each
# (code, address, name) combination once.
{
    gsub("，", ",", $3)
    gsub("（", "(", $3)
    gsub("）", ")", $3)
    gsub("㈱", "(株)", $3)
    if (!a[$8, $4, $5, $6, $3]++) {
        print substr($8, 1, 3) "-" substr($8, 4, 4) "\t" $4 $5 $6 " " $3 "\t" "組織" "\t"
    }
}
# Build the Mozc dictionary text files from the UTF-8 CSVs.
# The KEN_ALL script calls gensub(), a gawk extension, so it is run with gawk
# explicitly; a plain "awk" that resolves to mawk or BWK awk would abort on it.
gawk -f ken_all-convert-mozc-dictionary.awk KEN_ALL_UTF8.CSV > KEN_ALL.txt
awk -f jigyosyo-convert-mozc-dictionary.awk JIGYOSYO_UTF8.CSV > JIGYOSYO.txt
1
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
0