LoginSignup
0
1

More than 1 year has passed since last update.

python全角⇔半角文字列変換

Last updated at Posted at 2021-02-25

strのtranslate関数の場合

ただし半角カナは非対応

# Full-width table: ideographic space (U+3000) followed by U+FF01..U+FF5E.
ZEN = "\u3000" + "".join(map(chr, range(0xff01, 0xff01 + 94)))
# Half-width table: the printable ASCII range U+0020..U+007E, in the same order.
HAN = "".join(map(chr, range(0x20, 0x20 + 95)))


def to_hankaku(s, tdic=str.maketrans(ZEN, HAN)):
    """Return ``s`` with full-width ASCII variants replaced by half-width ones.

    ``tdic`` is the translation table; the default is built once, when the
    function is defined. Characters outside ``ZEN`` are left untouched.
    """
    converted = s.translate(tdic)
    return converted


def to_zenkaku(s, tdic=str.maketrans(HAN, ZEN)):
    """Return ``s`` with printable ASCII replaced by full-width variants.

    ``tdic`` is the translation table; the default is built once, when the
    function is defined. Characters outside ``HAN`` are left untouched.
    """
    converted = s.translate(tdic)
    return converted

結構速い

In [1]: %timeit to_hankaku("0123456789ABCDEF")
   ...: %timeit to_zenkaku("0123456789ABCDEF")
   ...:
1.09 µs ± 8.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
911 ns ± 2.09 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

C++実装のPython拡張ライブラリにした場合

str.translateでやった場合より10倍速い。上記のmaketrans/translate版はカナ領域に非対応だったので、
興味本位でカナ・半角カナの対応表もついでに作って実装してみた。遅くなるかと思ったが、
速度は微塵も変化しなかった。

#include <unordered_map>

// Full-width katakana / punctuation -> half-width replacement sequence.
//
// Each value is a NUL-terminated UTF-32 string of one or two code points:
// the half-width base kana, optionally followed by the half-width voiced
// (U+FF9E) or semi-voiced (U+FF9F) sound mark.
//
// The mapped type is `const char32_t*` because the values are string
// literals; binding a literal to a non-const pointer is ill-formed since
// C++11. The map itself is left non-const so that `ZEN2HAN[key]` lookups
// keep compiling.
std::unordered_map<char32_t, const char32_t*> ZEN2HAN = {
    {U'\x30A1', U"\xff67"},        // ァ
    {U'\x30A2', U"\xff71"},        // ア
    {U'\x30A3', U"\xff68"},        // ィ
    {U'\x30A4', U"\xff72"},        // イ
    {U'\x30A5', U"\xff69"},        // ゥ
    {U'\x30A6', U"\xff73"},        // ウ
    {U'\x30A7', U"\xff6A"},        // ェ
    {U'\x30A8', U"\xff74"},        // エ
    {U'\x30A9', U"\xff6B"},        // ォ
    {U'\x30AA', U"\xff75"},        // オ
    {U'\x30AB', U"\xff76"},        // カ
    {U'\x30AC', U"\xff76\xFF9E"},  // ガ
    {U'\x30AD', U"\xff77"},        // キ
    {U'\x30AE', U"\xff77\xFF9E"},  // ギ
    {U'\x30AF', U"\xff78"},        // ク
    {U'\x30B0', U"\xff78\xFF9E"},  // グ
    {U'\x30B1', U"\xff79"},        // ケ
    {U'\x30B2', U"\xff79\xFF9E"},  // ゲ
    {U'\x30B3', U"\xff7A"},        // コ
    {U'\x30B4', U"\xff7A\xFF9E"},  // ゴ
    {U'\x30B5', U"\xff7B"},        // サ
    {U'\x30B6', U"\xff7B\xFF9E"},  // ザ
    {U'\x30B7', U"\xff7C"},        // シ
    {U'\x30B8', U"\xff7C\xFF9E"},  // ジ
    {U'\x30B9', U"\xff7D"},        // ス
    {U'\x30BA', U"\xff7D\xFF9E"},  // ズ
    {U'\x30BB', U"\xff7E"},        // セ
    {U'\x30BC', U"\xff7E\xFF9E"},  // ゼ
    {U'\x30BD', U"\xff7F"},        // ソ
    {U'\x30BE', U"\xff7F\xFF9E"},  // ゾ
    {U'\x30BF', U"\xff80"},        // タ
    {U'\x30C0', U"\xff80\xFF9E"},  // ダ
    {U'\x30C1', U"\xff81"},        // チ
    {U'\x30C2', U"\xff81\xFF9E"},  // ヂ
    {U'\x30C3', U"\xff6F"},        // ッ
    {U'\x30C4', U"\xff82"},        // ツ
    {U'\x30C5', U"\xff82\xFF9E"},  // ヅ
    {U'\x30C6', U"\xff83"},        // テ
    {U'\x30C7', U"\xff83\xFF9E"},  // デ
    {U'\x30C8', U"\xff84"},        // ト
    {U'\x30C9', U"\xff84\xFF9E"},  // ド
    {U'\x30CA', U"\xff85"},        // ナ
    {U'\x30CB', U"\xff86"},        // ニ
    {U'\x30CC', U"\xff87"},        // ヌ
    {U'\x30CD', U"\xff88"},        // ネ
    {U'\x30CE', U"\xff89"},        // ノ
    {U'\x30CF', U"\xff8A"},        // ハ
    {U'\x30D0', U"\xff8A\xFF9E"},  // バ
    {U'\x30D1', U"\xff8A\xFF9F"},  // パ
    {U'\x30D2', U"\xff8B"},        // ヒ
    {U'\x30D3', U"\xff8B\xFF9E"},  // ビ
    {U'\x30D4', U"\xff8B\xFF9F"},  // ピ
    {U'\x30D5', U"\xff8C"},        // フ
    {U'\x30D6', U"\xff8C\xFF9E"},  // ブ
    {U'\x30D7', U"\xff8C\xFF9F"},  // プ
    {U'\x30D8', U"\xff8D"},        // ヘ
    {U'\x30D9', U"\xff8D\xFF9E"},  // ベ
    {U'\x30DA', U"\xff8D\xFF9F"},  // ペ
    {U'\x30DB', U"\xff8E"},        // ホ
    {U'\x30DC', U"\xff8E\xFF9E"},  // ボ
    {U'\x30DD', U"\xff8E\xFF9F"},  // ポ
    {U'\x30DE', U"\xff8F"},        // マ
    {U'\x30DF', U"\xFF90"},        // ミ
    {U'\x30E0', U"\xFF91"},        // ム
    {U'\x30E1', U"\xFF92"},        // メ
    {U'\x30E2', U"\xFF93"},        // モ
    {U'\x30E3', U"\xff6C"},        // ャ
    {U'\x30E4', U"\xFF94"},        // ヤ
    {U'\x30E5', U"\xff6D"},        // ュ
    {U'\x30E6', U"\xFF95"},        // ユ
    {U'\x30E7', U"\xff6E"},        // ョ
    {U'\x30E8', U"\xFF96"},        // ヨ
    {U'\x30E9', U"\xFF97"},        // ラ
    {U'\x30EA', U"\xFF98"},        // リ
    {U'\x30EB', U"\xFF99"},        // ル
    {U'\x30EC', U"\xFF9A"},        // レ
    {U'\x30ED', U"\xFF9B"},        // ロ
    {U'\x30EF', U"\xFF9C"},        // ワ
    {U'\x30F2', U"\xff66"},        // ヲ
    {U'\x30F3', U"\xFF9D"},        // ン
    {U'\x30F4', U"\xff73\xFF9E"},  // ヴ
    {U'\x30FB', U"\xff65"},        // ・
    {U'\x30FC', U"\xff70"},        // ー
    {U'\x3001', U"\xff64"},        // 、
    {U'\x300C', U"\xff62"},        // 「
    {U'\x300D', U"\xff63"},        // 」
    {U'\x309B', U"\xFF9E"},        // ゛
    {U'\x309C', U"\xFF9F"},        // ゜
};

// Half-width -> full-width lookup table with 62 slots.
// to_zenkaku indexes it with `code_point % 62`. Because 0xFF82 % 62 == 0, the
// table starts at the full-width form of U+FF82 and wraps around: U+FF62
// lands in slot 30 and U+FF81 in slot 61.
const char32_t han2zen[62] = {
    0x30C4, 0x30C6, 0x30C8, 0x30CA, 0x30CB, 0x30CC, 0x30CD, 0x30CE,  // slots  0- 7: ツテトナニヌネノ
    0x30CF, 0x30D2, 0x30D5, 0x30D8, 0x30DB, 0x30DE, 0x30DF, 0x30E0,  // slots  8-15: ハヒフヘホマミム
    0x30E1, 0x30E2, 0x30E4, 0x30E6, 0x30E8, 0x30E9, 0x30EA, 0x30EB,  // slots 16-23: メモヤユヨラリル
    0x30EC, 0x30ED, 0x30EF, 0x30F3, 0x309B, 0x309C, 0x300C, 0x300D,  // slots 24-31: レロワン゛゜「」
    0x3001, 0x30FB, 0x30F2, 0x30A1, 0x30A3, 0x30A5, 0x30A7, 0x30A9,  // slots 32-39: 、・ヲァィゥェォ
    0x30E3, 0x30E5, 0x30E7, 0x30C3, 0x30FC, 0x30A2, 0x30A4, 0x30A6,  // slots 40-47: ャュョッーアイウ
    0x30A8, 0x30AA, 0x30AB, 0x30AD, 0x30AF, 0x30B1, 0x30B3, 0x30B5,  // slots 48-55: エオカキクケコサ
    0x30B7, 0x30B9, 0x30BB, 0x30BD, 0x30BF, 0x30C1,                  // slots 56-61: シスセソタチ
};


/**
 * Convert full-width characters of a Python `str` to half-width forms.
 *
 * Handled input:
 *   - U+FF01..U+FF5E (full-width ASCII variants) -> U+0021..U+007E
 *   - U+3000 (ideographic space)                 -> U+0020
 *   - every key of ZEN2HAN (katakana and related punctuation); voiced and
 *     semi-voiced kana expand to two code points (base + sound mark)
 * Everything else is copied unchanged.
 *
 * @param str  A Python `str` object (borrowed reference).
 * @return     A new reference to the converted string, or NULL with a Python
 *             exception set (TypeError for non-str input, MemoryError on
 *             allocation failure).
 */
static PyObject* to_hankaku(PyObject* str) {
    if(!PyUnicode_Check(str)) {
        PyErr_SetString(PyExc_TypeError, "to_hankaku() argument must be str");
        return NULL;
    }
    if(PyUnicode_READY(str) == -1)
        return NULL;

    const int kind = PyUnicode_KIND(str);
    if(kind == PyUnicode_1BYTE_KIND) {
        // Every convertible code point is above U+00FF, so a 1-byte string has
        // nothing to convert. The function returns a new reference on all
        // other paths, so take one here as well.
        Py_INCREF(str);
        return str;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const void* data = PyUnicode_DATA(str);

    // Worst case: every input character expands to base kana + sound mark.
    Py_UCS4* buf = PyMem_New(Py_UCS4, 2 * (size_t)len);
    if(buf == NULL)
        return PyErr_NoMemory();

    Py_ssize_t j = 0;
    for(Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 s = PyUnicode_READ(kind, data, i);

        if(s > 0xff00 && s < 0xff5f) {
            buf[j++] = s - 0xfee0;
        } else if(s == 0x3000) {
            buf[j++] = 0x20;
        } else if(s >= 0x3001 && s <= 0x30FC) {
            // All ZEN2HAN keys lie in this range. find() is used instead of
            // operator[] so that unmapped code points neither insert a null
            // entry nor get dereferenced.
            const auto it = ZEN2HAN.find((char32_t)s);
            if(it != ZEN2HAN.end() && it->second != NULL && it->second[0] != 0) {
                const char32_t* r = it->second;
                buf[j++] = r[0];
                if(r[1] != 0)  // trailing voiced / semi-voiced sound mark
                    buf[j++] = r[1];
            } else {
                buf[j++] = s;
            }
        } else {
            buf[j++] = s;
        }
    }

    // Builds a string of exactly j code points in the narrowest storage kind
    // that fits the actual content.
    PyObject* res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, j);
    PyMem_Free(buf);
    return res;
}

/**
 * Convert half-width characters of a Python `str` to full-width forms.
 *
 * Handled input:
 *   - U+0021..U+007E (printable ASCII) -> U+FF01..U+FF5E
 *   - U+0020 (space)                   -> U+3000
 *   - U+FF62..U+FF9F (half-width katakana and punctuation) via han2zen.
 *     A following voiced mark (U+FF9E / U+309B) or semi-voiced mark
 *     (U+FF9F / U+309C) is merged into the preceding kana when a precomposed
 *     full-width kana exists for that combination.
 * Everything else is copied unchanged.
 *
 * @param str  A Python `str` object (borrowed reference).
 * @return     A new reference to the converted string, or NULL with a Python
 *             exception set (TypeError for non-str input, MemoryError on
 *             allocation failure).
 */
static PyObject* to_zenkaku(PyObject* str) {
    if(!PyUnicode_Check(str)) {
        PyErr_SetString(PyExc_TypeError, "to_zenkaku() argument must be str");
        return NULL;
    }
    if(PyUnicode_READY(str) == -1)
        return NULL;

    const int kind = PyUnicode_KIND(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const void* data = PyUnicode_DATA(str);

    // The output never grows: each input character yields at most one output
    // character, and merged sound marks shrink it.
    Py_UCS4* buf = PyMem_New(Py_UCS4, (size_t)len);
    if(buf == NULL)
        return PyErr_NoMemory();

    Py_ssize_t j = 0;
    for(Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 s = PyUnicode_READ(kind, data, i);
        Py_UCS4 out = s;

        if(s > 0x20 && s < 0x7f) {
            out = s + 0xfee0;
        } else if(s == 0x20) {
            out = 0x3000;
        } else if(s >= 0xff62 && s <= 0xff9f) {
            // han2zen has one slot per code point of this 62-wide range.
            out = han2zen[s % 62];

            if(i + 1 < len) {  // only peek when a next character exists
                const Py_UCS4 next = PyUnicode_READ(kind, data, i + 1);
                const bool voiced = (next == 0xFF9E || next == 0x309B);
                const bool semi_voiced = (next == 0xFF9F || next == 0x309C);
                const bool ha_row = (s >= 0xff8a && s <= 0xff8e);  // ﾊ ﾋ ﾌ ﾍ ﾎ

                if(voiced) {
                    if(s == 0xff73) {
                        // ｳ + voiced mark is ヴ (U+30F4), not the code point
                        // after ウ (which is ェ).
                        out = 0x30F4;
                        ++i;
                    } else if((s >= 0xff76 && s <= 0xff84) || ha_row) {
                        // ｶ..ﾄ and the ﾊ row: the voiced kana directly follows
                        // the base kana in the full-width block.
                        out += 1;
                        ++i;
                    }
                } else if(semi_voiced && ha_row) {
                    // Only the ﾊ row has semi-voiced forms (base + 2).
                    out += 2;
                    ++i;
                }
            }
        }

        buf[j++] = out;
    }

    // Builds a string of exactly j code points in the narrowest storage kind
    // that fits the actual content.
    PyObject* res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, j);
    PyMem_Free(buf);
    return res;
}

圧倒的に速いので自作pythonライブラリに追加しよう。


In [3]: %timeit to_hankaku("0123456789ABCDEF")
   ...: %timeit to_zenkaku("0123456789ABCDEF")
   ...:
87 ns ± 0.472 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
86 ns ± 0.322 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

0
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
1