strのtranslate関数の場合
ただし半角カナは非対応
# Full-width forms: U+3000 (ideographic space) followed by U+FF01..U+FF5E.
ZEN = chr(0x3000) + "".join(chr(0xff01 + i) for i in range(94))
# The matching half-width ASCII characters U+0020..U+007E, in the same order.
HAN = "".join(chr(0x20 + i) for i in range(95))


def to_hankaku(s, tdic=str.maketrans(ZEN, HAN)):
    """Return ``s`` with full-width ASCII variants replaced by half-width ASCII.

    Only U+3000 and U+FF01..U+FF5E are mapped; every other character
    (katakana included) is returned unchanged.

    ``tdic`` is the translation table. Its default is built once, when the
    function is defined, so calls do not pay for ``str.maketrans`` again.
    """
    return s.translate(tdic)


def to_zenkaku(s, tdic=str.maketrans(HAN, ZEN)):
    """Return ``s`` with ASCII U+0020..U+007E replaced by full-width forms.

    This is the inverse of ``to_hankaku``. Half-width katakana are not
    mapped and are returned unchanged.

    ``tdic`` is the translation table. Its default is built once, when the
    function is defined.
    """
    return s.translate(tdic)
結構速い
In [1]: %timeit to_hankaku("0123456789ABCDEF")
...: %timeit to_zenkaku("0123456789ABCDEF")
...:
1.09 µs ± 8.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
911 ns ± 2.09 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
C++実装のPython拡張ライブラリにした場合
str.translateを使った場合より約10倍速い。上記のmaketrans/translate版はカナ領域に対応していないため、
興味本位で全角カナ・半角カナの対応表も作って実装してみた。遅くなるかと思ったが、
速度はまったく変わらなかった。
#include <unordered_map>
// Lookup table from a full-width character (katakana, the voiced-sound marks
// and a few CJK punctuation marks) to its half-width replacement.
//
// Each value is a NUL-terminated UTF-32 string of one or two code points:
// voiced / semi-voiced kana expand to the base half-width kana followed by
// U+FF9E (dakuten) or U+FF9F (handakuten).
//
// The mapped type is `const char32_t*` because the values are string literals;
// converting a string literal to a non-const pointer is ill-formed since C++11.
std::unordered_map<char32_t, const char32_t*> ZEN2HAN = {
    {U'\x30A1', U"\xff67\x00"},        //ァ
    {U'\x30A2', U"\xff71\x00"},        //ア
    {U'\x30A3', U"\xff68\x00"},        //ィ
    {U'\x30A4', U"\xff72\x00"},        //イ
    {U'\x30A5', U"\xff69\x00"},        //ゥ
    {U'\x30A6', U"\xff73\x00"},        //ウ
    {U'\x30A7', U"\xff6A\x00"},        //ェ
    {U'\x30A8', U"\xff74\x00"},        //エ
    {U'\x30A9', U"\xff6B\x00"},        //ォ
    {U'\x30AA', U"\xff75\x00"},        //オ
    {U'\x30AB', U"\xff76\x00"},        //カ
    {U'\x30AC', U"\xff76\xFF9E\x00"},  //ガ
    {U'\x30AD', U"\xff77\x00"},        //キ
    {U'\x30AE', U"\xff77\xFF9E\x00"},  //ギ
    {U'\x30AF', U"\xff78\x00"},        //ク
    {U'\x30B0', U"\xff78\xFF9E\x00"},  //グ
    {U'\x30B1', U"\xff79\x00"},        //ケ
    {U'\x30B2', U"\xff79\xFF9E\x00"},  //ゲ
    {U'\x30B3', U"\xff7A\x00"},        //コ
    {U'\x30B4', U"\xff7A\xFF9E\x00"},  //ゴ
    {U'\x30B5', U"\xff7B\x00"},        //サ
    {U'\x30B6', U"\xff7B\xFF9E\x00"},  //ザ
    {U'\x30B7', U"\xff7C\x00"},        //シ
    {U'\x30B8', U"\xff7C\xFF9E\x00"},  //ジ
    {U'\x30B9', U"\xff7D\x00"},        //ス
    {U'\x30BA', U"\xff7D\xFF9E\x00"},  //ズ
    {U'\x30BB', U"\xff7E\x00"},        //セ
    {U'\x30BC', U"\xff7E\xFF9E\x00"},  //ゼ
    {U'\x30BD', U"\xff7F\x00"},        //ソ
    {U'\x30BE', U"\xff7F\xFF9E\x00"},  //ゾ
    {U'\x30BF', U"\xff80\x00"},        //タ
    {U'\x30C0', U"\xff80\xFF9E\x00"},  //ダ
    {U'\x30C1', U"\xff81\x00"},        //チ
    {U'\x30C2', U"\xff81\xFF9E\x00"},  //ヂ
    {U'\x30C3', U"\xff6F\x00"},        //ッ
    {U'\x30C4', U"\xff82\x00"},        //ツ
    {U'\x30C5', U"\xff82\xFF9E\x00"},  //ヅ
    {U'\x30C6', U"\xff83\x00"},        //テ
    {U'\x30C7', U"\xff83\xFF9E\x00"},  //デ
    {U'\x30C8', U"\xff84\x00"},        //ト
    {U'\x30C9', U"\xff84\xFF9E\x00"},  //ド
    {U'\x30CA', U"\xff85\x00"},        //ナ
    {U'\x30CB', U"\xff86\x00"},        //ニ
    {U'\x30CC', U"\xff87\x00"},        //ヌ
    {U'\x30CD', U"\xff88\x00"},        //ネ
    {U'\x30CE', U"\xff89\x00"},        //ノ
    {U'\x30CF', U"\xff8A\x00"},        //ハ
    {U'\x30D0', U"\xff8A\xFF9E\x00"},  //バ
    {U'\x30D1', U"\xff8A\xFF9F\x00"},  //パ
    {U'\x30D2', U"\xff8B\x00"},        //ヒ
    {U'\x30D3', U"\xff8B\xFF9E\x00"},  //ビ
    {U'\x30D4', U"\xff8B\xFF9F\x00"},  //ピ
    {U'\x30D5', U"\xff8C\x00"},        //フ
    {U'\x30D6', U"\xff8C\xFF9E\x00"},  //ブ
    {U'\x30D7', U"\xff8C\xFF9F\x00"},  //プ
    {U'\x30D8', U"\xff8D\x00"},        //ヘ
    {U'\x30D9', U"\xff8D\xFF9E\x00"},  //ベ
    {U'\x30DA', U"\xff8D\xFF9F\x00"},  //ペ
    {U'\x30DB', U"\xff8E\x00"},        //ホ
    {U'\x30DC', U"\xff8E\xFF9E\x00"},  //ボ
    {U'\x30DD', U"\xff8E\xFF9F\x00"},  //ポ
    {U'\x30DE', U"\xff8F\x00"},        //マ
    {U'\x30DF', U"\xFF90\x00"},        //ミ
    {U'\x30E0', U"\xFF91\x00"},        //ム
    {U'\x30E1', U"\xFF92\x00"},        //メ
    {U'\x30E2', U"\xFF93\x00"},        //モ
    {U'\x30E3', U"\xff6C\x00"},        //ャ
    {U'\x30E4', U"\xFF94\x00"},        //ヤ
    {U'\x30E5', U"\xff6D\x00"},        //ュ
    {U'\x30E6', U"\xFF95\x00"},        //ユ
    {U'\x30E7', U"\xff6E\x00"},        //ョ
    {U'\x30E8', U"\xFF96\x00"},        //ヨ
    {U'\x30E9', U"\xFF97\x00"},        //ラ
    {U'\x30EA', U"\xFF98\x00"},        //リ
    {U'\x30EB', U"\xFF99\x00"},        //ル
    {U'\x30EC', U"\xFF9A\x00"},        //レ
    {U'\x30ED', U"\xFF9B\x00"},        //ロ
    {U'\x30EF', U"\xFF9C\x00"},        //ワ
    {U'\x30F2', U"\xff66\x00"},        //ヲ
    {U'\x30F3', U"\xFF9D\x00"},        //ン
    {U'\x30F4', U"\xff73\xFF9E\x00"},  //ヴ
    {U'\x30FB', U"\xff65\x00"},        //・
    {U'\x30FC', U"\xff70\x00"},        //ー
    {U'\x3001', U"\xff64\x00"},        //、
    {U'\x300C', U"\xff62\x00"},        //「
    {U'\x300D', U"\xff63\x00"},        //」
    {U'\x309B', U"\xFF9E\x00"},        //゛
    {U'\x309C', U"\xFF9F\x00"},        //゜
};
// Full-width replacements for the 62 half-width code points U+FF62..U+FF9F.
// to_zenkaku() indexes this table with (code point % 62); because
// 0xFF62 % 62 == 30, the table starts at U+FF82 and wraps around to U+FF62
// at index 30. The trailing comment on each row gives the source code point.
const char32_t han2zen[62] = {
    U'\u30C4',  // [ 0] U+FF82 -> ツ
    U'\u30C6',  // [ 1] U+FF83 -> テ
    U'\u30C8',  // [ 2] U+FF84 -> ト
    U'\u30CA',  // [ 3] U+FF85 -> ナ
    U'\u30CB',  // [ 4] U+FF86 -> ニ
    U'\u30CC',  // [ 5] U+FF87 -> ヌ
    U'\u30CD',  // [ 6] U+FF88 -> ネ
    U'\u30CE',  // [ 7] U+FF89 -> ノ
    U'\u30CF',  // [ 8] U+FF8A -> ハ
    U'\u30D2',  // [ 9] U+FF8B -> ヒ
    U'\u30D5',  // [10] U+FF8C -> フ
    U'\u30D8',  // [11] U+FF8D -> ヘ
    U'\u30DB',  // [12] U+FF8E -> ホ
    U'\u30DE',  // [13] U+FF8F -> マ
    U'\u30DF',  // [14] U+FF90 -> ミ
    U'\u30E0',  // [15] U+FF91 -> ム
    U'\u30E1',  // [16] U+FF92 -> メ
    U'\u30E2',  // [17] U+FF93 -> モ
    U'\u30E4',  // [18] U+FF94 -> ヤ
    U'\u30E6',  // [19] U+FF95 -> ユ
    U'\u30E8',  // [20] U+FF96 -> ヨ
    U'\u30E9',  // [21] U+FF97 -> ラ
    U'\u30EA',  // [22] U+FF98 -> リ
    U'\u30EB',  // [23] U+FF99 -> ル
    U'\u30EC',  // [24] U+FF9A -> レ
    U'\u30ED',  // [25] U+FF9B -> ロ
    U'\u30EF',  // [26] U+FF9C -> ワ
    U'\u30F3',  // [27] U+FF9D -> ン
    U'\u309B',  // [28] U+FF9E -> ゛
    U'\u309C',  // [29] U+FF9F -> ゜
    U'\u300C',  // [30] U+FF62 -> 「
    U'\u300D',  // [31] U+FF63 -> 」
    U'\u3001',  // [32] U+FF64 -> 、
    U'\u30FB',  // [33] U+FF65 -> ・
    U'\u30F2',  // [34] U+FF66 -> ヲ
    U'\u30A1',  // [35] U+FF67 -> ァ
    U'\u30A3',  // [36] U+FF68 -> ィ
    U'\u30A5',  // [37] U+FF69 -> ゥ
    U'\u30A7',  // [38] U+FF6A -> ェ
    U'\u30A9',  // [39] U+FF6B -> ォ
    U'\u30E3',  // [40] U+FF6C -> ャ
    U'\u30E5',  // [41] U+FF6D -> ュ
    U'\u30E7',  // [42] U+FF6E -> ョ
    U'\u30C3',  // [43] U+FF6F -> ッ
    U'\u30FC',  // [44] U+FF70 -> ー
    U'\u30A2',  // [45] U+FF71 -> ア
    U'\u30A4',  // [46] U+FF72 -> イ
    U'\u30A6',  // [47] U+FF73 -> ウ
    U'\u30A8',  // [48] U+FF74 -> エ
    U'\u30AA',  // [49] U+FF75 -> オ
    U'\u30AB',  // [50] U+FF76 -> カ
    U'\u30AD',  // [51] U+FF77 -> キ
    U'\u30AF',  // [52] U+FF78 -> ク
    U'\u30B1',  // [53] U+FF79 -> ケ
    U'\u30B3',  // [54] U+FF7A -> コ
    U'\u30B5',  // [55] U+FF7B -> サ
    U'\u30B7',  // [56] U+FF7C -> シ
    U'\u30B9',  // [57] U+FF7D -> ス
    U'\u30BB',  // [58] U+FF7E -> セ
    U'\u30BD',  // [59] U+FF7F -> ソ
    U'\u30BF',  // [60] U+FF80 -> タ
    U'\u30C1',  // [61] U+FF81 -> チ
};
/**
 * Convert full-width characters of a Python str to their half-width forms.
 *
 * Mappings applied:
 *   - U+FF01..U+FF5E (full-width ASCII variants) -> U+0021..U+007E
 *   - U+3000 (ideographic space)                 -> U+0020
 *   - every key of ZEN2HAN (katakana, sound marks, and the punctuation
 *     U+3001 / U+300C / U+300D) -> its one or two half-width code points
 * All other characters are copied unchanged.
 *
 * @param str  Python str object (borrowed reference).
 * @return     New reference to the converted str, or NULL with a Python
 *             exception set (TypeError for a non-str argument, MemoryError
 *             if the scratch buffer cannot be allocated).
 */
static PyObject* to_hankaku(PyObject* str) {
    if(!PyUnicode_Check(str)) {
        PyErr_SetString(PyExc_TypeError, "to_hankaku() argument must be str");
        return NULL;
    }
    if(PyUnicode_READY(str) == -1)
        return NULL;

    const int kind = PyUnicode_KIND(str);
    if(kind == PyUnicode_1BYTE_KIND) {
        // Every convertible character is above U+00FF, so a 1-byte string has
        // nothing to convert. The other path returns a new reference, so this
        // one must own a reference too.
        Py_INCREF(str);
        return str;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const void* data = PyUnicode_DATA(str);

    // A voiced kana expands to two code points (base + sound mark), so the
    // output can be up to twice as long as the input.
    Py_UCS4* out = PyMem_New(Py_UCS4, 2 * len);
    if(out == NULL)
        return PyErr_NoMemory();

    Py_ssize_t j = 0;
    for(Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 s = PyUnicode_READ(kind, data, i);

        if(s >= 0xff01 && s <= 0xff5e) {
            out[j++] = s - 0xfee0;
        } else if(s == 0x3000) {
            out[j++] = 0x20;
        } else if(s >= 0x3001 && s <= 0x30fc) {
            // find() rather than operator[]: operator[] would insert a null
            // entry for every unmapped code point and hand back that null.
            const auto it = ZEN2HAN.find(static_cast<char32_t>(s));
            const char32_t* r = (it != ZEN2HAN.end()) ? it->second : nullptr;
            if(r != nullptr && r[0] != 0) {
                out[j++] = r[0];
                if(r[1] != 0)  // dakuten / handakuten mark
                    out[j++] = r[1];
            } else {
                out[j++] = s;
            }
        } else {
            out[j++] = s;
        }
    }

    // Build the result from the number of code points actually written.
    // PyUnicode_FromKindAndData picks the narrowest storage that fits, whereas
    // PyUnicode_New(len, 0x10ffff) forces 4-byte storage regardless of content.
    PyObject* res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, out, j);
    PyMem_Free(out);
    return res;
}
/**
 * Convert half-width characters of a Python str to their full-width forms.
 *
 * Mappings applied:
 *   - U+0021..U+007E -> U+FF01..U+FF5E (full-width ASCII variants)
 *   - U+0020         -> U+3000 (ideographic space)
 *   - U+FF62..U+FF9F (half-width katakana and punctuation) -> han2zen table
 * A half-width kana followed by a dakuten (U+FF9E or U+309B) or a handakuten
 * (U+FF9F or U+309C) is merged into the single voiced / semi-voiced
 * full-width kana when such a kana exists; otherwise the mark is converted
 * as a separate character. All other characters are copied unchanged.
 *
 * @param str  Python str object (borrowed reference).
 * @return     New reference to the converted str, or NULL with a Python
 *             exception set (TypeError for a non-str argument, MemoryError
 *             if the scratch buffer cannot be allocated).
 */
static PyObject* to_zenkaku(PyObject* str) {
    if(!PyUnicode_Check(str)) {
        PyErr_SetString(PyExc_TypeError, "to_zenkaku() argument must be str");
        return NULL;
    }
    if(PyUnicode_READY(str) == -1)
        return NULL;

    const int kind = PyUnicode_KIND(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const void* data = PyUnicode_DATA(str);

    // The output never grows: each input character yields one output
    // character, and a merged kana + mark pair yields one instead of two.
    Py_UCS4* out = PyMem_New(Py_UCS4, len);
    if(out == NULL)
        return PyErr_NoMemory();

    Py_ssize_t j = 0;
    for(Py_ssize_t i = 0; i < len; ++i) {
        const Py_UCS4 s = PyUnicode_READ(kind, data, i);
        Py_UCS4 w = s;

        if(s > 0x20 && s < 0x7f) {
            w = s + 0xfee0;
        } else if(s == 0x20) {
            w = 0x3000;
        } else if(s >= 0xff62 && s <= 0xff9f) {
            // The table covers both end points (U+FF62 and U+FF9F).
            w = han2zen[s % 62];

            // Only look at the following character when there is one.
            if(i + 1 < len) {
                const Py_UCS4 next = PyUnicode_READ(kind, data, i + 1);
                const bool dakuten = (next == 0xff9e || next == 0x309b);
                const bool handakuten = (next == 0xff9f || next == 0x309c);
                // ka..to and ha..ho: the voiced kana is the next code point.
                const bool ka_to = (s >= 0xff76 && s <= 0xff84);
                const bool ha_ho = (s >= 0xff8a && s <= 0xff8e);

                if(dakuten && s == 0xff73) {
                    // U + dakuten is VU (U+30F4); it does not follow the
                    // "+1" layout of the other voiced kana.
                    w = 0x30f4;
                    ++i;
                } else if(dakuten && (ka_to || ha_ho)) {
                    w += 1;
                    ++i;
                } else if(handakuten && ha_ho) {
                    // Only the ha-row has semi-voiced forms (base + 2).
                    w += 2;
                    ++i;
                }
            }
        }
        out[j++] = w;
    }

    // Build the result from the number of code points actually written, so
    // merged pairs do not leave uninitialised trailing characters.
    // PyUnicode_FromKindAndData picks the narrowest storage that fits, whereas
    // PyUnicode_New(len, 0x10ffff) forces 4-byte storage regardless of content.
    PyObject* res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, out, j);
    PyMem_Free(out);
    return res;
}
圧倒的に速いので自作pythonライブラリに追加しよう。
In [3]: %timeit to_hankaku("0123456789ABCDEF")
...: %timeit to_zenkaku("0123456789ABCDEF")
...:
87 ns ± 0.472 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
86 ns ± 0.322 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)