More than 5 years have passed since last update.

UTF-32、UTF-16、UTF-8 の相互変換

Last updated at 2017-11-21 / Posted at 2017-11-19

C++17 では <codecvt> ヘッダー（および std::wstring_convert）が非推奨になるようです。
https://cpprefjp.github.io/reference/codecvt.html
UTF-32、UTF-16、UTF-8 の相互変換くらいは外部ライブラリを使用せずに変換したいです。

UTF-32、UTF-16、UTF-8 の間の相互変換をまとめてみました。

Unicode の詳しい解説は、別途 Wikipedia 等を参照してください。
https://ja.wikipedia.org/wiki/Unicode

UTF-8

UTF-8 は Unicode のコードポイント（UTF-32 の値と同じもの）を、1～4バイトの可変長で表現します。
以下のような形になっています。

0xxxxxxx                            0 - 127
110yyyyx 10xxxxxx                   128 - 2047
1110yyyy 10yxxxxx 10xxxxxx          2048 - 65535
11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 65536 - 0x10FFFF

先頭のバイトから、何バイト続くか判定できます。

@akinomyoga さんのコメントより、y がすべて 0 になっているものは不正な文字として扱います。

UTF-32 から UTF-8

UTF-32 から UTF-8 への変換は以下のようになります。

// Encodes a single Unicode code point as UTF-8.
//
// u32Ch: code point to encode.
// u8Ch:  receives the encoded bytes. A sequence is 1-4 bytes long; slots past
//        the end of the sequence are set to 0.
//
// Returns true on success. Returns false, leaving u8Ch untouched, when u32Ch
// is above U+10FFFF or lies in the surrogate range U+D800..U+DFFF: neither is
// a Unicode scalar value, and UTF-8 has no well-formed encoding for them.
bool ConvChU32ToU8(const char32_t u32Ch, std::array<char, 4>& u8Ch) {
    // char32_t is unsigned, so only the upper bound needs checking.
    if (u32Ch > 0x10FFFF) {
        return false;
    }
    if (0xD800 <= u32Ch && u32Ch <= 0xDFFF) {
        return false;
    }

    // Continuation byte (10xxxxxx) carrying the six payload bits that start
    // at bit position `shift`.
    const auto tail = [u32Ch](int shift) {
        return char(0x80 | ((u32Ch >> shift) & 0x3F));
    };

    if (u32Ch < 0x80) {
        // 0xxxxxxx
        u8Ch = {{char(u32Ch), 0, 0, 0}};
    } else if (u32Ch < 0x800) {
        // 110xxxxx 10xxxxxx
        u8Ch = {{char(0xC0 | (u32Ch >> 6)), tail(0), 0, 0}};
    } else if (u32Ch < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        u8Ch = {{char(0xE0 | (u32Ch >> 12)), tail(6), tail(0), 0}};
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        u8Ch = {{char(0xF0 | (u32Ch >> 18)), tail(12), tail(6), tail(0)}};
    }

    return true;
}

UTF-8 から UTF-32

UTF-8 から UTF-32 は以下のようになります。

@akinomyoga さんのプルリクエストより判定にあやまりがあったことが判明しました。
こちら
ありがとうございました。

// Returns the total length in bytes (1-4) of the UTF-8 sequence introduced by
// the lead byte `ch`, or 0 when `ch` cannot start a well-formed sequence.
//
// Lead bytes that yield 0:
//   0x80-0xBF  continuation bytes
//   0xC0-0xC1  could only start an overlong 2-byte form
//   0xF5-0xFF  would encode values above U+10FFFF (or are not UTF-8 at all)
int GetU8ByteCount(char ch) {
    // Compare as unsigned so bytes >= 0x80 do not turn negative where char is
    // a signed type.
    const uint8_t lead = uint8_t(ch);

    if (lead < 0x80) {
        return 1;
    }
    if (lead < 0xC2) {
        return 0;
    }
    if (lead < 0xE0) {
        return 2;
    }
    if (lead < 0xF0) {
        return 3;
    }
    if (lead < 0xF5) {
        return 4;
    }
    return 0;
}

// Reports whether `ch` is a UTF-8 continuation byte, i.e. has the bit pattern
// 10xxxxxx (0x80..0xBF).
bool IsU8LaterByte(char ch) {
    // Keep only the two marker bits and compare them with "10".
    const uint8_t markerBits = uint8_t(ch) & 0xC0;
    return markerBits == 0x80;
}

// Decodes one UTF-8 sequence into a Unicode code point.
//
// u8Ch:  the sequence, starting at u8Ch[0]. Only as many bytes as the lead
//        byte announces (per GetU8ByteCount) are read; the rest are ignored.
// u32Ch: receives the decoded code point on success; untouched on failure.
//
// Returns false when
//   - u8Ch[0] is not a valid lead byte,
//   - a required continuation byte is not of the form 10xxxxxx,
//   - the sequence is overlong (the value would fit in a shorter sequence),
//   - the value is above U+10FFFF, or
//   - the value is a surrogate (U+D800..U+DFFF).
bool ConvChU8ToU32(const std::array<char, 4>& u8Ch, char32_t& u32Ch) {
    const int numBytes = GetU8ByteCount(u8Ch[0]);
    if (numBytes < 1 || numBytes > 4) {
        return false;
    }

    // Indexed by sequence length.
    // kLeadMask: payload bits carried by the lead byte.
    // kMinValue: smallest code point that genuinely needs that many bytes;
    //            anything below it is an overlong encoding.
    static const uint8_t kLeadMask[5] = {0, 0x7F, 0x1F, 0x0F, 0x07};
    static const char32_t kMinValue[5] = {0, 0, 0x80, 0x800, 0x10000};

    // Build the value in a local so u32Ch stays untouched if validation fails.
    char32_t decoded = uint8_t(u8Ch[0]) & kLeadMask[numBytes];
    for (int i = 1; i < numBytes; ++i) {
        if (!IsU8LaterByte(u8Ch[i])) {
            return false;
        }
        decoded = (decoded << 6) | (uint8_t(u8Ch[i]) & 0x3F);
    }

    if (decoded < kMinValue[numBytes]) {
        return false;
    }
    // A 4-byte sequence has 21 payload bits and can express up to 0x1FFFFF.
    if (decoded > 0x10FFFF) {
        return false;
    }
    if (0xD800 <= decoded && decoded <= 0xDFFF) {
        return false;
    }

    u32Ch = decoded;
    return true;
}

UTF-16

UTF-16 は1文字2バイトとして扱う予定でしたが、2バイトでは収まりきらなくなったため、サロゲートペアという方法を用いて2または4バイトとなりました。
サロゲートペアは前半が 0xD800 ～ 0xDBFF、後半が 0xDC00 ～ 0xDFFF の組み合わせにより、16ビットでは表現できない UTF-32 の文字を表します。

UTF-32 から UTF-16

サロゲートペアの計算方法は以下のようになります。
Wikipediaより

u32Ch  UTF-32 の文字
u16Hi  UTF-16 の前半
u16Low UTF-16 の後半
u16Hi  = (u32Ch - 0x10000) / 0x400 + 0xD800
u16Low = (u32Ch - 0x10000) % 0x400 + 0xDC00

コードにすると以下のようになります。

// Encodes a single Unicode code point as UTF-16.
//
// u32Ch: code point to encode.
// u16Ch: receives one code unit followed by 0 for code points below
//        U+10000, or a high/low surrogate pair otherwise.
//
// Returns true on success. Returns false, leaving u16Ch untouched, when u32Ch
// is above U+10FFFF or is itself a surrogate value (U+D800..U+DFFF). Copying
// a surrogate value through would emit an unpaired surrogate, and two such
// values in a row would be read back as one unrelated supplementary character.
bool ConvChU32ToU16(const char32_t u32Ch, std::array<char16_t, 2>& u16Ch) {
    // char32_t is unsigned, so only the upper bound needs checking.
    if (u32Ch > 0x10FFFF) {
        return false;
    }
    if (0xD800 <= u32Ch && u32Ch <= 0xDFFF) {
        return false;
    }

    if (u32Ch < 0x10000) {
        u16Ch[0] = char16_t(u32Ch);
        u16Ch[1] = 0;
    } else {
        // 20-bit offset: upper 10 bits go to the high surrogate, lower 10
        // bits to the low surrogate.
        const char32_t offset = u32Ch - 0x10000;
        u16Ch[0] = char16_t(0xD800 + (offset >> 10));
        u16Ch[1] = char16_t(0xDC00 + (offset & 0x3FF));
    }

    return true;
}

UTF-16 から UTF-32

サロゲートペアの計算方法は以下のようになります。
Wikipediaより

u32Ch  UTF-32 の文字
u16Hi  UTF-16 の前半
u16Low UTF-16 の後半
u32Ch = 0x10000 + (u16Hi - 0xD800) * 0x400 + (u16Low - 0xDC00)

コードにすると以下のようになります。

// Reports whether `ch` is a UTF-16 high (leading) surrogate, 0xD800..0xDBFF.
bool IsU16HighSurrogate(char16_t ch) {
    // Every value in the range shares the top six bits 110110.
    return (ch & 0xFC00) == 0xD800;
}

// Reports whether `ch` is a UTF-16 low (trailing) surrogate, 0xDC00..0xDFFF.
bool IsU16LowSurrogate(char16_t ch) {
    // Every value in the range shares the top six bits 110111.
    return (ch & 0xFC00) == 0xDC00;
}

// Decodes one UTF-16 character (a single unit or a surrogate pair) into a
// Unicode code point.
//
// u16Ch: u16Ch[0] is the first code unit. u16Ch[1] is read only when
//        u16Ch[0] is a high surrogate, in which case it must be the matching
//        low surrogate.
// u32Ch: receives the code point on success; untouched on failure.
//
// Returns false for ill-formed input: a high surrogate that is not followed
// by a low surrogate, or a low surrogate in first position. Unpaired
// surrogates are not passed through, because they are not valid code points.
// Treating "high surrogate + 0" as a lone surrogate would also swallow a
// genuine U+0000 unit that follows a stray high surrogate.
bool ConvChU16ToU32(const std::array<char16_t, 2>& u16Ch, char32_t& u32Ch) {
    const char16_t first = u16Ch[0];

    if (IsU16HighSurrogate(first)) {
        const char16_t second = u16Ch[1];
        if (!IsU16LowSurrogate(second)) {
            return false;
        }
        // Each surrogate contributes 10 bits of the offset above U+10000.
        u32Ch = 0x10000 + ((char32_t(first) - 0xD800) << 10) +
                (char32_t(second) - 0xDC00);
        return true;
    }

    if (IsU16LowSurrogate(first)) {
        return false;
    }

    u32Ch = first;
    return true;
}

UTF-8 から UTF-16 または UTF-16 から UTF-8

一度 UTF-32 に変換し、UTF-8、UTF-16 へ変換します。

UTF-8 から UTF-16

// Converts one UTF-8 encoded character to UTF-16 by decoding it to a code
// point with ConvChU8ToU32 and re-encoding that with ConvChU32ToU16.
//
// Returns false as soon as either step fails; the second step is not
// attempted when the first one fails.
bool ConvChU8ToU16(const std::array<char, 4>& u8Ch,
                   std::array<char16_t, 2>& u16Ch) {
    char32_t codePoint = 0;
    return ConvChU8ToU32(u8Ch, codePoint) && ConvChU32ToU16(codePoint, u16Ch);
}

UTF-16 から UTF-8

// Converts one UTF-16 encoded character to UTF-8 by decoding it to a code
// point with ConvChU16ToU32 and re-encoding that with ConvChU32ToU8.
//
// Returns false as soon as either step fails; the second step is not
// attempted when the first one fails.
bool ConvChU16ToU8(const std::array<char16_t, 2>& u16Ch,
                   std::array<char, 4>& u8Ch) {
    char32_t codePoint = 0;
    return ConvChU16ToU32(u16Ch, codePoint) && ConvChU32ToU8(codePoint, u8Ch);
}

文字列

後は文字列の各文字を変換していきます。

// Converts a UTF-8 string to UTF-16, appending the result to u16Str.
//
// u16Str is not cleared first. When false is returned it still holds every
// character converted before the failure.
//
// Returns false when a lead byte is rejected by GetU8ByteCount, when the
// input ends in the middle of a sequence, or when ConvChU8ToU16 rejects a
// sequence.
bool ConvU8ToU16(const std::string& u8Str, std::u16string& u16Str) {
    auto cursor = u8Str.begin();
    const auto last = u8Str.end();

    while (cursor != last) {
        const int seqLen = GetU8ByteCount(*cursor);
        if (seqLen == 0) {
            return false;
        }
        // The whole sequence must be present in the input.
        if (last - cursor < seqLen) {
            return false;
        }

        std::array<char, 4> seq{};
        for (int i = 0; i < seqLen; ++i) {
            seq[i] = *cursor;
            ++cursor;
        }

        std::array<char16_t, 2> units;
        if (!ConvChU8ToU16(seq, units)) {
            return false;
        }

        u16Str.push_back(units[0]);
        // A second unit of 0 means the character fit into a single unit.
        if (units[1] != 0) {
            u16Str.push_back(units[1]);
        }
    }

    return true;
}

// Converts a UTF-8 string to UTF-32, appending the result to u32Str.
//
// u32Str is not cleared first. When false is returned it still holds every
// character converted before the failure.
//
// Returns false when a lead byte is rejected by GetU8ByteCount, when the
// input ends in the middle of a sequence, or when ConvChU8ToU32 rejects a
// sequence.
bool ConvU8ToU32(const std::string& u8Str, std::u32string& u32Str) {
    auto cursor = u8Str.begin();
    const auto last = u8Str.end();

    while (cursor != last) {
        const int seqLen = GetU8ByteCount(*cursor);
        if (seqLen == 0) {
            return false;
        }
        // The whole sequence must be present in the input.
        if (last - cursor < seqLen) {
            return false;
        }

        std::array<char, 4> seq{};
        for (int i = 0; i < seqLen; ++i) {
            seq[i] = *cursor;
            ++cursor;
        }

        char32_t codePoint = 0;
        if (!ConvChU8ToU32(seq, codePoint)) {
            return false;
        }
        u32Str.push_back(codePoint);
    }

    return true;
}

// Converts a UTF-16 string to UTF-8, appending the result to u8Str.
//
// u8Str is not cleared first. When false is returned it still holds every
// character converted before the failure.
//
// Returns false when the input ends right after a high surrogate or when
// ConvChU16ToU8 rejects a character.
//
// An embedded U+0000 is emitted as a single 0x00 byte, as ConvU8ToU16 does in
// the other direction, rather than being dropped.
bool ConvU16ToU8(const std::u16string& u16Str, std::string& u8Str) {
    auto pos = u16Str.begin();
    const auto stop = u16Str.end();

    while (pos != stop) {
        std::array<char16_t, 2> units = {{*pos, 0}};
        ++pos;
        if (IsU16HighSurrogate(units[0])) {
            // A high surrogate needs the unit that follows it.
            if (pos == stop) {
                return false;
            }
            units[1] = *pos;
            ++pos;
        }

        std::array<char, 4> bytes;
        if (!ConvChU16ToU8(units, bytes)) {
            return false;
        }

        // The first byte is always part of the sequence, even when it is
        // 0x00. Unused trailing slots are 0; continuation bytes never are.
        u8Str.push_back(bytes[0]);
        for (int i = 1; i < 4 && bytes[i] != 0; ++i) {
            u8Str.push_back(bytes[i]);
        }
    }

    return true;
}

// Converts a UTF-16 string to UTF-32, appending the result to u32Str.
//
// u32Str is not cleared first. When false is returned it still holds every
// character converted before the failure.
//
// Returns false when the input ends right after a high surrogate or when
// ConvChU16ToU32 rejects a character.
bool ConvU16ToU32(const std::u16string& u16Str, std::u32string& u32Str) {
    auto pos = u16Str.begin();
    const auto stop = u16Str.end();

    while (pos != stop) {
        std::array<char16_t, 2> units = {{*pos, 0}};
        ++pos;
        if (IsU16HighSurrogate(units[0])) {
            // A high surrogate needs the unit that follows it.
            if (pos == stop) {
                return false;
            }
            units[1] = *pos;
            ++pos;
        }

        char32_t codePoint = 0;
        if (!ConvChU16ToU32(units, codePoint)) {
            return false;
        }
        u32Str.push_back(codePoint);
    }

    return true;
}

// Converts a UTF-32 string to UTF-8, appending the result to u8Str.
//
// u8Str is not cleared first. When false is returned it still holds every
// character converted before the failure.
//
// Returns false when ConvChU32ToU8 rejects a code point.
//
// An embedded U+0000 is emitted as a single 0x00 byte, as ConvU8ToU32 does in
// the other direction, rather than being dropped.
bool ConvU32ToU8(const std::u32string& u32Str, std::string& u8Str) {
    for (const char32_t codePoint : u32Str) {
        std::array<char, 4> bytes;
        if (!ConvChU32ToU8(codePoint, bytes)) {
            return false;
        }

        // The first byte is always part of the sequence, even when it is
        // 0x00. Unused trailing slots are 0; continuation bytes never are.
        u8Str.push_back(bytes[0]);
        for (int i = 1; i < 4 && bytes[i] != 0; ++i) {
            u8Str.push_back(bytes[i]);
        }
    }

    return true;
}

// Converts a UTF-32 string to UTF-16, appending the result to u16Str.
//
// u16Str is not cleared first. When false is returned it still holds every
// character converted before the failure.
//
// Returns false when ConvChU32ToU16 rejects a code point.
//
// An embedded U+0000 is emitted as a single 0x0000 unit, as ConvU16ToU32 does
// in the other direction, rather than being dropped.
bool ConvU32ToU16(const std::u32string& u32Str, std::u16string& u16Str) {
    for (const char32_t codePoint : u32Str) {
        std::array<char16_t, 2> units;
        if (!ConvChU32ToU16(codePoint, units)) {
            return false;
        }

        // The first unit is always part of the character, even when it is 0.
        u16Str.push_back(units[0]);
        // A second unit of 0 means no surrogate pair was needed; a real low
        // surrogate is never 0.
        if (units[1] != 0) {
            u16Str.push_back(units[1]);
        }
    }

    return true;
}

参照

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up