More than 3 years have passed since last update.

C++でunicode escape sequenceをutf8にする

Last updated at 2020-10-07 / Posted at 2013-06-28

パフォーマンス上の問題から仕方なくC++で書いてる時にうっかりエスケープシーケンスされたUnicodeに出くわす事があります。知らない人向けに書いておくと、\uXXXX(Xは十六進)の6バイトの形でUnicodeの1文字が表される文字の表記方法です。

あとサロゲートペアってのに気を使う必要があります。
Unicode全体では今のところ111万2,064文字分の空間があり(WikipediaのUnicodeの項を参照)、\uXXXXの16進4桁では6万5,536文字分しか表せず圧倒的に足りないので、\uXXXX\uXXXXという2つの組で1つのUnicode文字に対応させます。この組をサロゲートペアと言います。
詳しくはUTF-8のRFC(RFC 3629)を参照してください。

std::stringの形でエスケープシーケンスされたUnicode貰った時にそれを日本語にして表示したいって状況がよくあると思うのでそれを実現する関数書きました。失敗したらfalseが返ります。
第一引数がエスケープシーケンスされた文字列。第二引数がutf8を入れて欲しい文字列。

#include <string>

//   Char. number range  |        UTF-8 octet sequence
//      (hexadecimal)    |              (binary)
//   --------------------+---------------------------------------------
//   0000 0000-0000 007F | 0xxxxxxx
//   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
//   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
//   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

// Parses exactly four hexadecimal digits of `src` starting at index `pos`.
// On success stores the parsed 16-bit value in `value` and returns true.
// Returns false (leaving `value` untouched) when fewer than four characters
// remain or when any of them is not a hexadecimal digit.
static bool unicode_escape_parse_hex4(const std::string& src, size_t pos,
                                      unsigned long& value) {
  if (pos > src.size() || src.size() - pos < 4) {
    return false;  // truncated escape such as "\u12"
  }
  unsigned long parsed = 0;
  for (size_t j = 0; j < 4; ++j) {
    const char c = src[pos + j];
    unsigned long digit = 0;
    if ('0' <= c && c <= '9') {
      digit = static_cast<unsigned long>(c - '0');
    } else if ('a' <= c && c <= 'f') {
      digit = static_cast<unsigned long>(c - 'a' + 10);
    } else if ('A' <= c && c <= 'F') {
      digit = static_cast<unsigned long>(c - 'A' + 10);
    } else {
      return false;
    }
    parsed = parsed * 16 + digit;
  }
  value = parsed;
  return true;
}

// Appends the UTF-8 encoding of `code_point` to `out`.
// The caller guarantees that `code_point` is at most 0x10FFFF and is not a
// surrogate. The range boundaries follow the table above this function:
// 1 byte up to 0x7F, 2 bytes up to 0x7FF, 3 bytes up to 0xFFFF, else 4 bytes.
static void unicode_escape_append_utf8(unsigned long code_point,
                                       std::string& out) {
  if (code_point <= 0x7f) {
    out.push_back(static_cast<char>(code_point));
  } else if (code_point <= 0x7ff) {
    out.push_back(static_cast<char>(0xc0 | (code_point >> 6)));
    out.push_back(static_cast<char>(0x80 | (code_point & 0x3f)));
  } else if (code_point <= 0xffff) {
    out.push_back(static_cast<char>(0xe0 | (code_point >> 12)));
    out.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)));
    out.push_back(static_cast<char>(0x80 | (code_point & 0x3f)));
  } else {
    // The lead byte carries three payload bits (mask 0x07); a two-bit mask
    // would turn the F4 lead byte of plane 16 into F0.
    out.push_back(static_cast<char>(0xf0 | ((code_point >> 18) & 0x07)));
    out.push_back(static_cast<char>(0x80 | ((code_point >> 12) & 0x3f)));
    out.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)));
    out.push_back(static_cast<char>(0x80 | (code_point & 0x3f)));
  }
}

// Replaces every \uXXXX escape (X = hexadecimal digit, either case) in `src`
// with the UTF-8 encoding of the code point it denotes and stores the result
// in `dst`. All other bytes, including a backslash that is not followed by
// 'u', are copied through unchanged.
//
// A high surrogate (D800-DBFF) must be followed immediately by a low
// surrogate escape (DC00-DFFF); the pair is combined into one code point at
// or above U+10000 and emitted as a single 4-byte sequence.
//
// Returns true on success. Returns false, leaving `dst` unmodified, when
//   - an escape has fewer than four hex digits or a non-hex character,
//   - a high surrogate is not immediately followed by a low surrogate escape
//     (including a high surrogate at the very end of the input), or
//   - a low surrogate appears without a preceding high surrogate.
bool decode_unicode_escape_to_utf8(const std::string& src, std::string& dst) {
  std::string result;
  // Every escape is at least as long as its UTF-8 form (6 bytes -> at most 3,
  // 12 bytes -> 4), so the output never exceeds the input length.
  result.reserve(src.size());

  size_t i = 0;
  while (i < src.size()) {
    const bool is_escape =
        src[i] == '\\' && i + 1 < src.size() && src[i + 1] == 'u';
    if (!is_escape) {
      result.push_back(src[i]);
      ++i;
      continue;
    }

    unsigned long code_point = 0;
    if (!unicode_escape_parse_hex4(src, i + 2, code_point)) {
      return false;
    }
    i += 6;  // backslash, 'u' and four hex digits

    if (0xdc00 <= code_point && code_point <= 0xdfff) {
      return false;  // low surrogate without a preceding high surrogate
    }

    if (0xd800 <= code_point && code_point <= 0xdbff) {
      // High surrogate: the very next token has to be a low surrogate escape.
      const bool next_is_escape =
          i + 1 < src.size() && src[i] == '\\' && src[i + 1] == 'u';
      unsigned long low = 0;
      if (!next_is_escape || !unicode_escape_parse_hex4(src, i + 2, low)) {
        return false;
      }
      if (low < 0xdc00 || 0xdfff < low) {
        return false;
      }
      i += 6;
      code_point = 0x10000 + ((code_point & 0x03ff) << 10) + (low & 0x03ff);
    }

    unicode_escape_append_utf8(code_point, result);
  }

  dst.swap(result);
  return true;
}

使い方はこんな感じ。

#include <assert.h>
#include <iostream>
// Usage example: decodes a string of \uXXXX escapes and prints the resulting
// UTF-8 text to standard output.
int main() {
  // The backslashes are doubled on purpose. With a single backslash the
  // compiler itself would expand each escape as a universal-character-name,
  // and the function would only ever see already-decoded text.
  std::string input("\\u30ed\\u30c3\\u30af\\u30d5\\u30ea\\u30fc");
  std::string result;

  bool complete = decode_unicode_escape_to_utf8(input, result);
  assert(complete);

  std::cout << result << std::endl;
}

出力
ロックフリー

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up