Help us understand the problem. What is going on with this article?

C++でunicode escape sequenceをutf8にする

More than 5 years have passed since last update.

パフォーマンス上の問題から仕方なくC++で書いてる時にうっかりエスケープシーケンスされたUnicodeに出くわす事があります。知らない人向けに書いておくと、\uXXXX(Xは十六進)の6バイトの形でUnicodeの1文字が表される文字の表記方法です。

あとサロゲートペアってのに気を使う必要があります。
Unicode全体には今のところ111万2,064文字分の空間があります(WikipediaのUnicodeの項を参照)。\uXXXXの16進4桁では6万5,536文字しか表せず圧倒的に足りないので、\uXXXX\uXXXXという2つの組で1つのUnicode文字に対応させます。これをサロゲートペアと言います。
詳しくはUTF-8のRFC(RFC 3629)を参照してください。

std::stringの形でエスケープシーケンスされたUnicodeを貰った時に、それを日本語にして表示したいという状況がよくあると思うので、それを実現する関数を書きました。失敗したらfalseが返ります。
第一引数がエスケープシーケンスされた文字列。第二引数がutf8を入れて欲しい文字列。

#include <string>

//   Char. number range  |        UTF-8 octet sequence
//      (hexadecimal)    |              (binary)
//   --------------------+---------------------------------------------
//   0000 0000-0000 007F | 0xxxxxxx
//   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
//   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
//   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

// Returns the numeric value (0-15) of the hexadecimal digit `c`,
// or -1 when `c` is not a hexadecimal digit.
static int unicode_escape_hex_value(char c) {
  if ('0' <= c && c <= '9') return c - '0';
  if ('a' <= c && c <= 'f') return c - 'a' + 10;
  if ('A' <= c && c <= 'F') return c - 'A' + 10;
  return -1;
}

// Appends the UTF-8 encoding of `code_point` to `out`.
// The caller guarantees code_point <= 0x10FFFF and that it is not a surrogate.
static void unicode_escape_append_utf8(unsigned long code_point,
                                       std::string& out) {
  if (code_point <= 0x7f) {
    out.push_back(static_cast<char>(code_point));
  } else if (code_point <= 0x7ff) {
    out.push_back(static_cast<char>(0xc0 | (code_point >> 6)));
    out.push_back(static_cast<char>(0x80 | (code_point & 0x3f)));
  } else if (code_point <= 0xffff) {
    out.push_back(static_cast<char>(0xe0 | (code_point >> 12)));
    out.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)));
    out.push_back(static_cast<char>(0x80 | (code_point & 0x3f)));
  } else {
    out.push_back(static_cast<char>(0xf0 | (code_point >> 18)));
    out.push_back(static_cast<char>(0x80 | ((code_point >> 12) & 0x3f)));
    out.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3f)));
    out.push_back(static_cast<char>(0x80 | (code_point & 0x3f)));
  }
}

// Decodes every `\uXXXX` escape (X = hexadecimal digit, either case) found in
// `src` into UTF-8 and copies all other bytes through unchanged.
//
// A high surrogate (D800-DBFF) must be followed immediately by a `\uXXXX`
// low surrogate (DC00-DFFF); the pair is combined into one code point and
// emitted as a 4-byte UTF-8 sequence.
//
// Parameters:
//   src - text that may contain `\uXXXX` escapes.
//   dst - receives the decoded text on success; left untouched on failure.
//
// Returns true on success. Returns false when an escape has fewer than four
// hexadecimal digits, contains a non-hexadecimal character, or when a
// surrogate is unpaired (lone low surrogate, or high surrogate not directly
// followed by a low surrogate, including at the end of the input).
bool decode_unicode_escape_to_utf8(const std::string& src, std::string& dst) {
  std::string result;
  // Each 6-byte escape yields at most 3 bytes (4 bytes for a 12-byte pair),
  // so the output is never longer than the input.
  result.reserve(src.size());

  bool pending_high_surrogate = false;
  unsigned long high_bits = 0;  // upper 10 bits of the pair, already shifted

  size_t i = 0;
  while (i < src.size()) {
    const bool is_escape =
        src[i] == '\\' && i + 1 < src.size() && src[i + 1] == 'u';

    if (!is_escape) {
      if (pending_high_surrogate) {
        return false;  // high surrogate must be followed by another escape
      }
      result.push_back(src[i]);
      ++i;
      continue;
    }

    // `\uXXXX` needs 6 bytes; refuse to read past the end of the input.
    if (src.size() - i < 6) {
      return false;
    }
    unsigned long unit = 0;
    for (size_t j = 2; j < 6; ++j) {
      const int digit = unicode_escape_hex_value(src[i + j]);
      if (digit < 0) {
        return false;
      }
      unit = unit * 16 + static_cast<unsigned long>(digit);
    }
    i += 6;

    const bool is_high = 0xd800 <= unit && unit <= 0xdbff;
    const bool is_low = 0xdc00 <= unit && unit <= 0xdfff;

    if (pending_high_surrogate) {
      if (!is_low) {
        return false;
      }
      unicode_escape_append_utf8(0x10000 + high_bits + (unit & 0x03ff),
                                 result);
      pending_high_surrogate = false;
    } else if (is_high) {
      pending_high_surrogate = true;
      high_bits = (unit & 0x03ff) << 10;
    } else if (is_low) {
      return false;  // low surrogate without a preceding high surrogate
    } else {
      unicode_escape_append_utf8(unit, result);
    }
  }

  if (pending_high_surrogate) {
    return false;  // input ended in the middle of a surrogate pair
  }
  dst.swap(result);
  return true;
}

使い方はこんな感じ。

#include <assert.h>
#include <iostream>
// Demo: decodes an escaped string and prints the resulting UTF-8 text.
int main() {
  // The backslashes are doubled on purpose: in an ordinary string literal
  // "\u30ed" is a universal-character-name that the compiler itself replaces
  // with the character, so the decoder would never see an escape sequence.
  const std::string input("\\u30ed\\u30c3\\u30af\\u30d5\\u30ea\\u30fc");
  std::string result;

  const bool complete = decode_unicode_escape_to_utf8(input, result);
  assert(complete);
  (void)complete;  // keeps NDEBUG builds free of unused-variable warnings

  std::cout << result << std::endl;
}

出力
ロックフリー

Why not register and get more from Qiita?
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away
Comments
No comments
Sign up for free and join this conversation.
If you already have a Qiita account
Why do not you register as a user and use Qiita more conveniently?
You need to log in to use this function. Qiita can be used more conveniently after logging in.
You seem to be reading articles frequently this month. Qiita can be used more conveniently after logging in.
  1. We will deliver articles that match you
    By following users and tags, you can catch up information on technical fields that you are interested in as a whole
  2. you can read useful information later efficiently
    By "stocking" the articles you like, you can search right away
ユーザーは見つかりませんでした