More than 5 years have passed since last update.

不正なバイト列を考慮して UTF-8 の文字列から1文字ずつ取り出す

Last updated at 2015-11-18Posted at 2015-01-19

ライブラリやプログラミング言語で UTF-8 の文字列関数を実装する場合、不正なバイト列に遭遇しても、処理を続けることが求められます。

最小限必要な関数は次の文字のバイト数を返す関数です。
C 言語の標準関数であれば mblen や mbtowc などが挙げられますが、これらの関数は不正なバイト列に遭遇すると、負の値を返すため、処理を止めるか、不正なバイト列よりも後ろの箇所を切り捨てざるを得なくなります。

不正なバイト列のサイズの計算方法は Unicode Standard に記載されています。

バイト列の範囲表

Unicode Standard 7.0 の3章に次のような表が記載されています (Table 3-7. Well-Formed UTF-8 Byte Sequences)。

      コードポイント  1番目のバイト 2番目のバイト 3番目のバイト 4番目のバイト
  U+0000 -   U+007F   00 - 7F
  U+0080 -   U+07FF   C2 - DF    80 - BF
  U+0800 -   U+0FFF   E0         A0 - BF     80 - BF
  U+1000 -   U+CFFF   E1 - EC    80 - BF     80 - BF
  U+D000 -   U+D7FF   ED         80 - 9F     80 - BF
  U+E000 -   U+FFFF   EE - EF    80 - BF     80 - BF
 U+10000 -  U+3FFFF   F0         90 - BF     80 - BF    80 - BF
 U+40000 -  U+FFFFF   F1 - F3    80 - BF     80 - BF    80 - BF
U+100000 - U+10FFFF   F4         80 - 8F     80 - BF    80 - BF

RFC 3629 では次のように定義されています。

UTF8-octets = *( UTF8-char )
UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
UTF8-1      = %x00-7F
UTF8-2      = %xC2-DF UTF8-tail
UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / %xF4 %x80-8F 2( UTF8-tail )
UTF8-tail   = %x80-BF

テストケース

Unicode Standard の3章に記載されるテストケースを使います。

# include <stdio.h>
# include <stdint.h>

# define UTF8_LEAD(c) ((uint8_t)(c) < 0x80 || ((uint8_t)(c) > 0xC1 && (uint8_t)(c) < 0xF5))
# define UTF8_TRAIL(c) (((uint8_t)(c) & 0xC0) == 0x80)

uint8_t utf8_fwd(const char*);
uint8_t utf8_next(const char*, uint32_t*);
void print_each_char(const char*);

int main(void)
{
    // http://en.wikipedia.org/wiki/UTF-8#Examples
    // char* str = "\u0024\u00A2\u20AC\U00010348";
    char* str = "\x24\xC2\xA2\xE2\x82\xAC\xF0\x90\x8D\x88";

    // Unicode 7.0.0 D93b
    // http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf
    char* str2 = "\x41\xC0\xAF\x41\xF4\x80\x80\x41";
    char* str3 = "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";

    puts("expected: <0024 00A2 20AC 10348>");
    print_each_char(str);
    puts("expected: <41 FFFD FFFD FFFD 41>");
    print_each_char(str2);
    puts("expected: <0061 FFFD FFFD FFFD 0062 FFFD 0063 FFFD FFFD 0064>");
    print_each_char(str3);

    return 0;
}

C 言語

PCRE の pcre_valid_utf8.c を参考にしました (mpyw さん情報提供ありがとうございました)。unsigned char の代わりに stdint.h (C99) の uint8_t を使いました。

C 標準の文字列関数との互換性を考慮して print_each_char の引数は const char にしました (STR04-C)。

コードを短くするためにビット演算や状態遷移などを使う選択肢がありますが、判読がむずかしくなったり、処理速度が遅くなる可能性があります。

関数版

文字のバイト数のみ

void print_each_char(const char* str)
{
    uint8_t size;
    while (*str) {
        size = utf8_fwd(str);
        printf("%.*s\n", size, str);
        str += size;   
    }
}

uint8_t utf8_fwd(const char* str)
{
    uint8_t lead = *str;
    uint8_t trail;
    uint8_t count; 
    uint8_t size;

    if (lead == 0) {
        return 0;
    } else if (lead < 0x80) {
        return 1;
    } else if (lead < 0xC2) {
        return 1;
    } else if (lead < 0xE0) {
        count = 2;
    } else if (lead < 0xF0) {
        count = 3;
    } else if (lead < 0xF5) {
        count = 4;
    } else {
        return 1;
    }

    trail = *(++str);

    if (count == 3) {
        if ((lead == 0xE0 && 0xA0 > trail) ||
            (lead == 0xED && trail > 0x9F)
        ) {
            return 1;
        }
    } else if (count == 4) {
        if ((lead == 0xF0 && 0x90 > trail) ||
            (lead == 0xF4 && trail > 0x8F)
        ) {
            return 1;
        }
    }

    for (size = 1; size < count; ++size) {

        if (!UTF8_TRAIL(trail)) {
            return size;
        }

        trail = *(++str);
    }
    return size;
}

コードポイント

void print_each_char(const char* str)
{
    uint8_t size;
    uint32_t cp;

    while (*str) {
        size = utf8_next(str, &cp);

        if (cp == 0xFFFD) {
            printf("U+FFFD %d \xEF\xBF\xBD\n", size);
        } else {
            printf("U+%X %.*s\n", cp, size, str);
        }

        str += size;   
    }
}

uint8_t utf8_next(const char* str, uint32_t* cp)
{
    uint8_t lead = *str;
    uint8_t trail;
    uint8_t count; 
    uint8_t size;

    *cp = 0xFFFD;

    if (lead == 0) {
        return 0;
    } else if (lead < 0x80) {
        *cp = lead;
        return 1;
    } else if (lead < 0xC2) {
        return 1;
    } else if (lead < 0xE0) {
        count = 2;
    } else if (lead < 0xF0) {
        count = 3;
    } else if (lead < 0xF5) {
        count = 4;
    } else {
        return 1;
    }

    trail = *(++str);

    if (count == 3) {
        if ((lead == 0xE0 && 0xA0 > trail) ||
            (lead == 0xED && trail > 0x9F)
        ) {
            return 1;
        }
    } else if (count == 4) {
        if ((lead == 0xF0 && 0x90 > trail) ||
            (lead == 0xF4 && trail > 0x8F)
        ) {
            return 1;
        }
    }

    for (size = 1; size < count; ++size) {

        if (!UTF8_TRAIL(trail)) {
            return size;
        }

        trail = *(++str);
    }

    if (size == 2) {
        *cp = ((lead & 0x1F) << 6) | (trail & 0x3F);
    } else if (size == 3) {
        *cp = ((lead & 0x1F) << 12)
            | (((uint8_t) *(str - 2) & 0x3F) <<  6)
            | ((uint8_t) *(str - 1) & 0x3F);
    } else {
        *cp = ((lead & 0x7) << 18)
            | (((uint8_t) *(str - 3) & 0x3F) << 12)
            | (((uint8_t) *(str - 2) & 0x3F) << 6)
            | ((uint8_t) *(str - 1) & 0x3F);
    }

    return size;
}

マクロ版

ICU の U8_FWD_1、U8_NEXT_OR_FFFD (utf8.h) のようにマクロにすれば速度の改善が期待できます。これらのマクロの使い方はこちらの記事を参照してください。文字数や部分文字列を求める場合、コードポイントが不要なので、その分のコードを省略すれば、さらに速度を改善できます。

文字のバイト数のみ

# define UTF8_TRAIL_BYTES(lead) \
  ((((uint8_t) lead) < 0xC2 ? \
  0 : (((uint8_t) lead) < 0xE0 ? \
  1 : (((uint8_t) lead) < 0xF0 ? \
  2 : (((uint8_t) lead) < 0xF5 ? \
  3 : 0)))))

# define UTF8_FWD(s, i) \
  do { \
    uint8_t __lead = (s)[(i)++]; \
    if (__lead > 0x7F) { \
      uint8_t __count = UTF8_TRAIL_BYTES(__lead); \
      uint8_t __trail = (s)[(i)]; \
      if (__count == 2) { \
        if ((__lead == 0xE0 && (0xA0 > __trail)) || \
            (__lead == 0xED && (__trail > 0x9F)) \
        ) { \
          __count = 0; \
        } \
      } else if (__count == 3) { \
        if ((__lead == 0xF0 && (0x90 > __trail)) || \
            (__lead == 0xF4 && (__trail > 0x8F)) \
        ) { \
          __count = 0; \
        } \
      } \
      while (__count > 0 && UTF8_TRAIL(__trail)) { \
        __trail = (s)[++(i)]; \
        --__count; \
      } \
    } \
  } while (0)

void print_each_char(const char* str)
{
    int32_t previous = 0;
    int32_t current = 0;

    while (str[current]) {
        previous = current;
        UTF8_FWD(str, current);
        printf("%.*s\n", current - previous, str + previous);
    }
}

コードポイント

# define UTF8_CP_UNSAFE(s, i, c) \
  do { \
    (c) = (uint8_t) *((s)+(i)); \
    if ((c) > 0x7F) { \
      if ((c) < 0xE0) { \
        (c) = (((c) & 0x1F) << 6) | ((uint8_t) *((s)+(i)+1) & 0x3F); \
      } else if ((c) < 0xF0) { \
        (c) = (((c) & 0x1F) << 12) \
            | (((uint8_t) *((s)+(i)+1) & 0x3F) << 6) \
            | ((uint8_t) *((s)+(i)+2) & 0x3F); \
      } else { \
        (c) = (((c) & 0x7) << 18) \
            | (((uint8_t) *((s)+(i)+1) & 0x3F) << 12) \
            | (((uint8_t) *((s)+(i)+2) & 0x3F) << 6) \
            | ((uint8_t) *((s)+(i)+3) & 0x3F); \
      }\
    } \
  } while (0)

# define UTF8_TRAIL_BYTES(lead) \
  ((((uint8_t) lead) < 0xC2 ? \
  0 : (((uint8_t) lead) < 0xE0 ? \
  1 : (((uint8_t) lead) < 0xF0 ? \
  2 : (((uint8_t) lead) < 0xF5 ? \
  3 : 0)))))

# define UTF8_NEXT(s, i, c) \
  do { \
    uint8_t __lead = (s)[(i)++]; \
    (c) = UTF8_LEAD(__lead) ? __lead : 0xFFFD; \
    if (__lead > 0x7F) { \
      uint8_t __count = UTF8_TRAIL_BYTES(__lead); \
      uint8_t __size = __count + 1; \
      uint8_t __trail = (s)[(i)]; \
      if (__count == 2) { \
        if ((__lead == 0xE0 && (0xA0 > __trail)) || \
            (__lead == 0xED && (__trail > 0x9F)) \
        ) { \
          (c) = 0xFFFD; \
          __count = 0; \
        } \
      } else if (__count == 3) { \
        if ((__lead == 0xF0 && (0x90 > __trail)) || \
            (__lead == 0xF4 && (__trail > 0x8F)) \
        ) { \
          (c) = 0xFFFD; \
          __count = 0; \
        } \
      } \
      while (__count > 0) { \
        if (!UTF8_TRAIL(__trail)) { \
          (c) = 0xFFFD; \
          break; \
        } \
        __trail = (s)[++(i)]; \
        --__count; \
      } \
      if ((c) == __lead) { \
        UTF8_CP_UNSAFE((s), (i) - __size, (c)); \
      } \
    } \
  } while (0)

void print_each_char(const char* str)
{
    int32_t previous = 0;
    int32_t current = 0;
    int32_t cp;

    while (str[current]) {
        previous = current;
        UTF8_NEXT(str, current, cp);
        if (cp == 0xFFFD) {
            printf("U+%X \xEF\xBF\xBD\n", cp);
        } else {
            printf("U+%X %.*s\n", cp, current - previous, str + previous);
        }

    }
}

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up