LoginSignup
9
8

More than 5 years have passed since last update.

ICU を使って UTF-8 の文字列から1文字ずつ取り出す

Last updated at Posted at 2015-01-31

書記素クラスターを扱う必要がある場合、BreakIterator を使う必要があります。こちらの記事をご参照ください。

ビルドオプション

run:
    clang main.c -licuuc -o main
    ./main

C 言語

utf8.h

U8_FWD_1

#include <stdio.h>
#include <string.h>
#include <unicode/utf8.h>

void print_each_char(const char*, size_t);

/* Entry point: hand the sample text and its byte length to print_each_char. */
int main(void)
{
    const char* const text = "?野家";

    print_each_char(text, strlen(text));
    return 0;
}

/*
 * Print every UTF-8 encoded code point of str on its own line.
 *
 * str  - UTF-8 text; it does not have to be NUL-terminated.
 * size - number of bytes of str to process.
 *
 * Does nothing when str is NULL or when size does not fit into the int32_t
 * offsets that the ICU UTF-8 macros operate on.
 */
void print_each_char(const char* str, size_t size)
{
    int32_t length;
    int32_t current = 0;

    if (str == NULL || size > (size_t) INT32_MAX) {
        return;
    }

    /*
     * Convert by value. Reading the size_t object through an int32_t pointer
     * breaks strict aliasing and picks up the wrong half of the value on
     * big-endian 64-bit targets.
     */
    length = (int32_t) size;

    while (current < length) {
        const int32_t previous = current;

        /* Move current past the code point that starts at previous. */
        U8_FWD_1(str, current, length);
        printf("%.*s\n", (int) (current - previous), str + previous);
    }
}

U8_NEXT_OR_FFFD

コードポイントのための変数を用意する必要があります。

/*
 * Print every UTF-8 encoded code point of str on its own line, stepping
 * through the text with U8_NEXT_OR_FFFD.
 *
 * str  - UTF-8 text; it does not have to be NUL-terminated.
 * size - number of bytes of str to process.
 *
 * Does nothing when str is NULL or when size does not fit into the int32_t
 * offsets that the ICU UTF-8 macros operate on.
 */
void print_each_char(const char* str, size_t size)
{
    int32_t length;
    int32_t current = 0;
    UChar32 cp;

    if (str == NULL || size > (size_t) INT32_MAX) {
        return;
    }

    /*
     * Convert by value. Reading the size_t object through an int32_t pointer
     * breaks strict aliasing and picks up the wrong half of the value on
     * big-endian 64-bit targets.
     */
    length = (int32_t) size;

    while (current < length) {
        const int32_t previous = current;

        /* The macro needs cp as an output; only the new offset is used here. */
        U8_NEXT_OR_FFFD(str, current, length, cp);
        (void) cp;
        printf("%.*s\n", (int) (current - previous), str + previous);
    }
}

UText

#include <stdio.h>
#include <unicode/utext.h>

void print_each_char(const char*);

/* Entry point: print the sample text one code point per line. */
int main(void)
{
    const char* const text = "?野家";

    print_each_char(text);
    return 0;
}

void print_each_char(const char* str)
{
    UText *ut;
    UErrorCode status = U_ZERO_ERROR;
    UChar32 cp;
    uint8_t size = 0;
    int64_t previous;

    ut = utext_openUTF8(NULL, str, -1, &status);

    for (cp = utext_next32From(ut, 0); cp > -1; cp = UTEXT_NEXT32(ut)) {
        previous = utext_getPreviousNativeIndex(ut);
        size =  UTEXT_GETNATIVEINDEX(ut) - previous;
        printf("%.*s\n", size, str + previous);
    }
}

UString

#include <stdio.h>
#include <string.h>
#include <unicode/ustring.h>
#include <unicode/utf8.h>

void print_each_char(const char*);

/* Entry point: print the sample text one code point per line. */
int main(void)
{
    const char* const text = "?野家";

    print_each_char(text);
    return 0;
}


void print_each_char(const char* str)
{
    UChar dest[128];
    int32_t capacity = 128;
    int32_t length;
    UErrorCode status;
    int32_t dest_pos = 0;
    UChar32 cp;

    char buf[5];
    int32_t size = 5;
    int32_t pos = 0;
    UBool error;

    u_strFromUTF8(dest, capacity, &length, str, -1, &status);

    while (dest_pos < length) {
        U16_NEXT(dest, dest_pos, length, cp);
        U8_APPEND((uint8_t*) buf, pos, size, cp, error);
        printf("%.*s\n", size, buf);
        memset(buf, 0, size);
        pos = 0;
    }
}

UCharIterator

#include <stdio.h>
#include <string.h>
#include <unicode/uiter.h>
#include <unicode/utf8.h>

void print_each_char(const char*);

/* Entry point: print the sample text one code point per line. */
int main(void)
{
    const char* const text = "?野家";

    print_each_char(text);
    return 0;
}


void print_each_char(const char* str)
{
    UCharIterator iter;
    UChar32 cp = 0;

    char buf[5];
    int32_t size = 5;
    int32_t pos = 0;
    UBool error;

    uiter_setUTF8(&iter, str, -1);

    for (cp = uiter_next32(&iter); cp !=  U_SENTINEL; cp = uiter_next32(&iter)) {
        U8_APPEND((uint8_t*) buf, pos, size, cp, error);
        printf("%.*s\n", size, buf);
        memset(buf, 0, size);
        pos = 0;
    }
}

C++

StringCharacterIterator

ustream.h をインクルードし、ビルドオプションに -licuio を追加すれば、UnicodeString を std::cout で出力させることができる。

#include <iostream>
#include <unicode/schriter.h>
#include <unicode/ustream.h>

void print_each_char(std::string);

// Entry point: print the sample text one code point per line.
int main(void)
{
    const std::string text{"?野家"};

    print_each_char(text);
    return 0;
}

void print_each_char(std::string str)
{
    UnicodeString ustr(str.c_str());
    StringCharacterIterator it(ustr);
    UChar32 cp;

    for (cp = it.first32(); it.hasNext(); cp = it.next32()) {
        UnicodeString buf(cp);
        std::cout << buf << '\n';
    }
}

UnicodeString::toUTF8String を使うのであれば次のとおり。

void print_each_char(std::string str)
{
    UnicodeString ustr(str.c_str());
    StringCharacterIterator it(ustr);
    UChar32 cp;

    for (cp = it.first32(); it.hasNext(); cp = it.next32()) {
        UnicodeString ubuf(cp);
        std::string buf;
        ubuf.toUTF8String(buf);
        std::cout << buf << '\n';
    }
}

UCharCharacterIterator

ustream.h を使うために、ビルドオプションに -licuio を追加する。

#include <iostream>
#include <unicode/uchriter.h>
#include <unicode/ustream.h>

int main(void)
{
    UnicodeString ustr("?野家");
    UCharCharacterIterator it(ustr.getTerminatedBuffer(), ustr.length());
    UChar32 cp;

    while (it.hasNext()) {
        cp = it.next32PostInc();
        UnicodeString buf(cp);
        std::cout << buf << '\n';
    }

    return 0;
}

UnicodeString

UChar 単位であることに注意する必要がある。

#include <iostream>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

void print_each_char(std::string);

// Entry point: print the sample text one code point per line.
int main(void)
{
    const std::string text{"?野家"};

    print_each_char(text);
    return 0;
}

void print_each_char(std::string str)
{
    UnicodeString ustr(str.c_str());
    int32_t length = ustr.length();
    int32_t size = 0;
    UChar32 cp;

    for (int32_t i = 0; i < length; i += size) {
        cp = ustr.char32At(i);
        size = cp < 0x10000 ? 1 : 2;
        UnicodeString buf(cp);
        std::cout << buf << '\n';
    }
}
9
8
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
9
8