LoginSignup
3
1

More than 5 years have passed since last update.

ICU の BreakIterator を使って書記素を1つずつ取り出す

Last updated at Posted at 2015-01-30

ICU には異体字セレクターや絵文字の国旗など複数のコードポイントで構成される書記素を扱えるように BreakIterator (ubrk.h) が用意されています。char 型の文字列を BreakIterator に直接渡すことはできないので、UText (utext.h) や UnicodeString (unistr.h) を使います。書記素クラスターを構成するコードポイントの数には上限がないので、バリデーションの際にはバイトサイズやコードポイント単位で文字数を求める必要があります。ICU の API はこちらの記事をご参照ください。

ビルドオプション

C 言語版の場合、次のようになります。

# Compile main.c with clang, linking with -licuuc, then run the resulting binary.
run:
    clang main.c -licuuc -o main
    ./main

C++ の場合、次のとおりです。

# Compile main.cpp with clang++, linking with -licuuc, then run the resulting binary.
run:
    clang++ main.cpp -licuuc -o main
    ./main

C 言語

#include <stdio.h>
#include <unicode/ubrk.h>

void print_each_grapheme(const char *);

/*
 * Demo driver: hands a short UTF-8 sample to print_each_grapheme().
 * The sample contains U+E0101 right after the first ideograph, so that
 * pair is made of more than one code point.
 */
int main(void)
{
    const char *const sample = "葛\U000E0101飾区";

    print_each_grapheme(sample);
    return 0;
}

/*
 * Print every grapheme cluster of a NUL-terminated UTF-8 string to stdout,
 * one cluster per line (while-loop variant).
 *
 * str: UTF-8 text to split; a NULL pointer is ignored.
 *
 * If ICU reports an error while opening the text or the break iterator,
 * the ICU error name is written to stderr and nothing is printed.
 */
void print_each_grapheme(const char *str)
{
    UErrorCode      status = U_ZERO_ERROR;
    UText          *ut = NULL;
    UBreakIterator *bi = NULL;
    int32_t         previous;
    int32_t         current;

    if (str == NULL) {
        return;
    }

    /* ICU calls are no-ops once status holds a failure, so one check suffices. */
    ut = utext_openUTF8(NULL, str, -1, &status);
    bi = ubrk_open(UBRK_CHARACTER, uloc_getDefault(), NULL, 0, &status);
    if (U_SUCCESS(status)) {
        /* Guarded: bi is NULL when ubrk_open failed. */
        ubrk_setUText(bi, ut, &status);
    }

    if (U_FAILURE(status)) {
        fprintf(stderr, "print_each_grapheme: %s\n", u_errorName(status));
    } else {
        previous = ubrk_first(bi);

        while ((current = ubrk_next(bi)) != UBRK_DONE) {
            /*
             * Boundaries over a UTF-8 UText are byte offsets into str, so
             * the cluster is the byte range [previous, current).
             */
            printf("%.*s\n", (int)(current - previous), str + previous);
            previous = current;
        }
    }

    if (bi != NULL) {
        ubrk_close(bi);
    }
    if (ut != NULL) {
        utext_close(ut);
    }
}

for 文を使う場合、次のように書くことができます。

/*
 * Print every grapheme cluster of a NUL-terminated UTF-8 string to stdout,
 * one cluster per line (for-loop variant).
 *
 * str: UTF-8 text to split; a NULL pointer is ignored.
 *
 * If ICU reports an error while opening the text or the break iterator,
 * the ICU error name is written to stderr and nothing is printed.
 */
void print_each_grapheme(const char *str)
{
    UErrorCode      status = U_ZERO_ERROR;
    UText          *ut = NULL;
    UBreakIterator *bi = NULL;
    int32_t         previous;
    int32_t         current;
    int32_t         size;

    if (str == NULL) {
        return;
    }

    /* ICU calls are no-ops once status holds a failure, so one check suffices. */
    ut = utext_openUTF8(NULL, str, -1, &status);
    bi = ubrk_open(UBRK_CHARACTER, uloc_getDefault(), NULL, 0, &status);
    if (U_SUCCESS(status)) {
        /* Guarded: bi is NULL when ubrk_open failed. */
        ubrk_setUText(bi, ut, &status);
    }

    if (U_FAILURE(status)) {
        fprintf(stderr, "print_each_grapheme: %s\n", u_errorName(status));
    } else {
        for (
            previous = ubrk_first(bi), current = ubrk_next(bi);
            current != UBRK_DONE;
            previous = current, current = ubrk_next(bi)
        ) {
            /*
             * Boundaries over a UTF-8 UText are byte offsets into str, so
             * the cluster is the byte range [previous, current).
             */
            size = current - previous;
            printf("%.*s\n", (int)size, str + previous);
        }
    }

    if (bi != NULL) {
        ubrk_close(bi);
    }
    if (ut != NULL) {
        utext_close(ut);
    }
}

C++

#include <iostream>
#include <unicode/brkiter.h>

void print_each_grapheme(std::string);

// Demo driver: passes a short UTF-8 sample to print_each_grapheme().
// The sample contains U+E0101 right after the first ideograph, so that
// pair is made of more than one code point.
int main(void)
{
    const std::string sample = "葛\U000E0101飾区";

    print_each_grapheme(sample);
    return 0;
}

void print_each_grapheme(std::string str)
{
    UText *ut;
    UErrorCode status = U_ZERO_ERROR;
    int32_t previous;
    int32_t current;
    int32_t size;

    BreakIterator *it = BreakIterator::createCharacterInstance(
        Locale::getDefault(), status
    );

    ut = utext_openUTF8(NULL, str.c_str(), -1, &status);
    it->setText(ut, status);
    current = it->first();

    while (current != BreakIterator::DONE) {
        previous = current;
        current = it->next();

        if (current == UBRK_DONE) {
            break;
        }

        size = current - previous;
        std::cout << str.substr(previous, size) << '\n';
    }
}

for 文を使えば、次のように書くことができます。

void print_each_grapheme(std::string str)
{
    UText *ut;
    UErrorCode status = U_ZERO_ERROR;
    int32_t previous;
    int32_t current;
    int32_t size;

    BreakIterator *it = BreakIterator::createCharacterInstance(
        Locale::getDefault(), status
    );

    ut = utext_openUTF8(NULL, str.c_str(), -1, &status);
    it->setText(ut, status);

    for (
        previous = it->first(), current = it->next(); 
        current != BreakIterator::DONE;
        previous = current, current = it->next()   
    ) {

        size = current - previous;
        std::cout << str.substr(previous, size) << '\n';
    }
}
3
1
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
3
1