書記素クラスターを扱う必要がある場合、BreakIterator を使う必要があります。こちらの記事をご参照ください。
ビルドオプション
run:
clang main.c -licuuc -o main
./main
C 言語
utf8.h
U8_FWD_1
#include <stdio.h>
#include <string.h>
#include <unicode/utf8.h>
void print_each_char(const char*, size_t);
/*
 * Entry point for the U8_FWD_1 sample: hands the sample text and its byte
 * length to print_each_char().
 *
 * NOTE(review): the leading '?' in the literal may be a non-BMP character that
 * was lost in transcription -- confirm against the original sample.
 */
int main(void)
{
    const char* text = "?野家";

    print_each_char(text, strlen(text));
    return 0;
}
/*
 * Prints each UTF-8 encoded code point of a byte string on its own line.
 *
 * str:  start of the UTF-8 text.
 * size: number of bytes of `str` to walk.
 */
void print_each_char(const char* str, size_t size)
{
    /*
     * U8_FWD_1 works with int32_t offsets. Convert the length by value instead
     * of reading `size` through an int32_t*: where size_t is wider than 32 bits
     * the pointer cast only yields the right number on little-endian targets.
     * Lengths beyond the int32_t range are clamped.
     */
    const int32_t length =
        size > (size_t) 0x7fffffff ? (int32_t) 0x7fffffff : (int32_t) size;
    int32_t current = 0;

    while (current < length) {
        const int32_t previous = current;

        /* Advance `current` past one code point (1 to 4 bytes). */
        U8_FWD_1(str, current, length);

        /* The precision limits the output to the bytes of this code point. */
        printf("%.*s\n", (int) (current - previous), str + previous);
    }
}
U8_NEXT_OR_FFFD
コードポイントのための変数を用意する必要があります。
/*
 * Prints each UTF-8 encoded code point of a byte string on its own line,
 * decoding with U8_NEXT_OR_FFFD.
 *
 * str:  start of the UTF-8 text.
 * size: number of bytes of `str` to walk.
 */
void print_each_char(const char* str, size_t size)
{
    /*
     * The macro works with int32_t offsets. Convert the length by value instead
     * of reading `size` through an int32_t*: where size_t is wider than 32 bits
     * the pointer cast only yields the right number on little-endian targets.
     * Lengths beyond the int32_t range are clamped.
     */
    const int32_t length =
        size > (size_t) 0x7fffffff ? (int32_t) 0x7fffffff : (int32_t) size;
    int32_t current = 0;

    while (current < length) {
        const int32_t previous = current;
        UChar32 cp; /* receives the decoded code point; required by the macro */

        U8_NEXT_OR_FFFD(str, current, length, cp);
        (void) cp; /* only the byte span is printed, not the decoded value */

        /* The precision limits the output to the bytes just consumed. */
        printf("%.*s\n", (int) (current - previous), str + previous);
    }
}
UText
#include <stdio.h>
#include <unicode/utext.h>
void print_each_char(const char*);
/*
 * Entry point for the UText sample: passes the sample text to
 * print_each_char().
 *
 * NOTE(review): the leading '?' in the literal may be a non-BMP character that
 * was lost in transcription -- confirm against the original sample.
 */
int main(void)
{
    print_each_char("?野家");
    return 0;
}
/*
 * Prints each code point of a NUL-terminated UTF-8 string on its own line,
 * iterating through a UText opened over the string.
 *
 * str: NUL-terminated UTF-8 text.
 *
 * If the UText cannot be opened, an error is reported on stderr and nothing
 * is printed.
 */
void print_each_char(const char* str)
{
    UErrorCode status = U_ZERO_ERROR;
    UText* ut = utext_openUTF8(NULL, str, -1, &status);
    UChar32 cp;

    if (U_FAILURE(status)) {
        fprintf(stderr, "utext_openUTF8 failed: %s\n", u_errorName(status));
        return;
    }

    /* The iteration functions return a negative value once the end is reached. */
    for (cp = utext_next32From(ut, 0); cp > -1; cp = UTEXT_NEXT32(ut)) {
        /* Native indexes of a UTF-8 UText are byte offsets into `str`. */
        const int64_t previous = utext_getPreviousNativeIndex(ut);
        const int size = (int) (UTEXT_GETNATIVEINDEX(ut) - previous);

        printf("%.*s\n", size, str + previous);
    }

    /* Release the UText; without this every call leaks it. */
    utext_close(ut);
}
UString
#include <stdio.h>
#include <string.h>
#include <unicode/ustring.h>
#include <unicode/utf8.h>
void print_each_char(const char*);
/*
 * Entry point for the UString sample: passes the sample text to
 * print_each_char().
 *
 * NOTE(review): the leading '?' in the literal may be a non-BMP character that
 * was lost in transcription -- confirm against the original sample.
 */
int main(void)
{
    print_each_char("?野家");
    return 0;
}
void print_each_char(const char* str)
{
UChar dest[128];
int32_t capacity = 128;
int32_t length;
UErrorCode status;
int32_t dest_pos = 0;
UChar32 cp;
char buf[5];
int32_t size = 5;
int32_t pos = 0;
UBool error;
u_strFromUTF8(dest, capacity, &length, str, -1, &status);
while (dest_pos < length) {
U16_NEXT(dest, dest_pos, length, cp);
U8_APPEND((uint8_t*) buf, pos, size, cp, error);
printf("%.*s\n", size, buf);
memset(buf, 0, size);
pos = 0;
}
}
UCharIterator
#include <stdio.h>
#include <string.h>
#include <unicode/uiter.h>
#include <unicode/utf8.h>
void print_each_char(const char*);
/*
 * Entry point for the UCharIterator sample: passes the sample text to
 * print_each_char().
 *
 * NOTE(review): the leading '?' in the literal may be a non-BMP character that
 * was lost in transcription -- confirm against the original sample.
 */
int main(void)
{
    print_each_char("?野家");
    return 0;
}
void print_each_char(const char* str)
{
UCharIterator iter;
UChar32 cp = 0;
char buf[5];
int32_t size = 5;
int32_t pos = 0;
UBool error;
uiter_setUTF8(&iter, str, -1);
for (cp = uiter_next32(&iter); cp != U_SENTINEL; cp = uiter_next32(&iter)) {
U8_APPEND((uint8_t*) buf, pos, size, cp, error);
printf("%.*s\n", size, buf);
memset(buf, 0, size);
pos = 0;
}
}
C++
StringCharacterIterator
ustream.h
をインクルードして -licuio
をビルドオプションに追加すれば、UnicodeString を std::cout で出力させることができる。
#include <iostream>
#include <unicode/schriter.h>
#include <unicode/ustream.h>
void print_each_char(std::string);
// Entry point for the StringCharacterIterator sample: passes the sample text
// to print_each_char().
//
// NOTE(review): the leading '?' in the literal may be a non-BMP character that
// was lost in transcription -- confirm against the original sample.
int main(void)
{
    const std::string text = "?野家";

    print_each_char(text);
    return 0;
}
void print_each_char(std::string str)
{
UnicodeString ustr(str.c_str());
StringCharacterIterator it(ustr);
UChar32 cp;
for (cp = it.first32(); it.hasNext(); cp = it.next32()) {
UnicodeString buf(cp);
std::cout << buf << '\n';
}
}
UnicodeString::toUTF8String を使うのであれば次のとおり。
void print_each_char(std::string str)
{
UnicodeString ustr(str.c_str());
StringCharacterIterator it(ustr);
UChar32 cp;
for (cp = it.first32(); it.hasNext(); cp = it.next32()) {
UnicodeString ubuf(cp);
std::string buf;
ubuf.toUTF8String(buf);
std::cout << buf << '\n';
}
}
UCharCharacterIterator
ustream.h
を使うために、ビルドオプションに -licuio
を追加する。
#include <iostream>
#include <unicode/uchriter.h>
#include <unicode/ustream.h>
int main(void)
{
UnicodeString ustr("?野家");
UCharCharacterIterator it(ustr.getTerminatedBuffer(), ustr.length());
UChar32 cp;
while (it.hasNext()) {
cp = it.next32PostInc();
UnicodeString buf(cp);
std::cout << buf << '\n';
}
return 0;
}
UnicodeString
インデックスと長さがコードポイント単位ではなく UChar
単位であることに注意する必要がある。
#include <iostream>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
void print_each_char(std::string);
// Entry point for the UnicodeString sample: passes the sample text to
// print_each_char().
//
// NOTE(review): the leading '?' in the literal may be a non-BMP character that
// was lost in transcription -- confirm against the original sample.
int main(void)
{
    const std::string text = "?野家";

    print_each_char(text);
    return 0;
}
void print_each_char(std::string str)
{
UnicodeString ustr(str.c_str());
int32_t length = ustr.length();
int32_t size = 0;
UChar32 cp;
for (int32_t i = 0; i < length; i += size) {
cp = ustr.char32At(i);
size = cp < 0x10000 ? 1 : 2;
UnicodeString buf(cp);
std::cout << buf << '\n';
}
}