やりたいこと
Unicode(U+0000~U+10FFFF)の中に、漢字に割り当てられているコードポイントがいくつあるのか、正規表現を使って調べてみようと思います。
調査に使用するのは次の通り
- Code Point(0~1114111)を使う
- 漢字抽出に使う正規表現は
/\p{sc=Han}/u
漢字総数、抽出コード
// Highest valid Unicode code point (U+10FFFF = 1114111).
const MAX_CODE_POINT = 0x10ffff;

// Matches a single character whose Unicode Script property is Han.
// Compiled once here instead of on every loop iteration; it carries no g/y
// flag, so reusing it across test() calls keeps no lastIndex state.
const HAN_PATTERN = /\p{sc=Han}/u;

// One entry per code point matched by HAN_PATTERN, in ascending code point order.
const arr = [];
for (let codePoint = 0; codePoint <= MAX_CODE_POINT; codePoint++) {
  // Surrogate code points (U+D800-U+DFFF) are accepted by fromCodePoint as
  // lone surrogates; they simply fail the script match below.
  const char = String.fromCodePoint(codePoint);
  if (!HAN_PATTERN.test(char)) continue;

  arr.push({
    codepoint: codePoint,
    kanji: char,
    // Number of UTF-16 code units: 1 inside the BMP, 2 for supplementary planes.
    length: char.length,
    unicode: `U+${codePoint.toString(16)}`,
  });
}
console.log(`漢字総数 = ${arr.length}`);
"漢字総数 = 99030" //結果
配列arr内、昇順10
// Print the ten entries at the front of arr (indices 0 through 9).
for (const index of Array(10).keys()) {
  console.log(arr.at(index));
}
// 結果
{
codepoint: 11904,
kanji: "⺀",
length: 1,
unicode: "U+2e80"
}
{
codepoint: 11905,
kanji: "⺁",
length: 1,
unicode: "U+2e81"
}
{
codepoint: 11906,
kanji: "⺂",
length: 1,
unicode: "U+2e82"
}
{
codepoint: 11907,
kanji: "⺃",
length: 1,
unicode: "U+2e83"
}
{
codepoint: 11908,
kanji: "⺄",
length: 1,
unicode: "U+2e84"
}
{
codepoint: 11909,
kanji: "⺅",
length: 1,
unicode: "U+2e85"
}
{
codepoint: 11910,
kanji: "⺆",
length: 1,
unicode: "U+2e86"
}
{
codepoint: 11911,
kanji: "⺇",
length: 1,
unicode: "U+2e87"
}
{
codepoint: 11912,
kanji: "⺈",
length: 1,
unicode: "U+2e88"
}
{
codepoint: 11913,
kanji: "⺉",
length: 1,
unicode: "U+2e89"
}
配列arr内、降順10
// Print the ten entries at the back of arr, last one first
// (negative indices -1 through -10 count from the end).
for (let fromEnd = -1; fromEnd >= -10; fromEnd--) {
  console.log(arr.at(fromEnd));
}
// 結果
{
codepoint: 205743,
kanji: "𲎯",
length: 2,
unicode: "U+323af"
}
{
codepoint: 205742,
kanji: "𲎮",
length: 2,
unicode: "U+323ae"
}
{
codepoint: 205741,
kanji: "𲎭",
length: 2,
unicode: "U+323ad"
}
{
codepoint: 205740,
kanji: "𲎬",
length: 2,
unicode: "U+323ac"
}
{
codepoint: 205739,
kanji: "𲎫",
length: 2,
unicode: "U+323ab"
}
{
codepoint: 205738,
kanji: "𲎪",
length: 2,
unicode: "U+323aa"
}
{
codepoint: 205737,
kanji: "𲎩",
length: 2,
unicode: "U+323a9"
}
{
codepoint: 205736,
kanji: "𲎨",
length: 2,
unicode: "U+323a8"
}
{
codepoint: 205735,
kanji: "𲎧",
length: 2,
unicode: "U+323a7"
}
{
codepoint: 205734,
kanji: "𲎦",
length: 2,
unicode: "U+323a6"
}
配列arr内、ランダム10
// Print ten entries of arr chosen uniformly at random (repeats are possible).
for (let remaining = 10; remaining > 0; remaining--) {
  // The scaled random value is never negative, so truncating equals flooring.
  const pick = Math.trunc(arr.length * Math.random());
  console.log(arr[pick]);
}
// 結果
{
codepoint: 157400,
kanji: "𦛘",
length: 2,
unicode: "U+266d8"
}
{
codepoint: 187686,
kanji: "𭴦",
length: 2,
unicode: "U+2dd26"
}
{
codepoint: 37145,
kanji: "鄙",
length: 1,
unicode: "U+9119"
}
{
codepoint: 136452,
kanji: "𡔄",
length: 2,
unicode: "U+21504"
}
{
codepoint: 147258,
kanji: "𣼺",
length: 2,
unicode: "U+23f3a"
}
{
codepoint: 27462,
kanji: "歆",
length: 1,
unicode: "U+6b46"
}
{
codepoint: 141714,
kanji: "𢦒",
length: 2,
unicode: "U+22992"
}
{
codepoint: 143382,
kanji: "𣀖",
length: 2,
unicode: "U+23016"
}
{
codepoint: 18532,
kanji: "䡤",
length: 1,
unicode: "U+4864"
}
{
codepoint: 21068,
kanji: "剌",
length: 1,
unicode: "U+524c"
}
まとめ
- Unicode内で漢字(Script=Han)に割り当てられているコードポイントの総数 = 99030
- `length`が2の漢字などは、正直全く読めない
- 降順10では、環境依存なだけなのか?それとも未割当なのか?自分の所では「□」表示が多い(スマホだと、もっと「□」表示が多いかも)
以上、まとめでした。
その他、
- こんなのあるよ
- もっと簡潔に出来るよ
- それ間違ってるよ
などありましたら、コメントでお待ちしております。