はじめに
Rのstringiパッケージを用いた文字列の変換についてまとめます。
目次
- Rのstringiパッケージによる文字列の変換
- 参考
Rのstringiパッケージによる文字列の変換
まず、使用する文字列ベクトルを用意しておきます。
R
library(tidyverse)
library(stringi)
# Sample strings, one per script/width variant, so that every transform can be
# compared side by side: halfwidth and fullwidth Latin letters, symbols and
# digits; hiragana; fullwidth and halfwidth katakana; kanji; and two mixed
# strings (fullwidth vs. halfwidth alphanumerics and katakana).
# NOTE(review): the width variants (items 3, 4, 6, 9, 12, 13, 14) had been
# collapsed into duplicates of their neighbours; they are reconstructed here
# from the outputs recorded later in this article (stri_trans_isnfkc(),
# Latin-Greek, Hiragana-Latin). Confirm against the original post.
str <- c(
  "alphabet",
  "ALPHABET",
  "alphabet",
  "ALPHABET",
  ".,;:+-*/^_~&| ",
  ".,;:+-*/^_~&|　",
  "ひらがな",
  "カタカナ",
  "ｶﾀｶﾅ",
  "漢字",
  "0123456789",
  "0123456789",
  "エタノール水溶液1,000mL",
  "50%のｴﾀﾉｰﾙ水溶液1,000mg"
)
str
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&|　"
# [7] "ひらがな" "カタカナ"
# [9] "ｶﾀｶﾅ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のｴﾀﾉｰﾙ水溶液1,000mg"
# The 26 lowercase and the 26 uppercase ASCII letters, each collapsed into a
# single string (built from the base constants `letters` / `LETTERS`).
str_alphabet <- c(
  paste(letters, collapse = ""),
  paste(LETTERS, collapse = "")
)
str_alphabet
# [1] "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Hiragana test table: one string per vowel row (plain, voiced and
# semi-voiced syllables), three strings of contracted sounds, and "n".
str_hiragana <- c(
  "あかさたなはまやらわがざだばぱ",                       # a-row
  "いきしちにひみいりゐぎじぢびぴ",                       # i-row
  "うくすつぬふむゆるうぐずづぶぷ",                       # u-row
  "えけせてねへめえれゑげぜでべぺ",                       # e-row
  "おこそとのほもよろをごぞどぼぽ",                       # o-row
  "きゃぎゃしゃじゃちゃぢゃにゃひゃびゃぴゃみゃりゃ",     # -ya sounds
  "きゅぎゅしゅじゅちゅぢゅにゅひゅびゅぴゅみゅりゅ",     # -yu sounds
  "きょぎょしょじょちょぢょにょひょびょぴょみょりょ",     # -yo sounds
  "ん"
)
str_hiragana
# [1] "あかさたなはまやらわがざだばぱ"
# [2] "いきしちにひみいりゐぎじぢびぴ"
# [3] "うくすつぬふむゆるうぐずづぶぷ"
# [4] "えけせてねへめえれゑげぜでべぺ"
# [5] "おこそとのほもよろをごぞどぼぽ"
# [6] "きゃぎゃしゃじゃちゃぢゃにゃひゃびゃぴゃみゃりゃ"
# [7] "きゅぎゅしゅじゅちゅぢゅにゅひゅびゅぴゅみゅりゅ"
# [8] "きょぎょしょじょちょぢょにょひょびょぴょみょりょ"
# [9] "ん"
stri_trans_general():一般的な変換
大文字・小文字
R
# Any -> uppercase. The four spellings of the transform ID below are shown
# with a single shared result.
stri_trans_general(str, id = "Upper")
stri_trans_general(str, id = "upper")
stri_trans_general(str, id = "Any-Upper")
stri_trans_general(str, id = "any-upper")
# [1] "ALPHABET" "ALPHABET"
# [3] "ALPHABET" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ML" "50%のエタノール水溶液1,000MG"
# Any -> lowercase. The four spellings of the transform ID below are shown
# with a single shared result.
stri_trans_general(str, id = "Lower")
stri_trans_general(str, id = "Any-Lower")
stri_trans_general(str, id = "lower")
stri_trans_general(str, id = "any-lower")
# [1] "alphabet" "alphabet"
# [3] "alphabet" "alphabet"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ml" "50%のエタノール水溶液1,000mg"
半角・全角
R
# halfwidth -> fullwidth
stri_trans_general(str, id = "Halfwidth-Fullwidth")
stri_trans_general(str, id = "halfwidth-fullwidth")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# Same transform applied to the full ASCII alphabet.
str_alphabet %>% stri_trans_general(id = "halfwidth-fullwidth")
# [1] "abcdefghijklmnopqrstuvwxyz"
# [2] "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# fullwidth -> halfwidth
stri_trans_general(str, id = "Fullwidth-Halfwidth")
stri_trans_general(str, id = "fullwidth-halfwidth")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# Round trip: widen the alphabet, print the intermediate result, then narrow
# it back to halfwidth.
str_alphabet %>%
  stri_trans_general(id = "halfwidth-fullwidth") %>%
  print() %>%
  stri_trans_general(id = "fullwidth-halfwidth")
# [1] "abcdefghijklmnopqrstuvwxyz"
# [2] "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# [1] "abcdefghijklmnopqrstuvwxyz"
# [2] "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# hiragana -> katakana, print the intermediate result, then apply
# fullwidth -> halfwidth to the katakana.
str_hiragana %>%
  stri_trans_general(id = "Hiragana-Katakana") %>%
  print() %>%
  stri_trans_general(id = "fullwidth-halfwidth")
# [1] "アカサタナハマヤラワガザダバパ"
# [2] "イキシチニヒミイリヰギジヂビピ"
# [3] "ウクスツヌフムユルウグズヅブプ"
# [4] "エケセテネヘメエレヱゲゼデベペ"
# [5] "オコソトノホモヨロヲゴゾドボポ"
# [6] "キャギャシャジャチャヂャニャヒャビャピャミャリャ"
# [7] "キュギュシュジュチュヂュニュヒュビュピュミュリュ"
# [8] "キョギョショジョチョヂョニョヒョビョピョミョリョ"
# [9] "ン"
# [1] "アカサタナハマヤラワガザダバパ" "イキシチニヒミイリヰギジヂビピ"
# [3] "ウクスツヌフムユルウグズヅブプ" "エケセテネヘメエレヱゲゼデベペ"
# [5] "オコソトノホモヨロヲゴゾドボポ" "キャギャシャジャチャヂャニャヒャビャピャミャリャ"
# [7] "キュギュシュジュチュヂュニュヒュビュピュミュリュ" "キョギョショジョチョヂョニョヒョビョピョミョリョ"
# [9] "ン"
ひらがな・カタカナ
R
# hiragana -> katakana
stri_trans_general(str, id = "Hiragana-Katakana")
stri_trans_general(str, id = "hiragana-katakana")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ヒラガナ" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%ノエタノール水溶液1,000mg"
# Same transform applied to the hiragana test table.
str_hiragana %>% stri_trans_general(id = "Hiragana-Katakana")
str_hiragana %>% stri_trans_general(id = "hiragana-katakana")
# [1] "アカサタナハマヤラワガザダバパ"
# [2] "イキシチニヒミイリヰギジヂビピ"
# [3] "ウクスツヌフムユルウグズヅブプ"
# [4] "エケセテネヘメエレヱゲゼデベペ"
# [5] "オコソトノホモヨロヲゴゾドボポ"
# [6] "キャギャシャジャチャヂャニャヒャビャピャミャリャ"
# [7] "キュギュシュジュチュヂュニュヒュビュピュミュリュ"
# [8] "キョギョショジョチョヂョニョヒョビョピョミョリョ"
# [9] "ン"
# katakana -> hiragana
stri_trans_general(str, id = "Katakana-Hiragana")
stri_trans_general(str, id = "katakana-hiragana")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "かたかな"
# [9] "かたかな" "漢字"
# [11] "0123456789" "0123456789"
# [13] "えたのおる水溶液1,000mL" "50%のえたのおる水溶液1,000mg"
英字・ギリシャ文字・キリル文字・ハングル
R
# Latin letters -> Greek letters
stri_trans_general(str, id = "Latin-Greek")
stri_trans_general(str, id = "latin-greek")
# [1] "ἀλφαβετ" "ἈΛΦΑΒΕΤ"
# [3] "alphabet" "ALPHABET"
# [5] ".,;·+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000μγ"
# Latin -> Greek for the whole alphabet. Uses `str_alphabet`, the alphabet
# vector defined in the setup; no `str_latin` object exists in this script.
stri_trans_general(str_alphabet, id = "latin-greek")
# [1] "ἀβκδεφγἱικλμνοπκρστυυυξυζ"
# [2] "ἈΒΚΔΕΦΓἹΙΚΛΜΝΟΠΚΡΣΤΥΥΥΞΥΖ"
# Latin letters -> Cyrillic letters
stri_trans_general(str, id = "Latin-Cyrillic")
stri_trans_general(str, id = "latin-cyrillic")
# [1] "алпхабет" "АЛПХАБЕТ"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000мг"
# Latin -> Cyrillic for the whole alphabet. Uses `str_alphabet`, the alphabet
# vector defined in the setup; no `str_latin` object exists in this script.
stri_trans_general(str_alphabet, id = "latin-cyrillic")
# [1] "абцдефгхийклмнопкрстувуксыз"
# [2] "АБЦДЕФГХИЙКЛМНОПКРСТУВУКСЫЗ"
# Latin letters -> Hangul
stri_trans_general(str, id = "Latin-Hangul")
stri_trans_general(str, id = "latin-hangul")
# [1] "앒하벹" "앒하벹"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000믁"
# Latin letters -> InterIndic (no output is recorded for this pair)
stri_trans_general(str, id = "Latin-Interindic")
stri_trans_general(str, id = "latin-interindic")
ひらがな・カタカナ・英字
R
# hiragana -> Latin letters (romanization)
stri_trans_general(str, id = "Hiragana-Latin")
stri_trans_general(str, id = "hiragana-latin")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "hiragana" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノ̄ル水溶液1,000mL" "50%noエタノール水溶液1,000mg"
# Romanize the whole hiragana test table.
str_hiragana %>% stri_trans_general(id = "Hiragana-Latin")
# [1] "akasatanahamayarawagazadabapa" "ikishichinihimiiriwigijidjibipi"
# [3] "ukusutsunufumuyuruuguzudzubupu" "ekesetenehemeerewegezedebepe"
# [5] "okosotonohomoyorowogozodobopo" "kyagyashajachadjanyahyabyapyamyarya"
# [7] "kyugyushujuchudjunyuhyubyupyumyuryu" "kyogyoshojochodjonyohyobyopyomyoryo"
# [9] "n"
# katakana -> Latin letters (romanization)
stri_trans_general(str, id = "Katakana-Latin")
stri_trans_general(str, id = "katakana-latin")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "katakana"
# [9] "katakana" "漢字"
# [11] "0123456789" "0123456789"
# [13] "etanōru水溶液1,000mL" "50%のetanōru水溶液1,000mg"
# Latin letters -> hiragana
stri_trans_general(str, id = "Latin-Hiragana")
stri_trans_general(str, id = "latin-hiragana")
# [1] "あるぷはべて" "あるぷはべて"
# [3] "alphabet" "ALPHABET"
# [5] "。、;:+-*/^_&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1、000むぐ"
# Latin -> hiragana for the whole alphabet.
str_alphabet %>% stri_trans_general(id = "Latin-Hiragana")
# [1] "あぶくでふぐひじくるむのぷくるすてぅゔうくせぃず"
# [2] "あぶくでふぐひじくるむのぷくるすてぅゔうくせぃず"
# Latin letters -> katakana
stri_trans_general(str, id = "Latin-Katakana")
stri_trans_general(str, id = "latin-katakana")
# [1] "アルプハベテ" "アルプハベテ"
# [3] "アルプハベテ" "アルプハベテ"
# [5] "。、;:+-*/^_&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ムル" "50%のエタノール水溶液1、000ムグ"
# Latin -> katakana for the whole alphabet.
str_alphabet %>% stri_trans_general(id = "Latin-Katakana")
# [1] "アブクデフグヒジクルムノプクルステゥヴウクセィズ"
# [2] "アブクデフグヒジクルムノプクルステゥヴウクセィズ"
# Any script -> Latin letters
stri_trans_general(str, id = "Any-Latin")
stri_trans_general(str, id = "any-latin")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "hiragana" "katakana"
# [9] "katakana" "hàn zì"
# [11] "0123456789" "0123456789"
# [13] "etanōru shuǐ róng yè1,000mL" "50%noetanōru shuǐ róng yè1,000mg"
# Any script -> hiragana
stri_trans_general(str, id = "Any-Hiragana")
stri_trans_general(str, id = "any-hiragana")
# [1] "あるぷはべて" "あるぷはべて"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "かたかな"
# [9] "かたかな" "漢字"
# [11] "0123456789" "0123456789"
# [13] "えたのおる水溶液1,000mL" "50%のえたのおる水溶液1、000むぐ"
# Any script -> katakana
stri_trans_general(str, id = "Any-Katakana")
stri_trans_general(str, id = "any-katakana")
# [1] "アルプハベテ" "アルプハベテ"
# [3] "アルプハベテ" "アルプハベテ"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ヒラガナ" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ムル" "50%ノエタノール水溶液1、000ムグ"
Unicode Normalization
全角英数字を半角に、半角カタカナを全角に、というように “まとも” な形に変換します。'NFKC'が一番まともそう。
R
# NFC through the general transliterator, with and without the "Any-" prefix.
stri_trans_general(str, id = "NFC")
stri_trans_general(str, id = "Any-NFC")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFD through the general transliterator.
stri_trans_general(str, id = "NFD")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFKD through the general transliterator.
stri_trans_general(str, id = "NFKD")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFKC through the general transliterator.
stri_trans_general(str, id = "NFKC")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
複数の変換
複数の変換を「;」で区切って指定すると、まとめて適用することもできます。
R
# Compound ID: katakana -> hiragana, then halfwidth -> fullwidth
str %>% stri_trans_general(id = "katakana-hiragana; halfwidth-fullwidth")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "かたかな"
# [9] "かたかな" "漢字"
# [11] "0123456789" "0123456789"
# [13] "えたのおる水溶液1,000mL" "50%のえたのおる水溶液1,000mg"
# Compound ID: hiragana -> katakana, then fullwidth -> halfwidth
str %>% stri_trans_general(id = "hiragana-katakana; fullwidth-halfwidth")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ヒラガナ" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%ノエタノール水溶液1,000mg"
# Compound ID: halfwidth -> fullwidth, then uppercase
stri_trans_general(str, id = "halfwidth-fullwidth; upper")
stri_trans_general(str, id = "halfwidth-fullwidth; Any-Upper")
# [1] "ALPHABET" "ALPHABET"
# [3] "ALPHABET" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ML" "50%のエタノール水溶液1,000MG"
stri_trans_toupper(), stri_trans_tolower(), stri_trans_totitle():大文字・小文字変換
stringrパッケージのstr_to_upper(), str_to_lower(), str_to_title(), str_to_sentence()と同じようです。
R
# stringi: convert to uppercase
str %>% stri_trans_toupper()
# [1] "ALPHABET" "ALPHABET"
# [3] "ALPHABET" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ML" "50%のエタノール水溶液1,000MG"
# stringr counterpart, for comparison
str %>% str_to_upper()
# [1] "ALPHABET" "ALPHABET"
# [3] "ALPHABET" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ML" "50%のエタノール水溶液1,000MG"
# stringi: convert to lowercase
str %>% stri_trans_tolower()
# [1] "alphabet" "alphabet"
# [3] "alphabet" "alphabet"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ml" "50%のエタノール水溶液1,000mg"
# stringr counterpart, for comparison
str %>% str_to_lower()
# [1] "alphabet" "alphabet"
# [3] "alphabet" "alphabet"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ml" "50%のエタノール水溶液1,000mg"
# stringi: convert to title case
str %>% stri_trans_totitle()
# [1] "Alphabet" "Alphabet"
# [3] "Alphabet" "Alphabet"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000Ml" "50%のエタノール水溶液1,000mg"
# stringr counterpart, for comparison
str %>% str_to_title()
# [1] "Alphabet" "Alphabet"
# [3] "Alphabet" "Alphabet"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000Ml" "50%のエタノール水溶液1,000mg"
# Title case on two-word input: each string repeated, joined by a space.
str_c(str, " ", str) %>% stri_trans_totitle()
# [1] "Alphabet Alphabet" "Alphabet Alphabet"
# [3] "Alphabet Alphabet" "Alphabet Alphabet"
# [5] ".,;:+-*/^_~&| .,;:+-*/^_~&| " ".,;:+-*/^_~&| .,;:+-*/^_~&| "
# [7] "ひらがな ひらがな" "カタカナ カタカナ"
# [9] "カタカナ カタカナ" "漢字 漢字"
# [11] "0123456789 0123456789" "0123456789 0123456789"
# [13] "エタノール水溶液1,000Ml エタノール水溶液1,000Ml" "50%のエタノール水溶液1,000mg 50%のエタノール水溶液1,000mg"
# stringr counterpart on the same two-word input.
str_c(str, " ", str) %>% str_to_title()
# [1] "Alphabet Alphabet" "Alphabet Alphabet"
# [3] "Alphabet Alphabet" "Alphabet Alphabet"
# [5] ".,;:+-*/^_~&| .,;:+-*/^_~&| " ".,;:+-*/^_~&| .,;:+-*/^_~&| "
# [7] "ひらがな ひらがな" "カタカナ カタカナ"
# [9] "カタカナ カタカナ" "漢字 漢字"
# [11] "0123456789 0123456789" "0123456789 0123456789"
# [13] "エタノール水溶液1,000Ml エタノール水溶液1,000Ml" "50%のエタノール水溶液1,000mg 50%のエタノール水溶液1,000mg"
# Sentence case: `type = "sentence"` is passed on to the break iterator.
str_c(str, " ", str) %>% stri_trans_totitle(type = "sentence")
# [1] "Alphabet alphabet" "Alphabet alphabet"
# [3] "Alphabet alphabet" "Alphabet alphabet"
# [5] ".,;:+-*/^_~&| .,;:+-*/^_~&| " ".,;:+-*/^_~&| .,;:+-*/^_~&| "
# [7] "ひらがな ひらがな" "カタカナ カタカナ"
# [9] "カタカナ カタカナ" "漢字 漢字"
# [11] "0123456789 0123456789" "0123456789 0123456789"
# [13] "エタノール水溶液1,000ml エタノール水溶液1,000ml" "50%のエタノール水溶液1,000mg 50%のエタノール水溶液1,000mg"
# stringr counterpart of the sentence-case call.
str_c(str, " ", str) %>% str_to_sentence()
# [1] "Alphabet alphabet" "Alphabet alphabet"
# [3] "Alphabet alphabet" "Alphabet alphabet"
# [5] ".,;:+-*/^_~&| .,;:+-*/^_~&| " ".,;:+-*/^_~&| .,;:+-*/^_~&| "
# [7] "ひらがな ひらがな" "カタカナ カタカナ"
# [9] "カタカナ カタカナ" "漢字 漢字"
# [11] "0123456789 0123456789" "0123456789 0123456789"
# [13] "エタノール水溶液1,000ml エタノール水溶液1,000ml" "50%のエタノール水溶液1,000mg 50%のエタノール水溶液1,000mg"
stri_trans_nfc(), stri_trans_nfd() 等:Unicode Normalization
NFC, NFKC, NFD, NFKD, or NFKC_Casefold Unicode Normalization Form
全角英数字を半角に、半角カタカナを全角に、というように “まとも” な形に変換します。'NFKC'が一番まともそう。
R
# Unicode normalization forms
# NFC: canonical decomposition, then canonical composition
str %>% stri_trans_nfc()
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFD: canonical decomposition only
str %>% stri_trans_nfd()
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFKC: compatibility decomposition, then canonical composition
str %>% stri_trans_nfkc()
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFKD: compatibility decomposition only
str %>% stri_trans_nfkd()
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# NFKC_Casefold: NFKC combined with case folding and removal of ignorable
# characters (introduced with Unicode 5.2)
str %>% stri_trans_nfkc_casefold()
# [1] "alphabet" "alphabet"
# [3] "alphabet" "alphabet"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000ml" "50%のエタノール水溶液1,000mg"
# Is each string already in NFC?
str %>% stri_trans_isnfc()
# [1] TRUE TRUE
# [3] TRUE TRUE
# [5] TRUE TRUE
# [7] TRUE TRUE
# [9] TRUE TRUE
# [11] TRUE TRUE
# [13] TRUE TRUE
# Is each string already in NFD?
str %>% stri_trans_isnfd()
# [1] TRUE TRUE
# [3] TRUE TRUE
# [5] TRUE TRUE
# [7] FALSE TRUE
# [9] TRUE TRUE
# [11] TRUE TRUE
# [13] TRUE TRUE
# Is each string already in NFKD?
str %>% stri_trans_isnfkd()
# [1] TRUE TRUE
# [3] FALSE FALSE
# [5] TRUE FALSE
# [7] FALSE TRUE
# [9] FALSE TRUE
# [11] TRUE FALSE
# [13] FALSE FALSE
# Is each string already in NFKC?
str %>% stri_trans_isnfkc()
# [1] TRUE TRUE
# [3] FALSE FALSE
# [5] TRUE FALSE
# [7] TRUE TRUE
# [9] FALSE TRUE
# [11] TRUE FALSE
# [13] FALSE FALSE
# Is each string already in NFKC_Casefold form?
str %>% stri_trans_isnfkc_casefold()
# [1] TRUE FALSE
# [3] FALSE FALSE
# [5] TRUE FALSE
# [7] TRUE TRUE
# [9] FALSE TRUE
# [11] TRUE FALSE
# [13] FALSE FALSE
NFKCとNFKDは同じように見えますが、違うようです。
R
# Element-wise comparison of the NFKD and NFKC results.
stri_trans_nfkd(str) == stri_trans_nfkc(str)
# [1] TRUE TRUE
# [3] TRUE TRUE
# [5] TRUE TRUE
# [7] FALSE TRUE
# [9] TRUE TRUE
# [11] TRUE TRUE
# [13] TRUE TRUE
# Code point counts after each normalization: NFKD leaves the voiced kana
# decomposed into a base kana plus a combining mark, hence one more.
"ひらがな" %>% stri_trans_nfkc() %>% str_length()
# [1] 4
"ひらがな" %>% stri_trans_nfkd() %>% str_length()
# [1] 5
stri_trans_char():対応する1文字ずつの置換
対応する1文字ずつを置換します。
R
# lowercase Latin letters -> uppercase Latin letters, mapped one-to-one
stri_trans_char(
  str,
  pattern = "abcdefghijklmnopqrstuvwxyz",
  replacement = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
)
# [1] "ALPHABET" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000MG"
# halfwidth Latin letters -> fullwidth Latin letters (uppercase only).
# The replacement must be the fullwidth alphabet; with two identical
# halfwidth strings the call maps every letter to itself.
stri_trans_char(
  str,
  pattern = "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
  replacement = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
)
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# lowercase a/i/u/e/o -> fullwidth uppercase A/I/U/E/O.
# The first call spells the fullwidth replacement out literally; the second
# derives the same replacement with stri_trans_general().
stri_trans_char(str, pattern = "aiueo", replacement = "AIUEO")
stri_trans_char(
  str,
  pattern = "aiueo",
  replacement = stri_trans_general("aiueo", id = "halfwidth-fullwidth; upper")
)
# [1] "AlphAbEt" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# halfwidth and fullwidth comma -> halfwidth and fullwidth underscore.
# The pattern lists both comma widths, each paired with the underscore of
# the same width.
stri_trans_char(str, pattern = ",,", replacement = "__")
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] "._;:+-*/^_~&| " "._;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1_000mL" "50%のエタノール水溶液1_000mg"
# halfwidth digits and comma -> fullwidth digits and comma.
# The first call spells the fullwidth replacement out literally; the second
# derives the same replacement with stri_trans_general().
stri_trans_char(str, pattern = "0123456789,", replacement = "0123456789,")
stri_trans_char(
  str,
  pattern = "0123456789,",
  replacement = stri_trans_general("0123456789,", id = "halfwidth-fullwidth")
)
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".,;:+-*/^_~&| " ".,;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "0123456789" "0123456789"
# [13] "エタノール水溶液1,000mL" "50%のエタノール水溶液1,000mg"
# Arabic digits and comma (halfwidth, then fullwidth) -> kanji numerals and
# the ideographic comma. Listing both widths in the pattern converts either
# form; the replacement repeats the kanji set once per width.
stri_trans_char(
  str,
  pattern = "0123456789,0123456789,",
  replacement = "〇一二三四五六七八九、〇一二三四五六七八九、"
)
# [1] "alphabet" "ALPHABET"
# [3] "alphabet" "ALPHABET"
# [5] ".、;:+-*/^_~&| " ".、;:+-*/^_~&| "
# [7] "ひらがな" "カタカナ"
# [9] "カタカナ" "漢字"
# [11] "〇一二三四五六七八九" "〇一二三四五六七八九"
# [13] "エタノール水溶液一、〇〇〇mL" "五〇%のエタノール水溶液一、〇〇〇mg"
stri_datetime_format():日付フォーマット
R
## Date
# Build a date-time from year, month and day; the time of day is left at the
# function's default.
date <- stri_datetime_create(year = 1964, month = 10, day = 10)
date
# [1] "1964-10-10 12:00:00 JST"
# Format the same date under different calendars, chosen with the
# `@calendar=` locale keyword.
# Japanese imperial calendar, long and full date styles
stri_datetime_format(date, format = "date_long", locale = "@calendar=japanese")
# [1] "昭和39年10月10日"
stri_datetime_format(date, format = "date_full", locale = "@calendar=japanese")
# [1] "昭和39年10月10日土曜日"
# Same calendar with the language/region given explicitly
stri_datetime_format(date, format = "date_full", locale = "ja_JP@calendar=japanese")
# [1] "昭和39年10月10日土曜日"
# NOTE(review): "us" does not look like a calendar type; the recorded output
# is a Gregorian date, so "gregorian" may have been intended. Confirm.
stri_datetime_format(date, format = "date_full", locale = "@calendar=us")
# [1] "1964年10月10日土曜日"
# Hebrew calendar
stri_datetime_format(date, format = "date_full", locale = "@calendar=hebrew")
# [1] "AM5725年2月4日土曜日"
参考
- https://cran.r-project.org/web/packages/stringi/index.html
- https://github.com/gagolews/stringi
- https://stringi.gagolewski.com/
- https://stringi.gagolewski.com/_static/vignette/stringi.pdf
- R Package stringi Reference
- https://stringi.gagolewski.com/rapi.html
- https://stringi.gagolewski.com/rapi/stri_trans_general.html
- https://stringi.gagolewski.com/rapi/stri_trans_casemap.html
- https://stringi.gagolewski.com/rapi/stri_trans_nf.html
- https://stringi.gagolewski.com/rapi/stri_trans_char.html
- https://stringi.gagolewski.com/rapi/stri_datetime_create.html
- https://stringi.gagolewski.com/rapi/stri_datetime_format.html
- http://userguide.icu-project.org/transforms/general
- https://r4ds.had.co.nz/strings.html#stringi