はじめに
機械学習の勉強を始めたVBAユーザです。
備忘録としてPython・Rの文法をVBAと比較しながらまとめていきたいと思います。
目次
文字列操作
以前の記事では、PythonとRの文字列操作についてVBAとの比較をしましたが、その続きで、Rの文字列操作のパッケージstringr
を使ってみます。
参考記事に「R標準のbase
パッケージが提供する関数でも文字列処理は可能だが、 stringr
のほうが統一的なインターフェイスに合理的な挙動で使いやすい。」とありますが、実際に使ってみると、確かに、関数のネーミングと引数の順序に統一性があって、感動的に使いやすいです。
参考:stringr — Rの文字列をまともな方法で処理する
文字列の結合
R(stringr)
library(stringr)
# Three fragments to be joined end to end.
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
# str_c() pastes its arguments together; the separator is the empty string
# by default and is spelled out here only to make that visible.
str_c(s1, s2, s3, sep = "")
# "abcdefghij"
文字列の長さ
R(stringr)
# Count the characters in a single string.
s <- "abcdefghij"
str_length(string = s)
# 10
文字列の取り出し
R(stringr)
s <- "abcdefghij"
# Positions are 1-based and inclusive; negative positions count back from
# the end of the string.
str_sub(s, start = 1, end = 2) # leftmost two characters
# "ab"
str_sub(s, start = -2, end = -1) # rightmost two characters
# "ij"
str_sub(s, start = 4, end = 6) # characters 4 through 6
# "def"
文字列の検索
R(stringr)
s <- "abcdefghij"
t <- str_c(s, s, sep = "") # "abcdefghijabcdefghij"
# str_detect(): does the pattern occur at all?
str_detect(s, "def")
# TRUE
str_detect(t, "def")
# TRUE
# str_count(): how many times does the pattern occur?
str_count(s, "def")
# 1
str_count(t, "def")
# 2
# str_locate(): start/end of the FIRST match only, returned as a matrix.
str_locate(s, "def")
#      start end
# [1,]     4   6
str_locate(t, "def")
#      start end
# [1,]     4   6
class(str_locate(t, "def"))
# "matrix" "array"   (R >= 4.0.0; older R versions print only "matrix")
# str_locate_all(): every match, returned as a list holding one matrix per
# input string.
str_locate_all(t, "def")
# [[1]]
#      start end
# [1,]     4   6
# [2,]    14  16
class(str_locate_all(t, "def"))
# "list"
文字列の置換
R(stringr)
s <- "abcdefghij"
t <- str_c(s, s, sep = "") # "abcdefghijabcdefghij"
# str_replace() rewrites only the first match in each string.
str_replace(s, pattern = "def", replacement = "DEF")
# "abcDEFghij"
str_replace(t, pattern = "def", replacement = "DEF")
# "abcDEFghijabcdefghij"
# str_replace_all() rewrites every match.
str_replace_all(t, pattern = "def", replacement = "DEF")
# "abcDEFghijabcDEFghij"
文字列の変換
大文字と小文字の変換
R(stringr)
s <- "abcDEFghij"
str_to_upper(s) # everything to upper case
# "ABCDEFGHIJ"
str_to_lower(s) # everything to lower case
# "abcdefghij"
str_to_title(s) # title case: first letter of every word upper, rest lower
# "Abcdefghij"
str_to_sentence(s) # sentence case: only the first letter of the string upper
# "Abcdefghij"
# The two functions differ only when the string holds several words.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"
# Swap upper and lower case: split the string into single characters, flip
# each one, and glue the pieces back together. seq_len() (instead of 1:n)
# keeps this correct when the string is empty.
pos <- seq_len(str_length(s))
chars <- str_sub(s, pos, pos)
is_lower <- chars == str_to_lower(chars)
is_upper <- chars == str_to_upper(chars)
swapped <- chars
swapped[is_lower] <- str_to_upper(chars[is_lower])
# Only characters that are not already lower case are lowered, so a character
# that matches neither test is left untouched.
to_lower <- !is_lower & is_upper
swapped[to_lower] <- str_to_lower(chars[to_lower])
t <- str_c(swapped, collapse = "")
t # upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s) # is the whole string upper case?
# FALSE
s == str_to_lower(s) # is the whole string lower case?
# FALSE
全角と半角の変換
R(stringr)
文字列の反転
R(stringr)
s <- "abcdefghij"
# Reverse the string: negative positions index from the end, so -1, -2, ...
# extracts the characters last-to-first in a single vectorised str_sub() call;
# collapse = "" then joins them into one string. seq_len() avoids the 1:0
# trap when the string is empty.
pos <- -seq_len(str_length(s))
t <- str_c(str_sub(s, pos, pos), collapse = "")
t
# "jihgfedcba"
文字列の繰り返し
R(stringr)
# str_dup() repeats a string the requested number of times.
str_dup("A", times = 3)
# "AAA"
str_dup("def", times = 3)
# "defdefdef"
スペース
スペースの文字列
R(stringr)
# A run of three spaces, framed by hyphens so its width is visible.
str_c("-", str_dup(" ", 3), "-")
# "-   -"
# Test string: 2, 3, 4 and 5 spaces placed around the letters d, e and f.
s <- str_c(str_dup(" ", 2), "d",
           str_dup(" ", 3), "e",
           str_dup(" ", 4), "f",
           str_dup(" ", 5))
str_c("-", s, "-")
# "-  d   e    f     -"
前後の不要なスペースの削除
R(stringr)
# `s` is the padded string "  d   e    f     " built in the preceding snippet
# (2 leading spaces, 5 trailing spaces). str_trim() never touches the spaces
# between the letters.
str_trim(s, side = "left") # drop leading whitespace only
# "d   e    f     "
str_trim(s, side = "right") # drop trailing whitespace only
# "  d   e    f"
str_trim(s, side = "both") # drop whitespace at both ends
# "d   e    f"
文字列ベクトルについて
stringr
パッケージの関数は、文字列(1個の文字列)だけでなく、文字列のベクトルやデータフレームでも使えます。
例えば、3個の文字列からなる文字列ベクトルにstr_length
関数を使うと、その各要素の文字列に対してそれぞれstr_length
関数を使った結果の数値3個からなるベクトルが返ります。
R(stringr)
# Three overlapping ten-letter strings gathered into one character vector.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
# Each call below is applied to every element of the vector in turn.
str_c(ss, "_1", sep = "")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"
str_length(string = ss)
# [1] 10 10 10
str_sub(ss, start = 1, end = 2)
# [1] "ab" "cd" "ef"
str_sub(ss, start = -2, end = -1)
# [1] "ij" "kl" "mn"
str_sub(ss, start = 2, end = 3)
# [1] "bc" "de" "fg"
str_detect(ss, pattern = "def")
# [1]  TRUE  TRUE FALSE
str_count(ss, pattern = "def")
# [1] 1 1 0
# One row per element; an element without a match yields NA NA.
str_locate(ss, pattern = "def")
#      start end
# [1,]     4   6
# [2,]     2   4
# [3,]    NA  NA
# One matrix per element; an element without a match yields a zero-row matrix.
str_locate_all(ss, pattern = "def")
# [[1]]
#      start end
# [1,]     4   6
#
# [[2]]
#      start end
# [1,]     2   4
#
# [[3]]
#      start end
#
str_replace(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_to_upper(string = ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(string = ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
# Element-wise "is it all upper / all lower case?" checks.
ss == str_to_upper(ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(ss)
# [1] TRUE TRUE TRUE
str_dup(ss, times = 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"
# Pad every element so that the trimming functions have something to remove.
tt <- str_c(" ", ss, " _1 ", sep = "")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
str_trim(tt, side = "both")
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side = "left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side = "right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"
ベクトルやデータフレームについては、また別の記事でまとめたいと思います。
まとめ
一覧
各言語で使用する文字列操作関数等を一覧にまとめます。比較のために、EXCELでの計算も示しました。
s1 = "abc"
s2 = "def"
s3 = "ghij"
s = "abcdefghij"
t = "abcdefghijabcdefghij"
u = "abcDEFghij"
v = "abcDEFghij"
w = "  d   e    f     "
とします。また、EXCELのセルにそれぞれ
A1セル:="abc"
A2セル:="def"
A3セル:="ghij"
A4セル:="abcdefghij"
A5セル:="abcdefghijabcdefghij"
A6セル:="abcDEFghij"
A7セル:="abcDEFghij"
A8セル:="  d   e    f     "
が入力されているものとします。
文字列の基本的操作
||Python|R|R(stringr)|VBA|EXCEL|結果|
|:--|:--|:--|:--|:--|:--|:--|
|結合|s1 + s2 + s3|paste0(s1, s2, s3)<br>paste(s1, s2, s3, sep="")|str_c(s1, s2, s3)|s1 & s2 & s3|=A1&A2&A3<br>=CONCATENATE(<br>A1,A2,A3)|abcdefghij|
|長さ|len(s)|nchar(s)|str_length(s)|Len(s)|=LEN(A4)|10|
|反転|s[::-1]|||StrReverse(s)||jihgfedcba|
|繰り返し|'A' * 3||str_dup("A", 3)|String(3, "A")|=REPT("A",3)|AAA|
|繰り返し|'def' * 3||str_dup("def", 3)||=REPT("def",3)|defdefdef|
文字列の取り出し
||Python|R|R(stringr)|VBA|EXCEL|結果|
|:--|:--|:--|:--|:--|:--|:--|
|左から|s[0:2]<br>s[:2]|substr(s, 1, 2)<br>substring(s, 1, 2)|str_sub(s, 1, 2)|Left(s, 2)|=LEFT(A4,2)|ab|
|右から|s[8:10]<br>s[len(s)-2:len(s)]<br>s[-2:]|substr(s,<br>nchar(s)-2+1,<br>nchar(s))|str_sub(s, -2, -1)|Right(s, 2)|=RIGHT(A4,2)|ij|
|途中|s[3:6]|substr(s, 4, 6)|str_sub(s, 4, 6)|Mid(s, 4, 3)|=MID(A4,4,3)|def|
注意)「途中」の文字列の取り出しについて、Python, Rの関数では取り出す文字列を「どこからどこまで」と指定しますが、VBA, EXCELの関数では「どこから何文字分」と指定します。
文字列の検索
||Python|R|R(stringr)|VBA|EXCEL|結果|
|:--|:--|:--|:--|:--|:--|:--|
|検索|s.find('def')||str_locate(s, "def")|InStr(1, s, "def")|=FIND("def",A4,1)<br>=SEARCH("def",A4,1)|3(Python、0始まり)<br>4(その他、1始まり)|
|後ろからの検索|t.rfind('def')|||InStrRev(t, "def")||13(Python、0始まり)<br>14(VBA、1始まり)|
|カウント|t.count('def')||str_count(t, "def")|||2|
注意)str_detect
, str_locate
関数については、上記参照。
文字列の置換
||Python|R|R(stringr)|VBA|EXCEL|結果|
|:--|:--|:--|:--|:--|:--|:--|
|置換|s.replace('def', 'DEF')|sub("def", "DEF", s)|str_replace(s, "def", "DEF")|Replace(s, "def", "DEF")|=SUBSTITUTE(<br>A4,"def","DEF")<br>=REPLACE(A4,<br>FIND("def",A4),<br>LEN("def"),"DEF")|abcDEFghij|
|最初の1つだけ置換||sub("def", "DEF", t)|str_replace(t, "def", "DEF")|||abcDEFghij<br>abcdefghij|
|すべて置換|t.replace('def', 'DEF')|gsub("def", "DEF", t)|str_replace_all(t, "def", "DEF")|Replace(t, "def", "DEF")|=SUBSTITUTE(<br>A5,"def","DEF")|abcDEFghij<br>abcDEFghij|
文字列の変換
||Python|R|R(stringr)|VBA|EXCEL|結果|
|:--|:--|:--|:--|:--|:--|:--|
|大文字に|u.upper()|toupper(u)|str_to_upper(u)|UCase(u)|=UPPER(A6)|ABCDEFGHIJ|
|小文字に|u.lower()|tolower(u)|str_to_lower(u)|LCase(u)|=LOWER(A6)|abcdefghij|
|先頭のみ大文字・それ以外は小文字に|u.capitalize()||str_to_title(u)<br>str_to_sentence(u)|StrConv(u, vbProperCase)|=PROPER(A6)|Abcdefghij|
|大文字と小文字を入れ替え|u.swapcase()|chartr("A-Za-z", "a-zA-Z", u)||||ABCdefGHIJ|
|大文字かどうかの判定|u.isupper()|u == toupper(u)|u == str_to_upper(u)|||False|
|小文字かどうかの判定|u.islower()|u == tolower(u)|u == str_to_lower(u)|||False|
|全角に||chartr("A-Za-z", "A-Za-z", u)||StrConv(u, vbWide)|=JIS(A6)|abcDEFghij|
|半角に||chartr("A-Za-z", "A-Za-z", v)||StrConv(v, vbNarrow)|=ASC(A7)|abcDEFghij|
文字列のスペース
||Python|R|R(stringr)|VBA|EXCEL|結果|
|:--|:--|:--|:--|:--|:--|:--|
|スペース|' ' * 3||str_dup(" ", 3)|Space(3)|=REPT(" ",3)|"   "|
|両側スペース削除|w.strip(' ')||str_trim(w, side="both")|Trim(w)|=TRIM(A8)|"d   e    f"|
|左スペース削除|w.lstrip(' ')||str_trim(w, side="left")|LTrim(w)||"d   e    f     "|
|右スペース削除|w.rstrip(' ')||str_trim(w, side="right")|RTrim(w)||"  d   e    f"|
注意)EXCELのTRIM関数は文字列の中のスペースも1つを除いて削除されてd e f
となります。
プログラム全体
参考までに使ったプログラムの全体を示します。
Python, VBAのコードは前回の記事参照。
R(stringr)
library(stringr)
# String concatenation
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
str_c(s1, s2, s3)
# "abcdefghij"
# String length
s <- "abcdefghij"
str_length(s)
# 10
# Extracting substrings (positions are 1-based and inclusive; negative
# positions count back from the end)
s <- "abcdefghij"
str_sub(s, 1, 2)
# "ab"
str_sub(s, -2, -1)
# "ij"
str_sub(s, 4, 6)
# "def"
# Searching within a string
s <- "abcdefghij"
t <- str_c(s, s, sep = "") # "abcdefghijabcdefghij"
str_detect(s, "def")
# TRUE
str_detect(t, "def")
# TRUE
str_count(s, "def")
# 1
str_count(t, "def")
# 2
# str_locate() reports the first match only, as a start/end matrix.
str_locate(s, "def")
#      start end
# [1,]     4   6
str_locate(t, "def")
#      start end
# [1,]     4   6
class(str_locate(t, "def"))
# "matrix" "array"   (R >= 4.0.0; older R versions print only "matrix")
# str_locate_all() reports every match, as a list of matrices.
str_locate_all(t, "def")
# [[1]]
#      start end
# [1,]     4   6
# [2,]    14  16
class(str_locate_all(t, "def"))
# "list"
# Replacing within a string
s <- "abcdefghij"
t <- str_c(s, s, sep = "") # "abcdefghijabcdefghij"
str_replace(s, "def", "DEF")
# "abcDEFghij"
str_replace(t, "def", "DEF") # first match only
# "abcDEFghijabcdefghij"
str_replace_all(t, "def", "DEF") # every match
# "abcDEFghijabcDEFghij"
# Upper / lower case conversion
s <- "abcDEFghij"
str_to_upper(s) # everything to upper case
# "ABCDEFGHIJ"
str_to_lower(s) # everything to lower case
# "abcdefghij"
str_to_title(s) # title case: first letter of every word upper, rest lower
# "Abcdefghij"
str_to_sentence(s) # sentence case: only the first letter of the string upper
# "Abcdefghij"
# The two functions differ only when the string holds several words.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"
# Swap upper and lower case: split the string into single characters, flip
# each one, and glue the pieces back together. seq_len() (instead of 1:n)
# keeps this correct when the string is empty.
pos <- seq_len(str_length(s))
chars <- str_sub(s, pos, pos)
is_lower <- chars == str_to_lower(chars)
is_upper <- chars == str_to_upper(chars)
swapped <- chars
swapped[is_lower] <- str_to_upper(chars[is_lower])
# Only characters that are not already lower case are lowered, so a character
# that matches neither test is left untouched.
to_lower <- !is_lower & is_upper
swapped[to_lower] <- str_to_lower(chars[to_lower])
t <- str_c(swapped, collapse = "")
t # upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s) # is the whole string upper case?
# FALSE
s == str_to_lower(s) # is the whole string lower case?
# FALSE
# Reversing a string
s <- "abcdefghij"
# Negative positions index from the end, so -1, -2, ... extracts the
# characters last-to-first in one vectorised str_sub() call; collapse = ""
# joins them into a single string. seq_len() avoids the 1:0 trap when the
# string is empty.
pos <- -seq_len(str_length(s))
t <- str_c(str_sub(s, pos, pos), collapse = "")
t
# "jihgfedcba"
# Repeating a string
str_dup("A", 3)
# "AAA"
str_dup("def", 3)
# "defdefdef"
# Runs of spaces
# Three spaces framed by hyphens so the width is visible.
str_c("-", str_dup(" ", 3), "-")
# "-   -"
# Test string: 2, 3, 4 and 5 spaces placed around the letters d, e and f.
s <- str_c(str_dup(" ", 2), "d",
           str_dup(" ", 3), "e",
           str_dup(" ", 4), "f",
           str_dup(" ", 5))
str_c("-", s, "-")
# "-  d   e    f     -"
# Trimming leading / trailing spaces (spaces between the letters are kept)
str_trim(s, side = "left")
# "d   e    f     "
str_trim(s, side = "right")
# "  d   e    f"
str_trim(s, side = "both")
# "d   e    f"
# Character vectors
# Three overlapping ten-letter strings gathered into one character vector.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
# Each call below is applied to every element of the vector in turn.
str_c(ss, "_1", sep = "")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"
str_length(string = ss)
# [1] 10 10 10
str_sub(ss, start = 1, end = 2)
# [1] "ab" "cd" "ef"
str_sub(ss, start = -2, end = -1)
# [1] "ij" "kl" "mn"
str_sub(ss, start = 2, end = 3)
# [1] "bc" "de" "fg"
str_detect(ss, pattern = "def")
# [1]  TRUE  TRUE FALSE
str_count(ss, pattern = "def")
# [1] 1 1 0
# One row per element; an element without a match yields NA NA.
str_locate(ss, pattern = "def")
#      start end
# [1,]     4   6
# [2,]     2   4
# [3,]    NA  NA
# One matrix per element; an element without a match yields a zero-row matrix.
str_locate_all(ss, pattern = "def")
# [[1]]
#      start end
# [1,]     4   6
#
# [[2]]
#      start end
# [1,]     2   4
#
# [[3]]
#      start end
#
str_replace(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_to_upper(string = ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(string = ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
# Element-wise "is it all upper / all lower case?" checks.
ss == str_to_upper(ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(ss)
# [1] TRUE TRUE TRUE
str_dup(ss, times = 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"
# Pad every element so that the trimming functions have something to remove.
tt <- str_c(" ", ss, " _1 ", sep = "")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
str_trim(tt, side = "both")
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side = "left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side = "right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"