はじめに
機械学習の勉強を始めたVBAユーザです。
備忘録としてPython・Rの文法をVBAと比較しながらまとめていきたいと思います。
目次
文字列操作
以前の記事では、PythonとRの文字列操作についてVBAとの比較をしましたが、その続きで、Rの文字列操作のパッケージ`stringr`を使ってみます。
参考記事に「R標準の`base`パッケージが提供する関数でも文字列処理は可能だが、`stringr`のほうが統一的なインターフェイスに合理的な挙動で使いやすい。」とありますが、実際に使ってみると、確かに、関数のネーミングと引数の順序に統一性があって、感動的に使いやすいです。
参考:stringr — Rの文字列をまともな方法で処理する
文字列の結合
R(stringr)
# Load the stringr package used by the snippets in this article.
library(stringr)
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
# str_c() joins its arguments end to end; no separator is given here.
str_c(s1, s2, s3)
# "abcdefghij"
文字列の長さ
R(stringr)
# str_length() returns the number of characters in the string.
s <- "abcdefghij"
str_length(s)
# 10
文字列の取り出し
R(stringr)
# str_sub(string, start, end) extracts the characters from position `start`
# to position `end`, both inclusive and 1-based. Negative positions count
# back from the end of the string (-1 is the last character).
s <- "abcdefghij"
str_sub(s, 1, 2)
# "ab"   (first two characters)
str_sub(s, -2, -1)
# "ij"   (last two characters)
str_sub(s, 4, 6)
# "def"  (characters 4 through 6)
文字列の検索
R(stringr)
# s contains "def" once; t is s repeated twice, so it contains "def" twice.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
# str_detect(): does the pattern occur at all?
str_detect(s, "def")
# TRUE
str_detect(t, "def")
# TRUE
# str_count(): how many times does the pattern occur?
str_count(s, "def")
# 1
str_count(t, "def")
# 2
# str_locate(): start/end position of the FIRST match only, as a matrix row.
str_locate(s, "def")
# start end
# [1,] 4 6
str_locate(t, "def")
# start end
# [1,] 4 6
class(str_locate(t, "def"))
# "matrix"  (on R >= 4.0.0 this prints "matrix" "array")
# str_locate_all(): positions of EVERY match, returned as a list holding one
# start/end matrix per input string.
str_locate_all(t, "def")
# [[1]]
# start end
# [1,] 4 6
# [2,] 14 16
class(str_locate_all(t, "def"))
# "list"
文字列の置換
R(stringr)
# str_replace() rewrites only the first match; str_replace_all() rewrites
# every match. The difference is visible on t, which contains "def" twice.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
str_replace(s, "def", "DEF")
# "abcDEFghij"
str_replace(t, "def", "DEF")
# "abcDEFghijabcdefghij"
str_replace_all(t, "def", "DEF")
# "abcDEFghijabcDEFghij"
文字列の変換
大文字と小文字の変換
R(stringr)
# Case conversion with stringr, demonstrated on a mixed-case string.
s <- "abcDEFghij"
str_to_upper(s) # every letter to upper case
# "ABCDEFGHIJ"
str_to_lower(s) # every letter to lower case
# "abcdefghij"
str_to_title(s) # first letter of each word upper case, the rest lower case
# "Abcdefghij"
str_to_sentence(s) # only the first letter of the string upper case
# "Abcdefghij"
# The two functions differ once the string has more than one word.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"

# Swap the case of each character. stringr has no swap-case function, so walk
# the string one character at a time and flip it.
# seq_len() is used instead of 1:n so that an empty string runs zero
# iterations rather than iterating over c(1, 0).
t <- ""
for (i in seq_len(str_length(s))) {
  stemp <- str_sub(s, i, i)
  if (stemp == str_to_lower(stemp)) {
    # Already lower case (or caseless): convert to upper case.
    stemp <- str_to_upper(stemp)
  } else if (stemp == str_to_upper(stemp)) {
    stemp <- str_to_lower(stemp)
  }
  t <- str_c(t, stemp)
}
t # s with upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s) # TRUE only if s is unchanged by upper-casing
# FALSE
s == str_to_lower(s) # TRUE only if s is unchanged by lower-casing
# FALSE
全角と半角の変換
R(stringr)
文字列の反転
R(stringr)
# Reverse a string. No stringr reverse function is used here, so the result
# is built by appending characters taken from the end of s: position -i is
# the i-th character counted from the right.
# seq_len() is used instead of 1:n so that an empty string runs zero
# iterations rather than iterating over c(1, 0).
s <- "abcdefghij"
t <- ""
for (i in seq_len(str_length(s))) {
  t <- str_c(t, str_sub(s, -i, -i))
}
t
# "jihgfedcba"
文字列の繰り返し
R(stringr)
# str_dup(string, times) repeats the string `times` times; it works for
# single characters and longer strings alike.
str_dup("A", 3)
# "AAA"
str_dup("def", 3)
# "defdefdef"
スペース
スペースの文字列
R(stringr)
# Runs of spaces are built with str_dup(" ", n). The surrounding hyphens are
# only there to make leading/trailing spaces visible in the printed result.
str_c("-", str_dup(" ", 3), "-")
# "-   -"
# s: 2 spaces, "d", 3 spaces, "e", 4 spaces, "f", 5 spaces.
s <- str_c(str_dup(" ", 2), "d",
str_dup(" ", 3), "e",
str_dup(" ", 4), "f",
str_dup(" ", 5))
str_c("-", s, "-")
# "-  d   e    f     -"
前後の不要なスペースの削除
R(stringr)
# s is the padded string "  d   e    f     " built in the preceding snippet.
# str_trim() removes whitespace from the chosen side(s) only; the spaces
# between the letters are left untouched.
str_trim(s, side="left")
# "d   e    f     "
str_trim(s, side="right")
# "  d   e    f"
str_trim(s, side="both")
# "d   e    f"
文字列ベクトルについて
`stringr`パッケージの関数は、文字列(1個の文字列)だけでなく、文字列のベクトルやデータフレームの列でも使えます。
例えば、3個の文字列からなる文字列ベクトルに`str_length`関数を使うと、その各要素の文字列に対してそれぞれ`str_length`関数を使った結果の数値3個からなるベクトルが返ります。
R(stringr)
# The same stringr functions applied to a character vector: each call below
# returns one result per element of ss.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
# The single string "_1" is appended to every element.
str_c(ss, "_1")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"
str_length(ss)
# [1] 10 10 10
str_sub(ss, 1, 2)
# [1] "ab" "cd" "ef"
str_sub(ss, -2, -1)
# [1] "ij" "kl" "mn"
str_sub(ss, 2, 3)
# [1] "bc" "de" "fg"
# Only the first two elements contain "def".
str_detect(ss, "def")
# [1] TRUE TRUE FALSE
str_count(ss, "def")
# [1] 1 1 0
# str_locate() gives one matrix row per element; no match yields NA NA.
str_locate(ss, "def")
# start end
# [1,] 4 6
# [2,] 2 4
# [3,] NA NA
# str_locate_all() gives one matrix per element; no match yields an empty
# matrix (header only).
str_locate_all(ss, "def")
# [[1]]
# start end
# [1,] 4 6
#
# [[2]]
# start end
# [1,] 2 4
#
# [[3]]
# start end
#
# Each element contains "def" at most once, so str_replace() and
# str_replace_all() give the same result here.
str_replace(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_to_upper(ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
# Element-wise comparison: is each element already all upper / all lower case?
ss == str_to_upper(ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(ss)
# [1] TRUE TRUE TRUE
str_dup(ss, 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"
# Pad every element with spaces, then trim each side in turn.
tt <- str_c(" ", ss, " _1 ")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
str_trim(tt)
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side="left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side="right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"
ベクトルやデータフレームについては、また別の記事でまとめたいと思います。
まとめ
一覧
各言語で使用する文字列操作関数等を一覧にまとめます。比較のために、EXCELでの計算も示しました。
s1 = "abc"
s2 = "def"
s3 = "ghij"
s = "abcdefghij"
t = "abcdefghijabcdefghij"
u = "abcDEFghij"
v = "abcDEFghij"
w = "  d   e    f     "
とします。また、EXCELのセルにそれぞれ
A1セル:="abc"
A2セル:="def"
A3セル:="ghij"
A4セル:="abcdefghij"
A5セル:="abcdefghijabcdefghij"
A6セル:="abcDEFghij"
A7セル:="abcDEFghij"
A8セル:="  d   e    f     "
が入力されているものとします。
文字列の基本的操作
Python | R | R(stringr) | VBA | EXCEL | 結果 | |
---|---|---|---|---|---|---|
結合 | s1 + s2 + s3 | paste0(s1, s2, s3) paste(s1, s2, s3, sep="") |
str_c(s1, s2, s3) | s1 & s2 & s3 | =A1&A2&A3 =CONCATENATE( A1,A2,A3) |
abcdefghij |
長さ | len(s) | nchar(s) | str_length(s) | Len(s) | =LEN(A4) | 10 |
反転 | s[::-1] | StrReverse(s) | jihgfedcba | |||
繰り返し | 'A' * 3 | str_dup("A", 3) | String(3, "A") | =REPT("A",3) | AAA | |
繰り返し | 'def' * 3 | str_dup("def", 3) | =REPT("def",3) | defdefdef |
文字列の取り出し
Python | R | R(stringr) | VBA | EXCEL | 結果 | |
---|---|---|---|---|---|---|
左から | s[0:2] s[:2] |
substr(s, 1, 2) substring(s, 1, 2) |
str_sub(s, 1, 2) | Left(s, 2) | =LEFT(A4,2) | ab |
右から | s[len(s)-2:len(s)] s[-2:] |
substr(s, nchar(s)-2+1, nchar(s)) |
str_sub(s, -2, -1) | Right(s, 2) | =RIGHT(A4,2) | ij |
途中 | s[3:6] | substr(s, 4, 6) | str_sub(s, 4, 6) | Mid(s, 4, 3) | =MID(A4,4,3) | def |
注意)「途中」の文字列の取り出しについて、Python, Rの関数では取り出す文字列を「どこからどこまで」と指定しますが、VBA, EXCELの関数では「どこから何文字分」と指定します。
文字列の検索
Python | R | R(stringr) | VBA | EXCEL | 結果 | |
---|---|---|---|---|---|---|
検索 | s.find('def') | str_locate(s, "def") | InStr(1, s, "def") | =FIND("def",A4,1) =SEARCH("def",A4,1) |
3,4 | |
後ろからの検索 | t.rfind('def') | InStrRev(t, "def") | 13,14 | |||
カウント | t.count('def') | str_count(t, "def") | 2 |
注意)`str_detect`, `str_locate`関数については、上記参照。
文字列の置換
Python | R | R(stringr) | VBA | EXCEL | 結果 | |
---|---|---|---|---|---|---|
置換 | s.replace('def', 'DEF') | sub("def", "DEF", s) | str_replace(s, "def", "DEF") | Replace(s, "def", "DEF") | =SUBSTITUTE( A4,"def","DEF") =REPLACE(A4, FIND("def",A4), LEN("def"),"DEF") |
abcDEFghij |
最初の1つだけ置換 | sub("def", "DEF", t) | str_replace(t, "def", "DEF") | abcDEFghij abcdefghij |
|||
すべて置換 | t.replace('def', 'DEF') | gsub("def", "DEF", t) | str_replace_all(t, "def", "DEF") | Replace(t, "def", "DEF") | =SUBSTITUTE( A5,"def","DEF") |
abcDEFghij abcDEFghij |
文字列の変換
Python | R | R(stringr) | VBA | EXCEL | 結果 | |
---|---|---|---|---|---|---|
大文字に | u.upper() | toupper(u) | str_to_upper(u) | UCase(u) | =UPPER(A6) | ABCDEFGHIJ |
小文字に | u.lower() | tolower(u) | str_to_lower(u) | LCase(u) | =LOWER(A6) | abcdefghij |
先頭のみ大文字・それ以外は小文字に | u.capitalize() | str_to_title(u) str_to_sentence(u) |
StrConv(u, vbProperCase) | =PROPER(A6) | Abcdefghij | |
大文字と小文字を入れ替え | u.swapcase() | chartr("A-Za-z", "a-zA-Z", u) | ABCdefGHIJ | |||
大文字かどうかの判定 | u.isupper() | u == toupper(u) | u == str_to_upper(u) | False | ||
小文字かどうかの判定 | u.islower() | u == tolower(u) | u == str_to_lower(u) | False | ||
全角に | chartr("A-Za-z", "A-Za-z", u) | StrConv(u, vbWide) | =JIS(A6) | abcDEFghij | ||
半角に | chartr("A-Za-z", "A-Za-z", v) | StrConv(v, vbNarrow) | =ASC(A7) | abcDEFghij |
文字列のスペース
Python | R | R(stringr) | VBA | EXCEL | 結果 | |
---|---|---|---|---|---|---|
スペース | ' ' * 3 | str_dup(" ", 3) | Space(3) | =REPT(" ",3) | "   " | |
両側スペース削除 | w.strip(' ') | str_trim(w, side="both") | Trim(w) | =TRIM(A8) | "d   e    f" | |
左スペース削除 | w.lstrip(' ') | str_trim(w, side="left") | LTrim(w) | "d   e    f     " | ||
右スペース削除 | w.rstrip(' ') | str_trim(w, side="right") | RTrim(w) | "  d   e    f" |
注意)EXCELのTRIM関数は文字列の中のスペースも1つを除いて削除されてd e f
となります。
プログラム全体
参考までに使ったプログラムの全体を示します。
Python, VBAのコードは前回の記事参照。
R(stringr)
# Load the stringr package used by the whole script.
library(stringr)
# String concatenation: str_c() joins its arguments end to end.
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
str_c(s1, s2, s3)
# "abcdefghij"
# String length: str_length() returns the number of characters.
s <- "abcdefghij"
str_length(s)
# 10
# Substring extraction: str_sub(string, start, end) with inclusive, 1-based
# positions; negative positions count back from the end of the string.
s <- "abcdefghij"
str_sub(s, 1, 2)
# "ab"
str_sub(s, -2, -1)
# "ij"
str_sub(s, 4, 6)
# "def"
# String search. s contains "def" once; t is s repeated twice.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
# str_detect(): does the pattern occur at all?
str_detect(s, "def")
# TRUE
str_detect(t, "def")
# TRUE
# str_count(): how many times does the pattern occur?
str_count(s, "def")
# 1
str_count(t, "def")
# 2
# str_locate(): start/end position of the first match only, as a matrix row.
str_locate(s, "def")
# start end
# [1,] 4 6
str_locate(t, "def")
# start end
# [1,] 4 6
class(str_locate(t, "def"))
# "matrix"  (on R >= 4.0.0 this prints "matrix" "array")
# str_locate_all(): positions of every match, as a list of matrices.
str_locate_all(t, "def")
# [[1]]
# start end
# [1,] 4 6
# [2,] 14 16
class(str_locate_all(t, "def"))
# "list"
# String replacement: str_replace() rewrites only the first match,
# str_replace_all() rewrites every match.
s <- "abcdefghij"
t <- str_c(s, s, sep="") # "abcdefghijabcdefghij"
str_replace(s, "def", "DEF")
# "abcDEFghij"
str_replace(t, "def", "DEF")
# "abcDEFghijabcdefghij"
str_replace_all(t, "def", "DEF")
# "abcDEFghijabcDEFghij"
# Upper/lower case conversion, demonstrated on a mixed-case string.
s <- "abcDEFghij"
str_to_upper(s) # every letter to upper case
# "ABCDEFGHIJ"
str_to_lower(s) # every letter to lower case
# "abcdefghij"
str_to_title(s) # first letter of each word upper case, the rest lower case
# "Abcdefghij"
str_to_sentence(s) # only the first letter of the string upper case
# "Abcdefghij"
# The two functions differ once the string has more than one word.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"

# Swap the case of each character. stringr has no swap-case function, so walk
# the string one character at a time and flip it.
# seq_len() is used instead of 1:n so that an empty string runs zero
# iterations rather than iterating over c(1, 0).
t <- ""
for (i in seq_len(str_length(s))) {
  stemp <- str_sub(s, i, i)
  if (stemp == str_to_lower(stemp)) {
    # Already lower case (or caseless): convert to upper case.
    stemp <- str_to_upper(stemp)
  } else if (stemp == str_to_upper(stemp)) {
    stemp <- str_to_lower(stemp)
  }
  t <- str_c(t, stemp)
}
t # s with upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s) # TRUE only if s is unchanged by upper-casing
# FALSE
s == str_to_lower(s) # TRUE only if s is unchanged by lower-casing
# FALSE
# String reversal. The result is built by appending characters taken from the
# end of s: position -i is the i-th character counted from the right.
# seq_len() is used instead of 1:n so that an empty string runs zero
# iterations rather than iterating over c(1, 0).
s <- "abcdefghij"
t <- ""
for (i in seq_len(str_length(s))) {
  t <- str_c(t, str_sub(s, -i, -i))
}
t
# "jihgfedcba"
# String repetition: str_dup(string, times).
str_dup("A", 3)
# "AAA"
str_dup("def", 3)
# "defdefdef"
# Runs of spaces are built with str_dup(" ", n). The surrounding hyphens are
# only there to make leading/trailing spaces visible in the printed result.
str_c("-", str_dup(" ", 3), "-")
# "-   -"
# s: 2 spaces, "d", 3 spaces, "e", 4 spaces, "f", 5 spaces.
s <- str_c(str_dup(" ", 2), "d",
str_dup(" ", 3), "e",
str_dup(" ", 4), "f",
str_dup(" ", 5))
str_c("-", s, "-")
# "-  d   e    f     -"
# Trimming leading/trailing spaces: str_trim() removes whitespace from the
# chosen side(s) only; the spaces between the letters are left untouched.
str_trim(s, side="left")
# "d   e    f     "
str_trim(s, side="right")
# "  d   e    f"
str_trim(s, side="both")
# "d   e    f"
# Character vectors: each stringr call below returns one result per element
# of ss.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
# The single string "_1" is appended to every element.
str_c(ss, "_1")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"
str_length(ss)
# [1] 10 10 10
str_sub(ss, 1, 2)
# [1] "ab" "cd" "ef"
str_sub(ss, -2, -1)
# [1] "ij" "kl" "mn"
str_sub(ss, 2, 3)
# [1] "bc" "de" "fg"
# Only the first two elements contain "def".
str_detect(ss, "def")
# [1] TRUE TRUE FALSE
str_count(ss, "def")
# [1] 1 1 0
# str_locate() gives one matrix row per element; no match yields NA NA.
str_locate(ss, "def")
# start end
# [1,] 4 6
# [2,] 2 4
# [3,] NA NA
# str_locate_all() gives one matrix per element; no match yields an empty
# matrix (header only).
str_locate_all(ss, "def")
# [[1]]
# start end
# [1,] 4 6
#
# [[2]]
# start end
# [1,] 2 4
#
# [[3]]
# start end
#
# Each element contains "def" at most once, so str_replace() and
# str_replace_all() give the same result here.
str_replace(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, "def", "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_to_upper(ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
# Element-wise comparison: is each element already all upper / all lower case?
ss == str_to_upper(ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(ss)
# [1] TRUE TRUE TRUE
str_dup(ss, 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"
# Pad every element with spaces, then trim each side in turn.
tt <- str_c(" ", ss, " _1 ")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
str_trim(tt)
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side="left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side="right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"