LoginSignup
1
3

More than 3 years have passed since last update.

VBAユーザがPython・Rを使ってみた:文字列操作(続)

Last updated at Posted at 2021-01-18

はじめに

機械学習の勉強を始めたVBAユーザです。
備忘録としてPython・Rの文法をVBAと比較しながらまとめていきたいと思います。

目次

文字列操作

以前の記事では、PythonとRの文字列操作についてVBAとの比較をしましたが、その続きで、Rの文字列操作のパッケージstringrを使ってみます。

参考記事に「R標準のbaseパッケージが提供する関数でも文字列処理は可能だが、 stringrのほうが統一的なインターフェイスに合理的な挙動で使いやすい。」とありますが、実際に使ってみると、確かに、関数のネーミングと引数の順序に統一性があって、感動的に使いやすいです。
参考:stringr — Rの文字列をまともな方法で処理する

文字列の結合

R(stringr)

R
library(stringr)

# Join three strings end to end; the separator is spelled out as empty.
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
str_c(s1, s2, s3, sep = "")
# "abcdefghij"

文字列の長さ

R(stringr)

R
# Number of characters in a single string.
s <- "abcdefghij"
str_length(string = s)
# 10

文字列の取り出し

R(stringr)

R
# Extract substrings by start/end position (both ends inclusive).
s <- "abcdefghij"
str_sub(s, start = 1, end = 2)    # first two characters
# "ab"
str_sub(s, start = -2, end = -1)  # negative positions count from the end
# "ij"
str_sub(s, start = 4, end = 6)    # characters 4 through 6
# "def"

文字列の検索

R(stringr)

R
# Search for "def" in a string (s) and in the same string repeated twice (t).
s <- "abcdefghij"
t <- str_dup(s, times = 2) # "abcdefghijabcdefghij"

# Is the pattern present?
str_detect(s, pattern = "def")
# TRUE
str_detect(t, pattern = "def")
# TRUE

# How many times does the pattern occur?
str_count(s, pattern = "def")
# 1
str_count(t, pattern = "def")
# 2

# Where is the first match? Only one row is returned even when the
# pattern occurs twice (t).
str_locate(s, pattern = "def")
#      start end
# [1,]     4   6
str_locate(t, pattern = "def")
#      start end
# [1,]     4   6
class(str_locate(t, pattern = "def"))
# "matrix"   (R >= 4.0.0 prints "matrix" "array")

# Where are all the matches? One matrix per input string, wrapped in a list.
str_locate_all(t, pattern = "def")
# [[1]]
#      start end
# [1,]     4   6
# [2,]    14  16
class(str_locate_all(t, pattern = "def"))
# "list"

文字列の置換

R(stringr)

R
# Replace "def" with "DEF" in a string (s) and in its doubled copy (t).
s <- "abcdefghij"
t <- str_dup(s, times = 2) # "abcdefghijabcdefghij"
str_replace(s, pattern = "def", replacement = "DEF")
# "abcDEFghij"
# str_replace() changes the first match only ...
str_replace(t, pattern = "def", replacement = "DEF")
# "abcDEFghijabcdefghij"
# ... while str_replace_all() changes every match.
str_replace_all(t, pattern = "def", replacement = "DEF")
# "abcDEFghijabcDEFghij"

文字列の変換

大文字と小文字の変換

R(stringr)

R
# Case conversion of a single mixed-case string.
s <- "abcDEFghij"
str_to_upper(s)    # every letter to upper case
# "ABCDEFGHIJ"
str_to_lower(s)    # every letter to lower case
# "abcdefghij"
str_to_title(s)    # first letter of each word upper case, the rest lower case
# "Abcdefghij"
str_to_sentence(s) # first letter of the sentence upper case, the rest lower
# "Abcdefghij"

# With several words the two differ: title case capitalises every word,
# sentence case only the first one.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"

# Swap upper and lower case. stringr has no dedicated function for this, so
# split the string into single characters (str_sub is vectorised over
# start/end), flip each one, and glue the pieces back together.
# seq_len() is used instead of 1:n so a zero-length string yields no indices.
idx <- seq_len(str_length(s))
chars <- str_sub(s, idx, idx)
# Both masks are computed before any character is modified.
is_lower <- chars == str_to_lower(chars)
is_upper <- !is_lower & chars == str_to_upper(chars)
chars[is_lower] <- str_to_upper(chars[is_lower])
chars[is_upper] <- str_to_lower(chars[is_upper])
t <- str_c(chars, collapse = "")
t                     # upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s)  # is the whole string upper case?
# FALSE
s == str_to_lower(s)  # is the whole string lower case?
# FALSE

全角と半角の変換

R(stringr)

R

文字列の反転

R(stringr)

R
# Reverse a string. Negative positions in str_sub() count from the end, so
# positions -1, -2, ..., -n pick the characters last to first; collapse then
# joins them into one string. seq_len() avoids the 1:0 pitfall of 1:n.
s <- "abcdefghij"
idx <- seq_len(str_length(s))
t <- str_c(str_sub(s, -idx, -idx), collapse = "")
t
# "jihgfedcba"

文字列の繰り返し

R(stringr)

R
# Repeat a string a given number of times.
str_dup("A", times = 3)
# "AAA"
str_dup("def", times = 3)
# "defdefdef"

スペース

スペースの文字列

R(stringr)

R
# A run of spaces is just " " repeated.
str_c("-", str_dup(" ", times = 3), "-", sep = "")
# "-   -"

# Test string: 2, 3, 4 and 5 spaces placed before "d", "e", "f" and the end.
# str_dup() is vectorised over `times`; the pieces are paired with the
# letters element by element and collapsed into one string.
s <- str_c(str_dup(" ", times = 2:5), c("d", "e", "f", ""), collapse = "")
str_c("-", s, "-", sep = "")
# "-  d   e    f     -"

前後の不要なスペースの削除

R(stringr)

R
# Strip surrounding spaces from s (the "  d   e    f     " string built in
# the previous snippet). Spaces inside the string are left alone.
str_trim(string = s, side = "left")   # leading spaces only
# "d   e    f     "
str_trim(string = s, side = "right")  # trailing spaces only
# "  d   e    f"
str_trim(string = s, side = "both")   # both ends
# "d   e    f"

文字列ベクトルについて

stringrパッケージの関数は、文字列(1個の文字列)だけでなく、文字列のベクトルやデータフレームでも使えます。
例えば、3個の文字列からなる文字列ベクトルにstr_length関数を使うと、その各要素の文字列に対してそれぞれstr_length関数を使った結果の数値3個からなるベクトルが返ります。

R(stringr)

R
# The same functions applied to a character vector: each call works element
# by element and returns one result per input string.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"

# Concatenation: the single string "_1" is appended to every element.
str_c(ss, "_1", sep = "")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"

# Length of each element.
str_length(string = ss)
# [1] 10 10 10

# Extraction by position.
str_sub(ss, start = 1, end = 2)
# [1] "ab" "cd" "ef"
str_sub(ss, start = -2, end = -1)
# [1] "ij" "kl" "mn"
str_sub(ss, start = 2, end = 3)
# [1] "bc" "de" "fg"

# Searching: the third element does not contain "def".
str_detect(ss, pattern = "def")
# [1]  TRUE  TRUE FALSE
str_count(ss, pattern = "def")
# [1] 1 1 0
# str_locate() gives one row per element, NA where there is no match.
str_locate(ss, pattern = "def")
#      start end
# [1,]     4   6
# [2,]     2   4
# [3,]    NA  NA
# str_locate_all() gives one matrix per element, empty where there is no match.
str_locate_all(ss, pattern = "def")
# [[1]]
#      start end
# [1,]     4   6
#
# [[2]]
#      start end
# [1,]     2   4
#
# [[3]]
#      start end
#

# Replacement: elements without a match come back unchanged.
str_replace(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"

# Case conversion.
str_to_upper(string = ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(string = ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"

# Element-wise "all upper case?" / "all lower case?" checks.
ss == str_to_upper(string = ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(string = ss)
# [1] TRUE TRUE TRUE

# Repetition.
str_dup(ss, times = 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"

# Trimming: pad every element with spaces first, then strip them again.
tt <- str_c(" ", ss, " _1 ", sep = "")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
str_trim(tt, side = "both")
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side = "left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side = "right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"

ベクトルやデータフレームについては、また別の記事でまとめたいと思います。

まとめ

一覧

各言語で使用する文字列操作関数等を一覧にまとめます。比較のために、EXCELでの計算も示しました。
s1 = "abc"
s2 = "def"
s3 = "ghij"
s = "abcdefghij"
t = "abcdefghijabcdefghij"
u = "abcDEFghij"
v = "abcDEFghij"
w = " d e f "
とします。また、EXCELのセルにそれぞれ
A1セル:="abc"
A2セル:="def"
A3セル:="ghij"
A4セル:="abcdefghij"
A5セル:="abcdefghijabcdefghij"
A6セル:="abcDEFghij"
A7セル:="abcDEFghij"
A8セル:=" d e f "
が入力されているものとします。

文字列の基本的操作

Python R R(stringr) VBA EXCEL 結果
結合 s1 + s2 + s3 paste0(s1, s2, s3)
paste(s1, s2, s3, sep="")
str_c(s1, s2, s3) s1 & s2 & s3 =A1&A2&A3
=CONCATENATE(
A1,A2,A3)
abcdefghij
長さ len(s) nchar(s) str_length(s) Len(s) =LEN(A4) 10
反転 s[::-1] StrReverse(s) jihgfedcba
繰り返し 'A' * 3 str_dup("A", 3) String(3, "A") =REPT("A",3) AAA
繰り返し 'def' * 3 str_dup("def", 3) =REPT("def",3) defdefdef

文字列の取り出し

Python R R(stringr) VBA EXCEL 結果
左から s[8:10]
s[0:2]
s[:2]
substr(s, 1, 2)
substring(s, 1, 2)
str_sub(s, 1, 2) Left(s, 2) =LEFT(A4,2) ab
右から s[len(s)-2:len(s)]
s[-2:]
substr(s,
nchar(s)-2+1,
nchar(s))
str_sub(s, -2, -1) Right(s, 2) =RIGHT(A4,2) ij
途中 s[3:6] substr(s, 4, 6) str_sub(s, 4, 6) Mid(s, 4, 3) =MID(A4,4,3) def

注意)「途中」の文字列の取り出しについて、Python, Rの関数では取り出す文字列を「どこからどこまで」と指定しますが、VBA, EXCELの関数では「どこから何文字分」と指定します。

文字列の検索

Python R R(stringr) VBA EXCEL 結果
検索 s.find('def') str_locate(s, "def") InStr(1, s, "def") =FIND("def",A4,1)
=SEARCH("def",A4,1)
3,4
後ろからの検索 t.rfind('def') InStrRev(t, "def") 13,14
カウント t.count('def') str_count(t, "def") 2

注意)str_detect, str_locate関数については、上記参照。

文字列の置換

Python R R(stringr) VBA EXCEL 結果
置換 s.replace('def', 'DEF') sub("def", "DEF", s) str_replace(s, "def", "DEF") Replace(s, "def", "DEF") =SUBSTITUTE(
A4,"def","DEF")
=REPLACE(A4,
FIND("def",A4),
LEN("def"),"DEF")
abcDEFghij
最初の1つだけ置換 sub("def", "DEF", t) str_replace(t, "def", "DEF") abcDEFghij
abcdefghij
すべて置換 t.replace('def', 'DEF') gsub("def", "DEF", t) str_replace_all(t, "def", "DEF") Replace(t, "def", "DEF") =SUBSTITUTE(
A5,"def","DEF")
abcDEFghij
abcDEFghij

文字列の変換

Python R R(stringr) VBA EXCEL 結果
大文字に u.upper() toupper(u) str_to_upper(u) UCase(u) =UPPER(A6) ABCDEFGHIJ
小文字に u.lower() tolower(u) str_to_lower(u) LCase(u) =LOWER(A6) abcdefghij
先頭のみ大文字・それ以外は小文字に u.capitalize() str_to_title(u)
str_to_sentence(u)
StrConv(u, vbProperCase) =PROPER(A6) Abcdefghij
大文字と小文字を入れ替え u.swapcase() chartr("A-Za-z", "a-zA-z", u) ABCdefGHIJ
大文字かどうかの判定 u.isupper() u == toupper(u) u == str_to_upper(u) False
小文字かどうかの判定 u.islower() u == tolower(u) u == str_to_lower(u) False
全角に chartr("A-Za-z", "A-Za-z", u) StrConv(u, vbWide) =JIS(A6) abcDEFghij
半角に chartr("A-Za-z", "A-Za-z", v) StrConv(v, vbNarrow) =ASC(A7) abcDEFghij

文字列のスペース

Python R R(stringr) VBA EXCEL 結果
スペース ' ' * 3 str_dup(" ", 3) Space(3) =REPT(" ",3) " "
両側スペース削除 w.strip(' ') str_trim(s, side="both") Trim(w) =TRIM(A8) "d e f"
左スペース削除 w.lstrip(' ') str_trim(s, side="left") LTrim(w) "d e f "
右スペース削除 w.rstrip(' ') str_trim(s, side="right") RTrim(w) " d e f"

注意)EXCELのTRIM関数は文字列の中のスペースも1つを除いて削除されてd e fとなります。

プログラム全体

参考までに使ったプログラムの全体を示します。
Python, VBAのコードは前回の記事参照。

R(stringr)

R
library(stringr)

# String concatenation: join three strings with an explicit empty separator.
s1 <- "abc"
s2 <- "def"
s3 <- "ghij"
str_c(s1, s2, s3, sep = "")
# "abcdefghij"

# String length: number of characters.
s <- "abcdefghij"
str_length(string = s)
# 10

# Substring extraction by start/end position (both ends inclusive).
s <- "abcdefghij"
str_sub(s, start = 1, end = 2)    # first two characters
# "ab"
str_sub(s, start = -2, end = -1)  # negative positions count from the end
# "ij"
str_sub(s, start = 4, end = 6)    # characters 4 through 6
# "def"

# String search: look for "def" in s and in s repeated twice (t).
s <- "abcdefghij"
t <- str_dup(s, times = 2) # "abcdefghijabcdefghij"

# Is the pattern present?
str_detect(s, pattern = "def")
# TRUE
str_detect(t, pattern = "def")
# TRUE

# How many times does the pattern occur?
str_count(s, pattern = "def")
# 1
str_count(t, pattern = "def")
# 2

# Where is the first match? Only one row is returned even when the
# pattern occurs twice (t).
str_locate(s, pattern = "def")
#      start end
# [1,]     4   6
str_locate(t, pattern = "def")
#      start end
# [1,]     4   6
class(str_locate(t, pattern = "def"))
# "matrix"   (R >= 4.0.0 prints "matrix" "array")

# Where are all the matches? One matrix per input string, wrapped in a list.
str_locate_all(t, pattern = "def")
# [[1]]
#      start end
# [1,]     4   6
# [2,]    14  16
class(str_locate_all(t, pattern = "def"))
# "list"

# String replacement: "def" -> "DEF".
s <- "abcdefghij"
t <- str_dup(s, times = 2) # "abcdefghijabcdefghij"
str_replace(s, pattern = "def", replacement = "DEF")
# "abcDEFghij"
# str_replace() changes the first match only ...
str_replace(t, pattern = "def", replacement = "DEF")
# "abcDEFghijabcdefghij"
# ... while str_replace_all() changes every match.
str_replace_all(t, pattern = "def", replacement = "DEF")
# "abcDEFghijabcDEFghij"

# Upper/lower case conversion of a single mixed-case string.
s <- "abcDEFghij"
str_to_upper(s)    # every letter to upper case
# "ABCDEFGHIJ"
str_to_lower(s)    # every letter to lower case
# "abcdefghij"
str_to_title(s)    # first letter of each word upper case, the rest lower case
# "Abcdefghij"
str_to_sentence(s) # first letter of the sentence upper case, the rest lower
# "Abcdefghij"

# With several words the two differ: title case capitalises every word,
# sentence case only the first one.
ss <- "abc def ghij"
str_to_title(ss)
# "Abc Def Ghij"
str_to_sentence(ss)
# "Abc def ghij"

# Swap upper and lower case. stringr has no dedicated function for this, so
# split the string into single characters (str_sub is vectorised over
# start/end), flip each one, and glue the pieces back together.
# seq_len() is used instead of 1:n so a zero-length string yields no indices.
idx <- seq_len(str_length(s))
chars <- str_sub(s, idx, idx)
# Both masks are computed before any character is modified.
is_lower <- chars == str_to_lower(chars)
is_upper <- !is_lower & chars == str_to_upper(chars)
chars[is_lower] <- str_to_upper(chars[is_lower])
chars[is_upper] <- str_to_lower(chars[is_upper])
t <- str_c(chars, collapse = "")
t                     # upper and lower case swapped
# "ABCdefGHIJ"
s == str_to_upper(s)  # is the whole string upper case?
# FALSE
s == str_to_lower(s)  # is the whole string lower case?
# FALSE

# String reversal. Negative positions in str_sub() count from the end, so
# positions -1, -2, ..., -n pick the characters last to first; collapse then
# joins them into one string. seq_len() avoids the 1:0 pitfall of 1:n.
s <- "abcdefghij"
idx <- seq_len(str_length(s))
t <- str_c(str_sub(s, -idx, -idx), collapse = "")
t
# "jihgfedcba"

# String repetition.
str_dup("A", times = 3)
# "AAA"
str_dup("def", times = 3)
# "defdefdef"

# Strings of spaces: a run of spaces is just " " repeated.
str_c("-", str_dup(" ", times = 3), "-", sep = "")
# "-   -"

# Test string: 2, 3, 4 and 5 spaces placed before "d", "e", "f" and the end.
# str_dup() is vectorised over `times`; the pieces are paired with the
# letters element by element and collapsed into one string.
s <- str_c(str_dup(" ", times = 2:5), c("d", "e", "f", ""), collapse = "")
str_c("-", s, "-", sep = "")
# "-  d   e    f     -"

# Removing leading/trailing spaces; spaces inside the string are left alone.
str_trim(string = s, side = "left")   # leading spaces only
# "d   e    f     "
str_trim(string = s, side = "right")  # trailing spaces only
# "  d   e    f"
str_trim(string = s, side = "both")   # both ends
# "d   e    f"


# Character vectors: each stringr call works element by element and returns
# one result per input string.
s1 <- "abcdefghij"
s2 <- "cdefghijkl"
s3 <- "efghijklmn"
ss <- c(s1, s2, s3)
ss
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"

# Concatenation: the single string "_1" is appended to every element.
str_c(ss, "_1", sep = "")
# [1] "abcdefghij_1" "cdefghijkl_1" "efghijklmn_1"

# Length of each element.
str_length(string = ss)
# [1] 10 10 10

# Extraction by position.
str_sub(ss, start = 1, end = 2)
# [1] "ab" "cd" "ef"
str_sub(ss, start = -2, end = -1)
# [1] "ij" "kl" "mn"
str_sub(ss, start = 2, end = 3)
# [1] "bc" "de" "fg"

# Searching: the third element does not contain "def".
str_detect(ss, pattern = "def")
# [1]  TRUE  TRUE FALSE
str_count(ss, pattern = "def")
# [1] 1 1 0
# str_locate() gives one row per element, NA where there is no match.
str_locate(ss, pattern = "def")
#      start end
# [1,]     4   6
# [2,]     2   4
# [3,]    NA  NA
# str_locate_all() gives one matrix per element, empty where there is no match.
str_locate_all(ss, pattern = "def")
# [[1]]
#      start end
# [1,]     4   6
#
# [[2]]
#      start end
# [1,]     2   4
#
# [[3]]
#      start end
#

# Replacement: elements without a match come back unchanged.
str_replace(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"
str_replace_all(ss, pattern = "def", replacement = "DEF")
# [1] "abcDEFghij" "cDEFghijkl" "efghijklmn"

# Case conversion.
str_to_upper(string = ss)
# [1] "ABCDEFGHIJ" "CDEFGHIJKL" "EFGHIJKLMN"
str_to_lower(string = ss)
# [1] "abcdefghij" "cdefghijkl" "efghijklmn"
str_to_title(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"
str_to_sentence(string = ss)
# [1] "Abcdefghij" "Cdefghijkl" "Efghijklmn"

# Element-wise "all upper case?" / "all lower case?" checks.
ss == str_to_upper(string = ss)
# [1] FALSE FALSE FALSE
ss == str_to_lower(string = ss)
# [1] TRUE TRUE TRUE

# Repetition.
str_dup(ss, times = 2)
# [1] "abcdefghijabcdefghij" "cdefghijklcdefghijkl" "efghijklmnefghijklmn"

# Trimming: pad every element with spaces first, then strip them again.
tt <- str_c(" ", ss, " _1 ", sep = "")
tt
# [1] " abcdefghij _1 " " cdefghijkl _1 " " efghijklmn _1 "
str_trim(tt, side = "both")
# [1] "abcdefghij _1" "cdefghijkl _1" "efghijklmn _1"
str_trim(tt, side = "left")
# [1] "abcdefghij _1 " "cdefghijkl _1 " "efghijklmn _1 "
str_trim(tt, side = "right")
# [1] " abcdefghij _1" " cdefghijkl _1" " efghijklmn _1"

参考

1
3
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
1
3