はじめに
Rのデータフレームの列の操作について、サンプルデータを用いて具体的に練習してみました。
目次
- Rのデータフレームの列についての操作
- 参考
Rのデータフレームの列についての操作
Rのデータフレームの列の操作について、サンプルデータを用いて具体的に練習します。
なお、以下については別記事でまとめています。
参考:
https://r4ds.had.co.nz/transform.html
https://github.com/tidyverse/dplyr
https://heavywatal.github.io/rstats/dplyr.html
練習に用いるデータ
Rのサンプルデータiris
とanscombe
を用います。
行番号を表示した方が分かりやすいため、tibble::rowid_to_column()
で行番号を列名にしています。
(こちらの記事と同じデータを用います。)
library(dplyr)
library(tibble)
library(tidyr)
iris
# 行番号を列名に
iris_tbl <- iris %>% as_tibble() %>% rowid_to_column("id")
iris_tbl
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
anscombe
anscombe_tbl <- anscombe %>% as_tibble() %>% rowid_to_column("id")
anscombe_tbl
# # A tibble: 11 x 9
# id x1 x2 x3 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76
select():列の選択・並び替え
列を選択します。
# select()
iris_tbl %>%
select(c(1, 6, 2))
iris_tbl %>%
select(c(id, Species, Sepal.Length))
iris_tbl %>%
select(id, Species, Sepal.Length)
# # A tibble: 150 x 3
# id Species Sepal.Length
# <int> <fct> <dbl>
# 1 1 setosa 5.1
# 2 2 setosa 4.9
iris_tbl %>%
select(2:5)
iris_tbl %>%
select(Sepal.Length:Petal.Width)
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
anscombe_tbl %>%
select(2:5)
anscombe_tbl %>%
select(x1:y4)
# # A tibble: 11 x 4
# x1 x2 x3 x4
# <dbl> <dbl> <dbl> <dbl>
# 1 10 10 10 8
# 2 8 8 8 8
everything():すべての列
# everything()
iris_tbl %>%
select(everything())
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
anscombe_tbl %>%
select(everything())
# # A tibble: 11 x 9
# id x1 x2 x3 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76
last_col():最後の列
# last_col()
iris_tbl %>%
select(last_col()) # 最後の列
iris_tbl %>%
select(last_col(0)) # 最後の列
# # A tibble: 150 x 1
# Species
# <fct>
# 1 setosa
# 2 setosa
iris_tbl %>%
select(last_col(1)) # 最後の1つ前の列(後ろから2番目の列)
# # A tibble: 150 x 1
# Petal.Width
# <dbl>
# 1 0.2
# 2 0.2
iris_tbl %>%
select(last_col(2)) # 最後の2つ前の列(後ろから3番目の列)
# # A tibble: 150 x 1
# Petal.Length
# <dbl>
# 1 1.4
# 2 1.4
iris_tbl %>%
select(2:last_col(2)) # 2列目から最後の2つ前の列(後ろから3番目の列)まで
# # A tibble: 150 x 3
# Sepal.Length Sepal.Width Petal.Length
# <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4
# 2 4.9 3 1.4
iris_tbl %>%
select(!(2:last_col(2)))
# # A tibble: 150 x 3
# id Petal.Width Species
# <int> <dbl> <fct>
# 1 1 0.2 setosa
# 2 2 0.2 setosa
&, |:列の条件選択 and・or
# &, |
iris_tbl %>%
select(1:3 & 2:5)
# # A tibble: 150 x 2
# Sepal.Length Sepal.Width
# <dbl> <dbl>
# 1 5.1 3.5
# 2 4.9 3
iris_tbl %>%
select(1:3 | 2:5)
# # A tibble: 150 x 5
# id Sepal.Length Sepal.Width Petal.Length Petal.Width
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.1 3.5 1.4 0.2
# 2 2 4.9 3 1.4 0.2
iris_tbl %>%
select(starts_with("Sepal") & ends_with("Width"))
# # A tibble: 150 x 1
# Sepal.Width
# <dbl>
# 1 3.5
# 2 3
iris_tbl %>%
select(starts_with("Sepal") | ends_with("Width"))
# # A tibble: 150 x 3
# Sepal.Length Sepal.Width Petal.Width
# <dbl> <dbl> <dbl>
# 1 5.1 3.5 0.2
# 2 4.9 3 0.2
-, !:列の条件選択 not
# -, !
iris_tbl %>%
select(-id)
iris_tbl %>%
select(!id)
# # A tibble: 150 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3 1.4 0.2 setosa
iris_tbl %>%
select(!2:5)
iris_tbl %>%
select(-(2:5))
# # A tibble: 150 x 2
# id Species
# <int> <fct>
# 1 1 setosa
# 2 2 setosa
iris_tbl %>%
select(-c(1, 6, 2))
iris_tbl %>%
select(-id, -Species, -Sepal.Length)
# # A tibble: 150 x 3
# Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl>
# 1 3.5 1.4 0.2
# 2 3 1.4 0.2
iris_tbl %>%
select(-starts_with("Sepal"))
iris_tbl %>%
select(!starts_with("Sepal"))
# # A tibble: 150 x 4
# id Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <fct>
# 1 1 1.4 0.2 setosa
# 2 2 1.4 0.2 setosa
starts_with(), ends_with():指定した文字列で始まる・終わる列名
# starts_with(), ends_with()
iris_tbl %>%
select(id, starts_with("S")) # "S"で始まる列
# # A tibble: 150 x 4
# id Sepal.Length Sepal.Width Species
# <int> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 setosa
# 2 2 4.9 3 setosa
iris_tbl %>%
select(id, ends_with("th")) # "th"で終わる列
# # A tibble: 150 x 5
# id Sepal.Length Sepal.Width Petal.Length Petal.Width
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.1 3.5 1.4 0.2
# 2 2 4.9 3 1.4 0.2
contains():指定した文字列を含む列名
# contains()
iris_tbl %>%
select(contains("al")) # "al"を含む列
iris_tbl %>%
select(contains(".")) # "."を含む列
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
iris_tbl %>%
select(-contains("al"))
# # A tibble: 150 x 2
# id Species
# <int> <fct>
# 1 1 setosa
# 2 2 setosa
matches():指定した正規表現にマッチする列名
# matches()
iris_tbl %>%
select(matches("al")) # "al"を含む列名
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
iris_tbl %>%
select(matches(".")) # 「"."を含む列名」ではない
# contains()に指定するのは文字列だが、matches()に指定するのは正規表現
# 正規表現で"."は「何でも」の意味
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
iris_tbl %>%
select(matches("\\.")) # "."を含む列名
# 正規表現で"\\."は"."を含む文字列
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
iris_tbl %>%
select(matches("^..$")) # 2桁の列名
# # A tibble: 150 x 1
# id
# <int>
# 1 1
# 2 2
iris_tbl %>%
select(matches("^S......$")) # "S"で始まる7桁の列名
iris_tbl %>%
select(matches("^S.{6}$")) # "S"で始まる7桁の列名
# # A tibble: 150 x 1
# Species
# <fct>
# 1 setosa
# 2 setosa
iris_tbl %>%
select(matches("^S.+$")) # "S"で始まる2桁以上の列名
iris_tbl %>%
select(matches("^S.*$")) # "S"で始まる1桁以上の列名
# # A tibble: 150 x 3
# Sepal.Length Sepal.Width Species
# <dbl> <dbl> <fct>
# 1 5.1 3.5 setosa
# 2 4.9 3 setosa
iris_tbl %>%
select(matches("p")) # "p"を含む列名
# # A tibble: 150 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
# 1 5.1 3.5 1.4 0.2 setosa
# 2 4.9 3 1.4 0.2 setosa
iris_tbl %>%
select(matches("^p")) # "p"で始まる列名(1桁目が"p")
# # A tibble: 150 x 2
# Petal.Length Petal.Width
# <dbl> <dbl>
# 1 1.4 0.2
# 2 1.4 0.2
iris_tbl %>%
select(matches("^.p")) # 2桁目が"p"の列名
# # A tibble: 150 x 1
# Species
# <fct>
# 1 setosa
# 2 setosa
iris_tbl %>%
select(matches("\\W")) # 英数字以外を含む列名
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
iris_tbl %>%
select(matches("^\\w+$")) # 先頭から末尾まですべて英数字の列名(英数字以外を含まない列名)
iris_tbl %>%
select(-matches("\\W")) # 英数字以外を含まない列名
# # A tibble: 150 x 2
# id Species
# <int> <fct>
# 1 1 setosa
# 2 2 setosa
anscombe_tbl %>%
select(matches("^x|y")) # "x"か"y"で始まる列名
anscombe_tbl %>%
select(matches("[1-4]$")) # "1"-"4"で終わる列名
anscombe_tbl %>%
select(matches("^\\w\\d")) # 1桁目が英数字、2桁目が数字の列名
# # A tibble: 11 x 8
# x1 x2 x3 x4 y1 y2 y3 y4
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 8 8 8 8 6.95 8.14 6.77 5.76
anscombe_tbl %>%
rename(`x 1` = x1) %>% print() %>%
select(matches("\\s")) # 空白を含む列名
# # A tibble: 11 x 1
# `x 1`
# <dbl>
# 1 10
# 2 8
anscombe_tbl %>%
rename(`x 1` = x1) %>% print() %>%
select(matches("^\\S+$")) # 空白を含まない列名
anscombe_tbl %>%
rename(`x 1` = x1) %>% print() %>%
select(!matches("\\s")) # 空白を含まない列名
# # A tibble: 11 x 8
# id x2 x3 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 6.95 8.14 6.77 5.76
anscombe_tbl %>%
select(matches("^[a-z]{2}$")) # 英字小文字2桁の列名
# # A tibble: 11 x 1
# id
# <int>
# 1 1
# 2 2
iris_tbl %>%
select(matches("^[a-z|A-Z]{7}$")) # 英字7桁の列名
# # A tibble: 150 x 1
# Species
# <fct>
# 1 setosa
# 2 setosa
num_range():数字の連番の列名
# num_range()
anscombe_tbl %>%
select(num_range("x", 2:3))
# # A tibble: 11 x 2
# x2 x3
# <dbl> <dbl>
# 1 10 10
# 2 8 8
anscombe_tbl %>%
select(-num_range("x", 2:3))
# # A tibble: 11 x 7
# id x1 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 6.95 8.14 6.77 5.76
anscombe_tbl %>%
select(num_range("x", 3:10))
# # A tibble: 11 x 2
# x3 x4
# <dbl> <dbl>
# 1 10 8
# 2 8 8
anscombe_tbl %>%
select(-num_range("x", 3:10))
# # A tibble: 11 x 7
# id x1 x2 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 8.04 9.14 7.46 6.58
# 2 2 8 8 6.95 8.14 6.77 5.76
all_of(), any_of():文字列ベクトルによる列名の選択
# all_of(), any_of()
vars <- names(iris_tbl)
vars
# [1] "id" "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
vars <- vars[2:5]
vars
# [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
iris_tbl %>%
select(all_of(vars))
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
library(stringr)
vars <- str_c("x", 1:10)
vars
# [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
anscombe_tbl %>%
select(any_of(vars))
# # A tibble: 11 x 4
# x1 x2 x3 x4
# <dbl> <dbl> <dbl> <dbl>
# 1 10 10 10 8
# 2 8 8 8 8
vars <- str_c(letters, 1)
vars
# [1] "a1" "b1" "c1" "d1" "e1" "f1" "g1" "h1" "i1" "j1" "k1" "l1" "m1"
# [14] "n1" "o1" "p1" "q1" "r1" "s1" "t1" "u1" "v1" "w1" "x1" "y1" "z1"
anscombe_tbl %>%
select(any_of(vars))
# # A tibble: 11 x 2
# x1 y1
# <dbl> <dbl>
# 1 10 8.04
# 2 8 6.95
anscombe_tbl %>%
select(-any_of(vars))
# # A tibble: 11 x 7
# id x2 x3 x4 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 8 9.14 7.46 6.58
# 2 2 8 8 8 8.14 6.77 5.76
where():列の属性による列の選択
# where()
iris_tbl %>%
select(where(is.numeric))
# # A tibble: 150 x 5
# id Sepal.Length Sepal.Width Petal.Length Petal.Width
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.1 3.5 1.4 0.2
# 2 2 4.9 3 1.4 0.2
iris_tbl %>%
select(where(is.double))
# # A tibble: 150 x 4
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# <dbl> <dbl> <dbl> <dbl>
# 1 5.1 3.5 1.4 0.2
# 2 4.9 3 1.4 0.2
iris_tbl %>%
select(where(is.integer))
# # A tibble: 150 x 1
# id
# <int>
# 1 1
# 2 2
iris_tbl %>%
select(where(is.character))
# # A tibble: 150 x 0
iris_tbl %>%
select(where(is.factor))
# # A tibble: 150 x 1
# Species
# <fct>
# 1 setosa
# 2 setosa
参考:
rename():列名の変更
列名を変更できます。
# rename()
iris_tbl %>%
rename(ID = id)
# # A tibble: 150 x 6
# ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
iris_tbl %>%
rename(sl = Sepal.Length, sw = Sepal.Width,
pl = Petal.Length, pw = Petal.Width)
# # A tibble: 150 x 6
# id sl sw pl pw Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
var <- "Species" # 変更前の列名
var_2 <- toupper(var) # 変更後の列名
iris_tbl %>%
rename(!!var_2 := !!var)
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width SPECIES
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
vars <- names(iris_tbl) # 変更前の列名
vars_2 <- toupper(vars) # 変更後の列名
named_vars <- setNames(vars, vars_2) # 名前付きベクトル
iris_tbl %>%
rename(!!!named_vars)
# # A tibble: 150 x 6
# ID SEPAL.LENGTH SEPAL.WIDTH PETAL.LENGTH PETAL.WIDTH SPECIES
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
vars <- str_c("x", 1:4) # 変更前の列名
vars_2 <- toupper(vars) # 変更後の列名
named_vars <- setNames(vars, vars_2)
anscombe_tbl %>%
rename(!!!named_vars)
# # A tibble: 11 x 9
# id X1 X2 X3 X4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76
rename_with():複数列にわたる列名の一括変更
# rename_with()
iris_tbl %>%
rename_with(toupper)
iris_tbl %>%
rename_with(toupper, everything())
# # A tibble: 150 x 6
# ID SEPAL.LENGTH SEPAL.WIDTH PETAL.LENGTH PETAL.WIDTH SPECIES
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
anscombe_tbl %>%
rename_with(toupper, x1:x4)
anscombe_tbl %>%
rename_with(.fn = toupper, .cols = x1:x4)
anscombe_tbl %>%
rename_with(.fn = ~toupper(.), .cols = x1:x4)
# # A tibble: 11 x 9
# id X1 X2 X3 X4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76
複数列に一括適用する関数がf(x)
の形の関数なら、引数.fn
にf
か~f(.)
の形で指定します。後者の形を用いれば、複雑な関数や合成関数でも適用できます。
anscombe_tbl %>%
rename_with(~str_c("var_", .), x1:x4)
anscombe_tbl %>%
rename_with(.fn = ~str_c("var_", .), .cols = x1:x4)
# # A tibble: 11 x 9
# id var_x1 var_x2 var_x3 var_x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76
anscombe_tbl %>%
rename_with(~str_c("var_", toupper(.)), x1:x4)
anscombe_tbl %>%
rename_with(.fn = ~str_c("var_", toupper(.)), .cols = x1:x4)
# # A tibble: 11 x 9
# id var_X1 var_X2 var_X3 var_X4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76
iris_tbl %>%
rename_with(~str_to_lower(str_c(str_replace(., "\\.", "_"), "(cm)")), 2:5)
iris_tbl %>%
rename_with(~str_replace(., "\\.", "_"), 2:5) %>%
rename_with(~str_c(., "(cm)"), 2:5) %>%
rename_with(~str_to_lower(.), 2:5)
# # A tibble: 150 x 6
# id `sepal_length(cm)` `sepal_width(cm)` `petal_length(cm)` `petal_width(cm)` Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
vars <- names(iris_tbl)
vars <- vars[2:5]
vars
# [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
str_replace(vars, "\\.", "_") # "\\."は正規表現で"."の意味
# [1] "Sepal_Length" "Sepal_Width" "Petal_Length" "Petal_Width"
str_c(str_replace(vars, "\\.", "_"), "(cm)")
# [1] "Sepal_Length(cm)" "Sepal_Width(cm)" "Petal_Length(cm)" "Petal_Width(cm)"
str_to_lower(str_c(str_replace(vars, "\\.", "_"), "(cm)"))
# [1] "sepal_length(cm)" "sepal_width(cm)" "petal_length(cm)" "petal_width(cm)"
relocate():列の並び替え
# relocate()
iris_tbl %>%
relocate(Species) # Species列を最初に
# # A tibble: 150 x 6
# Species id Sepal.Length Sepal.Width Petal.Length Petal.Width
# <fct> <int> <dbl> <dbl> <dbl> <dbl>
# 1 setosa 1 5.1 3.5 1.4 0.2
# 2 setosa 2 4.9 3 1.4 0.2
iris_tbl %>%
relocate(starts_with("Sepal")) # Petalで始まる列を最初に
# A tibble: 150 x 6
# Sepal.Length Sepal.Width id Petal.Length Petal.Width Species
# <dbl> <dbl> <int> <dbl> <dbl> <fct>
# 1 5.1 3.5 1 1.4 0.2 setosa
# 2 4.9 3 2 1.4 0.2 setosa
iris_tbl %>%
relocate(id, .after = Species) # id列をSpecies列の後に
iris_tbl %>%
relocate(id, .after = last_col()) # id列を最後の列の後に
iris_tbl %>%
relocate(id, .after = everything()) # id列を全列の後に
# # A tibble: 150 x 6
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
# <dbl> <dbl> <dbl> <dbl> <fct> <int>
# 1 5.1 3.5 1.4 0.2 setosa 1
# 2 4.9 3 1.4 0.2 setosa 2
iris_tbl %>%
relocate(id, .before = Species) # id列をSpecies列の前に
iris_tbl %>%
relocate(id, .before = last_col(0)) # id列をSpecies列の前に
iris_tbl %>%
relocate(id, .after = last_col(1)) # id列を後ろから2番目の列の後に
# # A tibble: 150 x 6
# Sepal.Length Sepal.Width Petal.Length Petal.Width id Species
# <dbl> <dbl> <dbl> <dbl> <int> <fct>
# 1 5.1 3.5 1.4 0.2 1 setosa
# 2 4.9 3 1.4 0.2 2 setosa
iris_tbl %>%
relocate(starts_with("Sepal"), .after = starts_with("Petal")) # Sepalで始まる列をPetalで始まる列の後に
# # A tibble: 150 x 6
# id Petal.Length Petal.Width Sepal.Length Sepal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 1.4 0.2 5.1 3.5 setosa
# 2 2 1.4 0.2 4.9 3 setosa
iris_tbl %>%
relocate(where(is.factor)) # factor型の列を最初に
# # A tibble: 150 x 6
# Species id Sepal.Length Sepal.Width Petal.Length Petal.Width
# <fct> <int> <dbl> <dbl> <dbl> <dbl>
# 1 setosa 1 5.1 3.5 1.4 0.2
# 2 setosa 2 4.9 3 1.4 0.2
iris_tbl %>%
relocate(-where(is.integer)) # integer型の列以外を最初に
# # A tibble: 150 x 6
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
# <dbl> <dbl> <dbl> <dbl> <fct> <int>
# 1 5.1 3.5 1.4 0.2 setosa 1
# 2 4.9 3 1.4 0.2 setosa 2
anscombe_tbl %>%
relocate(contains("x"), .after = contains("y")) # "x"を含む列を"y"を含む列の後に
# # A tibble: 11 x 9
# id y1 y2 y3 y4 x1 x2 x3 x4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 8.04 9.14 7.46 6.58 10 10 10 8
# 2 2 6.95 8.14 6.77 5.76 8 8 8 8
vars <- str_c(letters, "1")
vars
anscombe_tbl %>%
relocate(any_of(vars))
# # A tibble: 11 x 9
# x1 y1 id x2 x3 x4 y2 y3 y4
# <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 10 8.04 1 10 10 8 9.14 7.46 6.58
# 2 8 6.95 2 8 8 8 8.14 6.77 5.76
mutate():列の計算
# mutate()
iris_tbl %>%
mutate(SP = toupper(Species))
# # A tibble: 150 x 7
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species SP
# <int> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
# 1 1 5.1 3.5 1.4 0.2 setosa SETOSA
# 2 2 4.9 3 1.4 0.2 setosa SETOSA
iris_tbl %>%
mutate(sp = as.character(Species))
# # A tibble: 150 x 7
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species sp
# <int> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
# 1 1 5.1 3.5 1.4 0.2 setosa setosa
# 2 2 4.9 3 1.4 0.2 setosa setosa
iris_tbl %>%
mutate(sl_centralized = Sepal.Length - mean(Sepal.Length))
# 中心化
# # A tibble: 150 x 7
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species sl_centralized
# <int> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
# 1 1 5.1 3.5 1.4 0.2 setosa -0.743
# 2 2 4.9 3 1.4 0.2 setosa -0.943
iris_tbl %>%
mutate(sl_normalized = (Sepal.Length - mean(Sepal.Length)) / sd(Sepal.Length))
iris_tbl %>%
mutate(sl_normalized = as.vector(scale(Sepal.Length)))
# 平均=0、標準偏差=1に正規化 (z-score normalization)
# # A tibble: 150 x 7
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species sl_normalized
# <int> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
# 1 1 5.1 3.5 1.4 0.2 setosa -0.898
# 2 2 4.9 3 1.4 0.2 setosa -1.14
# scale()はmatrixを返すためas.vector()でベクトル化している
scale(iris$Sepal.Length)
class(scale(iris$Sepal.Length)) #[1] "matrix"
iris_tbl %>%
mutate(sl_normalized = (Sepal.Length - min(Sepal.Length)) / (max(Sepal.Length) - min(Sepal.Length)))
# 最小=0、最大=1に正規化 (min-max normalization)
# A tibble: 150 x 7
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species sl_normalized
# <int> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
# 1 1 5.1 3.5 1.4 0.2 setosa 0.222
# 2 2 4.9 3 1.4 0.2 setosa 0.167
引数.before, .after:計算後の列の場所の指定
# 引数.before, .after
iris_tbl %>%
mutate(SP = toupper(Species), .before = Species)
# A tibble: 150 x 7
# id Sepal.Length Sepal.Width Petal.Length Petal.Width SP Species
# <int> <dbl> <dbl> <dbl> <dbl> <chr> <fct>
# 1 1 5.1 3.5 1.4 0.2 SETOSA setosa
# 2 2 4.9 3 1.4 0.2 SETOSA setosa
library(stringr)
iris_tbl %>%
mutate(ID = str_pad(id, 3L, pad = "0"), .after = id)
# # A tibble: 150 x 7
# id ID Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 001 5.1 3.5 1.4 0.2 setosa
# 2 2 002 4.9 3 1.4 0.2 setosa
anscombe_tbl %>%
mutate(log_x1 = log(x1), .after = x1)
# # A tibble: 11 x 10
# id x1 log_x1 x2 x3 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 2.30 10 10 8 8.04 9.14 7.46 6.58
# 2 2 8 2.08 8 8 8 6.95 8.14 6.77 5.76
引数.keep:計算に用いた列を残すかどうか
# 引数.keep
anscombe_tbl %>%
mutate(z1 = x1 + y1, .keep = "all") # すべての列を残す
# # A tibble: 11 x 10
# id x1 x2 x3 x4 y1 y2 y3 y4 z1
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 8.04 9.14 7.46 6.58 18.0
# 2 2 8 8 8 8 6.95 8.14 6.77 5.76 15.0
anscombe_tbl %>%
mutate(z1 = x1 + y1, .keep = "used") # 計算に用いた列を残す
# # A tibble: 11 x 3
# x1 y1 z1
# <dbl> <dbl> <dbl>
# 1 10 8.04 18.0
# 2 8 6.95 15.0
anscombe_tbl %>%
mutate(z1 = x1 + y1, .keep = "unused") # 計算に用いなかった列を残す
# # A tibble: 11 x 8
# id x2 x3 x4 y2 y3 y4 z1
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 8 9.14 7.46 6.58 18.0
# 2 2 8 8 8 8.14 6.77 5.76 15.0
anscombe_tbl %>%
mutate(z1 = x1 + y1, .keep = "none") # どの列も残さない
# # A tibble: 11 x 1
# z1
# <dbl>
# 1 18.0
# 2 15.0
mutate(across()):複数列にわたる計算
# mutate(across())
iris_tbl %>%
mutate(across(Sepal.Length:Petal.Width, round))
iris_tbl %>%
mutate(across(2:5, round))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5 4 1 0 setosa
# 2 2 5 3 1 0 setosa
iris_tbl %>%
mutate(across(Sepal.Length:Petal.Width, log))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 1.63 1.25 0.336 -1.61 setosa
# 2 2 1.59 1.10 0.336 -1.61 setosa
scale2 <- function(x, na.rm = FALSE) (x - mean(x, na.rm = na.rm)) / sd(x, na.rm)
iris_tbl %>%
mutate(across(Sepal.Length:Petal.Width, scale2))
# A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 -0.898 1.02 -1.34 -1.31 setosa
# 2 2 -1.14 -0.132 -1.34 -1.31 setosa
where():複数列の条件指定
# where()
iris_tbl %>%
mutate(across(where(is.double), round))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5 4 1 0 setosa
# 2 2 5 3 1 0 setosa
iris_tbl %>%
mutate(across(where(is.factor), as.character))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 1 5.1 3.5 1.4 0.2 setosa
# 2 2 4.9 3 1.4 0.2 setosa
iris_tbl %>%
mutate(across(where(is.factor), as.integer))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <int>
# 1 1 5.1 3.5 1.4 0.2 1
# 2 2 4.9 3 1.4 0.2 1
iris_tbl %>%
mutate(across(where(is.double) & !starts_with("Petal"), round))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <dbl> <dbl> <dbl> <dbl> <fct>
# 1 1 5 4 1.4 0.2 setosa
# 2 2 5 3 1.4 0.2 setosa
引数.fns:複数列にわたる計算に用いる関数の指定
# 引数.fns
anscombe_tbl %>%
mutate(across(x1:y4, log10))
anscombe_tbl %>%
mutate(across(x1:y4, .fns = log10))
anscombe_tbl %>%
mutate(across(x1:y4, ~log(., 10)))
anscombe_tbl %>%
mutate(across(x1:y4, .fns = ~log(., 10)))
# # A tibble: 11 x 9
# id x1 x2 x3 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1 0.903 0.905 0.961 0.873 0.818
# 2 2 0.903 0.903 0.903 0.903 0.842 0.911 0.831 0.760
anscombe_tbl %>%
mutate(across(-id, ~round(log(., 10), 2)))
anscombe_tbl %>%
mutate(across(-id, ~log(., 10))) %>% print() %>%
mutate(across(-id, ~round(., 2)))
# # A tibble: 11 x 9
# id x1 x2 x3 x4 y1 y2 y3 y4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1 0.9 0.91 0.96 0.87 0.82
# 2 2 0.9 0.9 0.9 0.9 0.84 0.91 0.83 0.76
iris_tbl %>%
mutate(across(2:5, ~str_c(sprintf("%.2f", .), "cm")))
iris_tbl %>%
mutate(across(2:5, ~sprintf("%.2fcm", .)))
iris_tbl %>%
mutate(across(2:5, ~str_c(format(., nsmall = 2), "cm")))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <chr> <chr> <chr> <chr> <fct>
# 1 1 5.10cm 3.50cm 1.40cm 0.20cm setosa
# 2 2 4.90cm 3.00cm 1.40cm 0.20cm setosa
iris_tbl %>%
mutate(across(2:3, ~round(.))) %>%
mutate(across(4:5, ~round(., 1))) %>%
mutate(across(2:5, ~str_c(as.character(.), "cm")))
# # A tibble: 150 x 6
# id Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <int> <chr> <chr> <chr> <chr> <fct>
# 1 1 5cm 4cm 1.4cm 0.2cm setosa
# 2 2 5cm 3cm 1.4cm 0.2cm setosa
anscombe_tbl %>%
select(id, x1:x2) %>%
mutate(across(x1:x2, .fns = list(sqrt, exp, log)))
# # A tibble: 11 x 9
# id x1 x2 x1_1 x1_2 x1_3 x2_1 x2_2 x2_3
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 3.16 22026. 2.30 3.16 22026. 2.30
# 2 2 8 8 2.83 2981. 2.08 2.83 2981. 2.08
引数.names:複数列にわたる計算後の列名の指定
# 引数.names
anscombe_tbl %>%
select(id, x1:x4) %>%
mutate(across(x1:x4, log, .names = "log_{col}"))
anscombe_tbl %>%
select(id, x1:x4) %>%
mutate(across(.cols = x1:x4, .fns = log, .names = "log_{col}"))
# # A tibble: 11 x 9
# id x1 x2 x3 x4 log_x1 log_x2 log_x3 log_x4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 2.30 2.30 2.30 2.08
# 2 2 8 8 8 8 2.08 2.08 2.08 2.08
anscombe_tbl %>%
select(id, x1:x4) %>%
mutate(across(.cols = x1:x4, .fns = list(Log = log), .names = "{fn}_{col}"))
# # A tibble: 11 x 9
# id x1 x2 x3 x4 Log_x1 Log_x2 Log_x3 Log_x4
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 2.30 2.30 2.30 2.08
# 2 2 8 8 8 8 2.08 2.08 2.08 2.08
anscombe_tbl %>%
select(id, x1:x2) %>%
mutate(across(.cols = x1:x2,
.fns = list(Sqrt = sqrt, Exp = exp, Log = log),
.names = "{fn}_{col}"))
anscombe_tbl %>%
select(id, x1:x2) %>%
mutate(across(x1:x2, list(Sqrt = sqrt, Exp = exp, Log = log),
.names = "{fn}_{col}"))
# # A tibble: 11 x 9
# id x1 x2 Sqrt_x1 Exp_x1 Log_x1 Sqrt_x2 Exp_x2 Log_x2
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 3.16 22026. 2.30 3.16 22026. 2.30
# 2 2 8 8 2.83 2981. 2.08 2.83 2981. 2.08
引数.names
には、{col}
で(引数.cols
に指定した)列名、{fn}
で(引数.fns
に指定した)関数名を含む列名を指定できます。ただし、関数名は名前付きリストの形(list(名前 = 要素, 名前 = 要素, ...)
の形)で渡す必要があります。列名や関数名を指定しなかった場合の例を次に示します。
anscombe_tbl %>%
select(id, x1:x4) %>%
mutate(across(.cols = x1:x4, .fns = log, .names = "{fn}_{col}"))
# # A tibble: 11 x 9
# id x1 x2 x3 x4 `1_x1` `1_x2` `1_x3` `1_x4`
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 10 8 2.30 2.30 2.30 2.08
# 2 2 8 8 8 8 2.08 2.08 2.08 2.08
# 名前付きリストの形で関数を指定していないため、{fn}のところが数字になっている。
anscombe_tbl %>%
select(id, x1:x2) %>%
mutate(across(.cols = x1:x2,
.fns = list(sqrt, exp, log),
.names = "{fn}_{col}"))
# # A tibble: 11 x 9
# id x1 x2 `1_x1` `2_x1` `3_x1` `1_x2` `2_x2` `3_x2`
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 3.16 22026. 2.30 3.16 22026. 2.30
# 2 2 8 8 2.83 2981. 2.08 2.83 2981. 2.08
# 名前付きリストの形で関数を指定していないため、{fn}のところが数字になっている。
anscombe_tbl %>%
select(id, x1:x2) %>%
mutate(across(.cols = x1:x2,
.fns = list(sqrt, exp, log)))
# # A tibble: 11 x 9
# id x1 x2 x1_1 x1_2 x1_3 x2_1 x2_2 x2_3
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 10 10 3.16 22026. 2.30 3.16 22026. 2.30
# 2 2 8 8 2.83 2981. 2.08 2.83 2981. 2.08
# 計算後の列名の形を指定していないため、"計算前の列名_1"のような列名になっている。
参考:
列計算に使う関数
四則演算や数学関数、文字列関数の他にもいろいろな関数が用意されていますが、長くなりましたので、これについては別記事にします。
参考
- tidyverse
- R for Data Science
- Heavy Watal - R stats
- Rのデータベースの操作についての自前のまとめ