はじめに
機械学習の勉強を始めたVBAユーザです。
備忘録としてRの使い方についてまとめています。
こちらの記事では、Rのパッケージdplyr
, tidyr
を使ったデータフレーム操作についてまとめました。ここではそれと対比して、Rの標準パッケージを用いて同じことをしてみます。
目次
Rの標準パッケージによるデータフレームの集計操作
カウント集計
列のカウントにはnrow()
関数を使うか、rownames()
(行名)のlength()
(長さ)を計算することでもできます。
# カウント集計
nrow(iris)
# [1] 150
length(rownames(iris))
# [1] 150
グループ別カウント集計
グループ別にカウント集計するにはtable()
関数が使えます。結果はtable
クラスのオブジェクトが返りますが、data.frame()
でデータフレームにできます。
# グループ別カウント集計
table(iris$Species)
#
# setosa versicolor virginica
# 50 50 50
table(iris$Species)
#
# setosa versicolor virginica
# 50 50 50
table_iris <- table(iris$Species)
class(table_iris)
# [1] "table"
data.frame(table_iris)
# Var1 Freq
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
as.data.frame(table_iris)
# Var1 Freq
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
as.data.frame(table_iris, responseName = "count")
# Var1 count
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
基本統計量の集計
# 基本統計量の集計
# 最大値
max(iris$Sepal.Length)
# [1] 7.9
# 最小値
min(iris$Sepal.Length)
# [1] 4.3
# ユニークな値の数
length(unique(iris$Sepal.Length))
# [1] 35
# 平均値
mean(iris$Sepal.Length)
# [1] 5.843333
# 中央値
median(iris$Sepal.Length)
# [1] 5.8
# 不偏分散
var(iris$Sepal.Length)
sum((iris$Sepal.Length - mean(iris$Sepal.Length))^2) / (length(iris$Sepal.Length) - 1)
# [1] 0.6856935
# 分散
var(iris$Sepal.Length)*((length(iris$Sepal.Length)-1) / length(iris$Sepal.Length))
sum((iris$Sepal.Length - mean(iris$Sepal.Length))^2) / length(iris$Sepal.Length)
# [1] 0.6811222
# 不偏標準偏差
sd(iris$Sepal.Length)
sqrt(sum((iris$Sepal.Length - mean(iris$Sepal.Length))^2) / (length(iris$Sepal.Length) - 1))
# [1] 0.8280661
# 標準偏差
sd(iris$Sepal.Length)*sqrt((length(iris$Sepal.Length)-1) / length(iris$Sepal.Length))
sqrt(sum((iris$Sepal.Length - mean(iris$Sepal.Length))^2) / length(iris$Sepal.Length))
# [1] 0.8253013
# クォンタイル点(最小値・第1四分位・中央値・第3四分位・最大値)
quantile(iris$Sepal.Length)
# 0% 25% 50% 75% 100%
# 4.3 5.1 5.8 6.4 7.9
quantile(iris$Sepal.Length, 0.25)
# 25%
# 5.1
quantile(iris$Sepal.Length, c(0.25, 0.75))
# 25% 75%
# 5.1 6.4
# 範囲(最小値・最大値)
range(iris$Sepal.Length)
# [1] 4.3 7.9
range(iris$Sepal.Length)[2] - range(iris$Sepal.Length)[1]
# [1] 3.6
# 四分位偏差(第3四分位 - 第1四分位)
IQR(iris$Sepal.Length)
quantile(iris$Sepal.Length, 0.75, names=F) - quantile(iris$Sepal.Length, 0.25, names=F)
# [1] 1.3
# 平均絶対偏差(Mean Absolute Deviation)
mad(iris$Sepal.Length)
# [1] 1.03782
# 5数要約(最小値・下側ヒンジ・中央値・上側ヒンジ・最大値)
fivenum(iris$Sepal.Length)
# [1] 4.3 5.1 5.8 6.4 7.9
# 要約統計量
summary(iris$Sepal.Length)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 4.300 5.100 5.800 5.843 6.400 7.900
# 和
sum(iris$Sepal.Length)
# [1] 876.5
# 積
prod(iris$Sepal.Length)
# [1] 2.25744e+114
summary()
関数を使うと、各列の要約統計量(最小値、第1四分位数、中央値、平均値、第3四分位数、最大値)(因子型ついては各因子の度数カウント)を表示できます。
summary(iris)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
# 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
# Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
# Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
# 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
# Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
summary(anscombe)
# x1 x2 x3 x4 y1 y2 y3 y4
# Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8 Min. : 4.260 Min. :3.100 Min. : 5.39 Min. : 5.250
# 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8 1st Qu.: 6.315 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
# Median : 9.0 Median : 9.0 Median : 9.0 Median : 8 Median : 7.580 Median :8.140 Median : 7.11 Median : 7.040
# Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9 Mean : 7.501 Mean :7.501 Mean : 7.50 Mean : 7.501
# 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8 3rd Qu.: 8.570 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
# Max. :14.0 Max. :14.0 Max. :14.0 Max. :19 Max. :10.840 Max. :9.260 Max. :12.74 Max. :12.500
基本統計量の集計結果をデータフレームで返します。
data.frame(n = length(row.names(iris)),
max = max(iris$Sepal.Length),
min = min(iris$Sepal.Length))
# n max min
# 1 150 7.9 4.3
data.frame(length(row.names(iris)),
mean(iris$Sepal.Length), mean(iris$Sepal.Width),
mean(iris$Petal.Length), mean(iris$Petal.Width))
# length.row.names.iris.. mean.iris.Sepal.Length. mean.iris.Sepal.Width. mean.iris.Petal.Length.
# 1 150 5.843333 3.057333 3.758
data.frame(count = length(row.names(iris)),
mean_sl = mean(iris$Sepal.Length), mean_sw = mean(iris$Sepal.Width),
mean_pl = mean(iris$Petal.Length), mean_pw = mean(iris$Petal.Width))
# count mean_sl mean_sw mean_pl mean_pw
# 1 150 5.843333 3.057333 3.758 1.199333
グループ別の基本統計量の集計
グループ別に基本統計量を集計するには、aggregate()
関数を使います。
attach()
しておくと、データフレーム名を毎回書かなくても列名だけでデータフレームの列にアクセスできるようになります。detach()
で元に戻ります。
# グループ別の基本統計量の集計
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = length)
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = function(x) length(x))
# Group.1 x
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
attach(iris)
aggregate(Sepal.Length, by = list(Species), FUN = length)
detach(iris)
# Group.1 x
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
aggregate(iris$Sepal.Length, by = list(species = iris$Species), FUN = length)
aggregate(iris$Sepal.Length, by = list(species = iris$Species), FUN = function(x) length(x))
# species x
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
aggregate(Sepal.Length ~ Species, data = iris, FUN = length)
aggregate(Sepal.Length ~ Species, data = iris, FUN = function(x) length(x))
# Species Sepal.Length
# 1 setosa 50
# 2 versicolor 50
# 3 virginica 50
aggregate(iris$Sepal.Length, by = list(iris$Species, round(iris$Sepal.Length)), FUN = length)
# Group.1 Group.2 x
# 1 setosa 4 5
# 2 setosa 5 40
# 3 versicolor 5 6
# 4 virginica 5 1
# 5 setosa 6 5
# 6 versicolor 6 36
# 7 virginica 6 27
# 8 versicolor 7 8
# 9 virginica 7 16
# 10 virginica 8 6
aggregate(iris$Sepal.Length, by = list(species = iris$Species, sl = round(iris$Sepal.Length)), FUN = length)
# species sl x
# 1 setosa 4 5
# 2 setosa 5 40
# 3 versicolor 5 6
# 4 virginica 5 1
# 5 setosa 6 5
# 6 versicolor 6 36
# 7 virginica 6 27
# 8 versicolor 7 8
# 9 virginica 7 16
# 10 virginica 8 6
aggregate(Sepal.Length ~ Species + round(Sepal.Length), data = iris, FUN = length)
# Species round(Sepal.Length) Sepal.Length
# 1 setosa 4 5
# 2 setosa 5 40
# 3 versicolor 5 6
# 4 virginica 5 1
# 5 setosa 6 5
# 6 versicolor 6 36
# 7 virginica 6 27
# 8 versicolor 7 8
# 9 virginica 7 16
# 10 virginica 8 6
aggregate(iris$Sepal.Length, by = list(species = iris$Species), FUN = mean)
# species x
# 1 setosa 5.006
# 2 versicolor 5.936
# 3 virginica 6.588
aggregate(Sepal.Length ~ Species, data = iris, FUN = mean)
# Species Sepal.Length
# 1 setosa 5.006
# 2 versicolor 5.936
# 3 virginica 6.588
aggregate(iris[,1:4], by = list(species = iris$Species), FUN = mean)
# species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 5.006 3.428 1.462 0.246
# 2 versicolor 5.936 2.770 4.260 1.326
# 3 virginica 6.588 2.974 5.552 2.026
df <- cbind(
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = length),
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = mean)[,2],
aggregate(iris$Sepal.Width, by = list(iris$Species), FUN = mean)[,2],
aggregate(iris$Petal.Length, by = list(iris$Species), FUN = mean)[,2],
aggregate(iris$Petal.Width, by = list(iris$Species), FUN = mean)[,2])
colnames(df) <- c("Species", "count", "mean_sl", "mean_sw", "mean_pl", "mean_pw")
df
# Species count mean_sl mean_sw mean_pl mean_pw
# 1 setosa 50 5.006 3.428 1.462 0.246
# 2 versicolor 50 5.936 2.770 4.260 1.326
# 3 virginica 50 6.588 2.974 5.552 2.026
df <- cbind(
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = length),
aggregate(iris[,1:4], by = list(iris$Species), FUN = mean)[,2:5])
colnames(df) <- c("Species", "count", "mean_sl", "mean_sw", "mean_pl", "mean_pw")
df
# Species count mean_sl mean_sw mean_pl mean_pw
# 1 setosa 50 5.006 3.428 1.462 0.246
# 2 versicolor 50 5.936 2.770 4.260 1.326
# 3 virginica 50 6.588 2.974 5.552 2.026
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = function(x) quantile(x, 0.25))
aggregate(Sepal.Length ~ Species, data = iris, FUN = function(x) quantile(x, 0.25))
# Species Sepal.Length
# 1 setosa 4.800
# 2 versicolor 5.600
# 3 virginica 6.225
aggregate(iris$Sepal.Length, by = list(iris$Species), FUN = function(x) quantile(x, c(0.25, 0.75)))
aggregate(Sepal.Length ~ Species, data = iris, FUN = function(x) quantile(x, c(0.25, 0.75)))
# Species Sepal.Length.25% Sepal.Length.75%
# 1 setosa 4.800 5.200
# 2 versicolor 5.600 6.300
# 3 virginica 6.225 6.900
df <- cbind(
aggregate(Sepal.Length ~ Species, data = iris, FUN = length),
aggregate(Sepal.Length ~ Species, data = iris, FUN = function(x) length(unique(x)))[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = mean)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = median)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = sd)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = IQR)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = mad)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = min)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = max)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = function(x) quantile(x, 0.25))[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = function(x) quantile(x, 0.75))[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = sum)[,2],
aggregate(Sepal.Length ~ Species, data = iris, FUN = prod)[,2])
colnames(df) <- c("Species", "n", "n_distinct",
"mean", "median", "sd", "IQR", "max", "min", "max", "Q1", "Q2",
"sum", "prod")
df
# Species n n_distinct mean median sd IQR max min max Q1 Q2 sum prod
# 1 setosa 50 15 5.006 5.0 0.3524897 0.400 0.29652 4.3 5.8 4.800 5.2 250.3 8.350948e+34
# 2 versicolor 50 21 5.936 5.9 0.5161711 0.700 0.51891 4.9 7.0 5.600 6.3 296.8 3.926361e+38
# 3 virginica 50 21 6.588 6.5 0.6358796 0.675 0.59304 4.9 7.9 6.225 6.9 329.4 6.884782e+40
統計量を集計する関数の結果が長さ1でない場合について
# 統計量を集計する関数の結果が長さ1でない場合
aggregate(Sepal.Length ~ 1, data = iris, FUN = range)
# Sepal.Length.1 Sepal.Length.2
# 1 4.3 7.9
aggregate(Sepal.Length ~ 1, data = iris, FUN = summary)
# Sepal.Length.Min. Sepal.Length.1st Qu. Sepal.Length.Median Sepal.Length.Mean Sepal.Length.3rd Qu. Sepal.Length.Max.
# 1 4.300000 5.100000 5.800000 5.843333 6.400000 7.900000
aggregate(Sepal.Length ~ 1, data = iris, FUN = function(x) quantile(x, c(0.0, 0.25, 0.5, 0.75, 1.0)))
# Sepal.Length.0% Sepal.Length.25% Sepal.Length.50% Sepal.Length.75% Sepal.Length.100%
# 1 4.3 5.1 5.8 6.4 7.9
aggregate(Sepal.Length ~ Species, data = iris, FUN = summary)
# Species Sepal.Length.Min. Sepal.Length.1st Qu. Sepal.Length.Median Sepal.Length.Mean Sepal.Length.3rd Qu. Sepal.Length.Max.
# 1 setosa 4.300 4.800 5.000 5.006 5.200 5.800
# 2 versicolor 4.900 5.600 5.900 5.936 6.300 7.000
# 3 virginica 4.900 6.225 6.500 6.588 6.900 7.900
複数列にわたる集計
# 複数列にわたる集計
aggregate(iris[,1:4], by = list(species = iris$Species), FUN = mean)
# species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 5.006 3.428 1.462 0.246
# 2 versicolor 5.936 2.770 4.260 1.326
# 3 virginica 6.588 2.974 5.552 2.026
aggregate(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data = iris, FUN = mean)
aggregate(. ~ Species, data = iris, FUN = mean)
# Species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 5.006 3.428 1.462 0.246
# 2 versicolor 5.936 2.770 4.260 1.326
# 3 virginica 6.588 2.974 5.552 2.026
aggregate(iris[,1:4], by = list(species = iris$Species), FUN = function(x) mean(x) * 50)
# species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 250.3 171.4 73.1 12.3
# 2 versicolor 296.8 138.5 213.0 66.3
# 3 virginica 329.4 148.7 277.6 101.3
aggregate(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data = iris, FUN = function(x) mean(x) * 50)
aggregate(. ~ Species, data = iris, FUN = function(x) mean(x) * 50)
# Species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 250.3 171.4 73.1 12.3
# 2 versicolor 296.8 138.5 213.0 66.3
# 3 virginica 329.4 148.7 277.6 101.3
aggregate(iris[,1:4], by = list(species = iris$Species), FUN = sum)
# species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 250.3 171.4 73.1 12.3
# 2 versicolor 296.8 138.5 213.0 66.3
# 3 virginica 329.4 148.7 277.6 101.3
aggregate(cbind(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) ~ Species, data = iris, FUN = sum)
aggregate(. ~ Species, data = iris, FUN = sum)
# Species Sepal.Length Sepal.Width Petal.Length Petal.Width
# 1 setosa 250.3 171.4 73.1 12.3
# 2 versicolor 296.8 138.5 213.0 66.3
# 3 virginica 329.4 148.7 277.6 101.3
aggregate(cbind(x1, x2, x3, x4, y1, y2, y3, y4) ~ 1, data = anscombe, FUN = mean)
aggregate(. ~ 1, data = anscombe, FUN = mean)
# x1 x2 x3 x4 y1 y2 y3 y4
# 1 9 9 9 9 7.500909 7.500909 7.5 7.500909
aggregate(anscombe, by = list(rep(1, 11)), FUN = mean)
# Group.1 x1 x2 x3 x4 y1 y2 y3 y4
# 1 1 9 9 9 9 7.500909 7.500909 7.5 7.500909
複数の集計関数
複数の集計関数を適用することはできない?
おまけ
# summary関数のような出力
df <- rbind(
aggregate(. ~ 1, data = iris[,1:4], FUN = min),
aggregate(. ~ 1, data = iris[,1:4], FUN = function(x) quantile(x, 0.25)),
aggregate(. ~ 1, data = iris[,1:4], FUN = median),
aggregate(. ~ 1, data = iris[,1:4], FUN = mean),
aggregate(. ~ 1, data = iris[,1:4], FUN = function(x) quantile(x, 0.75)),
aggregate(. ~ 1, data = iris[,1:4], FUN = max))
rownames(df) <- c("Min", "Q1", "Median", "Mean", "Q3", "Max")
df
# Sepal.Length Sepal.Width Petal.Length Petal.Width
# Min 4.300000 2.000000 1.000 0.100000
# Q1 5.100000 2.800000 1.600 0.300000
# Median 5.800000 3.000000 4.350 1.300000
# Mean 5.843333 3.057333 3.758 1.199333
# Q3 6.400000 3.300000 5.100 1.800000
# Max 7.900000 4.400000 6.900 2.500000
df <- rbind(
aggregate(. ~ 1, data = anscombe, FUN = min),
aggregate(. ~ 1, data = anscombe, FUN = function(x) quantile(x, 0.25)),
aggregate(. ~ 1, data = anscombe, FUN = median),
aggregate(. ~ 1, data = anscombe, FUN = mean),
aggregate(. ~ 1, data = anscombe, FUN = function(x) quantile(x, 0.75)),
aggregate(. ~ 1, data = anscombe, FUN = max))
rownames(df) <- c("Min.", "1st Qu.", "Median", "Mean", "3rd Qu.", "Max.")
df
# x1 x2 x3 x4 y1 y2 y3 y4
# Min. 4.0 4.0 4.0 8 4.260000 3.100000 5.39 5.250000
# 1st Qu. 6.5 6.5 6.5 8 6.315000 6.695000 6.25 6.170000
# Median 9.0 9.0 9.0 8 7.580000 8.140000 7.11 7.040000
# Mean 9.0 9.0 9.0 9 7.500909 7.500909 7.50 7.500909
# 3rd Qu. 11.5 11.5 11.5 8 8.570000 8.950000 7.98 8.190000
# Max. 14.0 14.0 14.0 19 10.840000 9.260000 12.74 12.500000
ピボット集計
まず、使用するデータを作成しておきます。
# データの準備
iris_tbl <- iris
iris_tbl$id <- 1:nrow(iris)
iris_sl <- data.frame(iris_tbl[, c("id", "Species")], part = "Sepal", measure = "Length", value = iris_tbl$Sepal.Length)
iris_sw <- data.frame(iris_tbl[, c("id", "Species")], part = "Sepal", measure = "Width" , value = iris_tbl$Sepal.Width)
iris_pl <- data.frame(iris_tbl[, c("id", "Species")], part = "Petal", measure = "Length", value = iris_tbl$Petal.Length)
iris_pw <- data.frame(iris_tbl[, c("id", "Species")], part = "Petal", measure = "Width" , value = iris_tbl$Petal.Width)
iris_long <- rbind(iris_sl, iris_sw, iris_pl, iris_pw)
iris_long
# id Species part measure value
# 1 1 setosa Sepal Length 5.1
# 2 2 setosa Sepal Length 4.9
# 3 3 setosa Sepal Length 4.7
# 4 4 setosa Sepal Length 4.6
# 5 5 setosa Sepal Length 5.0
# ...
anscombe_tbl <- anscombe
anscombe_tbl$id <- 1:nrow(anscombe)
anscombe_x1 <- data.frame(id = anscombe_tbl$id, axis = "x", set = "1", value = anscombe_tbl$x1)
anscombe_x2 <- data.frame(id = anscombe_tbl$id, axis = "x", set = "2", value = anscombe_tbl$x2)
anscombe_x3 <- data.frame(id = anscombe_tbl$id, axis = "x", set = "3", value = anscombe_tbl$x3)
anscombe_x4 <- data.frame(id = anscombe_tbl$id, axis = "x", set = "4", value = anscombe_tbl$x4)
anscombe_y1 <- data.frame(id = anscombe_tbl$id, axis = "y", set = "1", value = anscombe_tbl$y1)
anscombe_y2 <- data.frame(id = anscombe_tbl$id, axis = "y", set = "2", value = anscombe_tbl$y2)
anscombe_y3 <- data.frame(id = anscombe_tbl$id, axis = "y", set = "3", value = anscombe_tbl$y3)
anscombe_y4 <- data.frame(id = anscombe_tbl$id, axis = "y", set = "4", value = anscombe_tbl$y4)
anscombe_long <- rbind(anscombe_x1, anscombe_x2, anscombe_x3, anscombe_x4,
anscombe_y1, anscombe_y2, anscombe_y3, anscombe_y4)
anscombe_long
# id axis set value
# 1 1 x 1 10.00
# 2 2 x 1 8.00
# 3 3 x 1 13.00
# 4 4 x 1 9.00
# 5 5 x 1 11.00
# ...
ピボット集計でカウント
table()
関数は2次元でも使えます。
# ピボット集計でカウント
table(iris_long$part, iris_long$measure)
#
# Length Width
# Petal 150 150
# Sepal 150 150
table(paste(iris_long$part, iris_long$measure), iris_long$Species)
#
# setosa versicolor virginica
# Petal Length 50 50 50
# Petal Width 50 50 50
# Sepal Length 50 50 50
# Sepal Width 50 50 50
table(anscombe_long$set, anscombe_long$axis)
#
# x y
# 1 11 11
# 2 11 11
# 3 11 11
# 4 11 11
2次元以上になる場合は、ftable()
関数も使えます。
ftable(anscombe_long[c("set", "axis")])
# axis x y
# set
# 1 11 11
# 2 11 11
# 3 11 11
# 4 11 11
ftable(iris_long[c("part", "measure", "Species")])
# Species setosa versicolor virginica
# part measure
# Sepal Length 50 50 50
# Width 50 50 50
# Petal Length 50 50 50
# Width 50 50 50
ftable(iris_long[c("Species", "measure", "part")], row.vars = c(3, 2))
# Species setosa versicolor virginica
# part measure
# Sepal Length 50 50 50
# Width 50 50 50
# Petal Length 50 50 50
# Width 50 50 50
ftable(iris_long[c("Species", "part", "measure")], row.vars = c(1))
# part Sepal Petal
# measure Length Width Length Width
# Species
# setosa 50 50 50 50
# versicolor 50 50 50 50
# virginica 50 50 50 50
ftable(anscombe_long[c("set", "axis")], row.vars = c(2, 1))
# axis set
# x 1 11
# 2 11
# 3 11
# 4 11
# y 1 11
# 2 11
# 3 11
# 4 11
xtabs()
関数も使えます。
xtabs(~ set + axis, data = anscombe_long)
# axis
# set x y
# 1 11 11
# 2 11 11
# 3 11 11
# 4 11 11
xtabs(~ paste(part, measure) + Species, data = iris_long)
# Species
# paste(part, measure) setosa versicolor virginica
# Petal Length 50 50 50
# Petal Width 50 50 50
# Sepal Length 50 50 50
# Sepal Width 50 50 50
anscombe_ag <- aggregate(value ~ set + axis, data = anscombe_long, FUN = length)
anscombe_ag
# set axis value
# 1 1 x 11
# 2 2 x 11
# 3 3 x 11
# 4 4 x 11
# 5 1 y 11
# 6 2 y 11
# 7 3 y 11
# 8 4 y 11
xtabs(value ~ set + axis, data = anscombe_ag)
# axis
# set x y
# 1 11 11
# 2 11 11
# 3 11 11
# 4 11 11
iris_ag <- aggregate(value ~ part + measure + Species, data = iris_long, FUN = length)
iris_ag
# part measure Species value
# 1 Petal Length setosa 50
# 2 Sepal Length setosa 50
# 3 Petal Width setosa 50
# 4 Sepal Width setosa 50
# 5 Petal Length versicolor 50
# 6 Sepal Length versicolor 50
# 7 Petal Width versicolor 50
# 8 Sepal Width versicolor 50
# 9 Petal Length virginica 50
# 10 Sepal Length virginica 50
# 11 Petal Width virginica 50
# 12 Sepal Width virginica 50
xtabs(value ~ paste(part, measure) + Species, data = iris_ag)
# Species
# paste(part, measure) setosa versicolor virginica
# Petal Length 50 50 50
# Petal Width 50 50 50
# Sepal Length 50 50 50
# Sepal Width 50 50 50
ピボット集計で合計
xtabs()
関数でピボット集計(合計)ができます。
# ピボット集計で合計
xtabs(value ~ set + axis, data = anscombe_long)
# axis
# set x y
# 1 99.00 82.51
# 2 99.00 82.51
# 3 99.00 82.50
# 4 99.00 82.51
xtabs(value ~ paste(part, measure) + Species, data = iris_long)
# Species
# paste(part, measure) setosa versicolor virginica
# Petal Length 73.1 213.0 277.6
# Petal Width 12.3 66.3 101.3
# Sepal Length 250.3 296.8 329.4
# Sepal Width 171.4 138.5 148.7
anscombe_ag <- aggregate(value ~ set + axis, data = anscombe_long, FUN = sum)
anscombe_ag
# set axis value
# 1 1 x 99.00
# 2 2 x 99.00
# 3 3 x 99.00
# 4 4 x 99.00
# 5 1 y 82.51
# 6 2 y 82.51
# 7 3 y 82.50
# 8 4 y 82.51
xtabs(value ~ set + axis, data = anscombe_ag)
# axis
# set x y
# 1 99.00 82.51
# 2 99.00 82.51
# 3 99.00 82.50
# 4 99.00 82.51
iris_ag <- aggregate(value ~ part + measure + Species, data = iris_long, FUN = sum)
iris_ag
# part measure Species value
# 1 Petal Length setosa 73.1
# 2 Sepal Length setosa 250.3
# 3 Petal Width setosa 12.3
# 4 Sepal Width setosa 171.4
# 5 Petal Length versicolor 213.0
# 6 Sepal Length versicolor 296.8
# 7 Petal Width versicolor 66.3
# 8 Sepal Width versicolor 138.5
# 9 Petal Length virginica 277.6
# 10 Sepal Length virginica 329.4
# 11 Petal Width virginica 101.3
# 12 Sepal Width virginica 148.7
xtabs(value ~ paste(part, measure) + Species, data = iris_ag)
# Species
# paste(part, measure) setosa versicolor virginica
# Petal Length 73.1 213.0 277.6
# Petal Width 12.3 66.3 101.3
# Sepal Length 250.3 296.8 329.4
# Sepal Width 171.4 138.5 148.7
ピボット集計で平均
ピボット集計で平均を取る場合は、まずaggregate()
関数で平均を計算しておいて、xtabs()
関数でピボットすればできます。
# ピボット集計で平均
anscombe_ag <- aggregate(value ~ set + axis, data = anscombe_long, FUN = mean)
anscombe_ag
# set axis value
# 1 1 x 9.000000
# 2 2 x 9.000000
# 3 3 x 9.000000
# 4 4 x 9.000000
# 5 1 y 7.500909
# 6 2 y 7.500909
# 7 3 y 7.500000
# 8 4 y 7.500909
xtabs(value ~ set + axis, data = anscombe_ag)
# axis
# set x y
# 1 9.000000 7.500909
# 2 9.000000 7.500909
# 3 9.000000 7.500000
# 4 9.000000 7.500909
iris_ag <- aggregate(value ~ part + measure + Species, data = iris_long, FUN = mean)
iris_ag
# part measure Species value
# 1 Petal Length setosa 1.462
# 2 Sepal Length setosa 5.006
# 3 Petal Width setosa 0.246
# 4 Sepal Width setosa 3.428
# 5 Petal Length versicolor 4.260
# 6 Sepal Length versicolor 5.936
# 7 Petal Width versicolor 1.326
# 8 Sepal Width versicolor 2.770
# 9 Petal Length virginica 5.552
# 10 Sepal Length virginica 6.588
# 11 Petal Width virginica 2.026
# 12 Sepal Width virginica 2.974
xtabs(value ~ paste(part, measure) + Species, data = iris_ag)
# Species
# paste(part, measure) setosa versicolor virginica
# Petal Length 1.462 4.260 5.552
# Petal Width 0.246 1.326 2.026
# Sepal Length 5.006 5.936 6.588
# Sepal Width 3.428 2.770 2.974