More than 5 years have passed since last update.

R言語 - 高速な共分散と相関の処理 "coop"

Last updated at 2016-08-10Posted at 2016-08-10

はじめに

coop パッケージを使用すると、高速な共分散、相関の処理が可能です。

▼サンプルスクリプト

# 環境情報
# Oracle Distribution of R version 3.2.0  (--) -- "Full of Ingredients"
# Oracle Linux Server release 6.6(2.6.32-504.16.2.el6.x86_64)
# Intel(R) Core(TM) i5-3320M CPU @ 2.60GHz

# 必要に応じて、パッケージをインストール
install.packages( "coop" )
install.packages( "rbenchmark" )
install.packages( "lsa" )

# ライブラリの読み込み
library( coop )
library( rbenchmark )

マトリクスの検証

▼サンプルスクリプト

# ベンチマークの比較項目
cols <- c("test", "replications", "elapsed", "relative")

# ベンチマークの繰り返し回数
reps <- 25

# 正規分布に従う乱数によるマトリクスの生成
x <- matrix( rnorm( 2000*500 ), 2000, 500 )

# 共分散
benchmark( cov(x), coop::covar(x), replications=reps, columns=cols )
# 結果
#             test replications elapsed relative
# 1         cov(x)         25  35.762   56.855
# 2 coop::covar(x)         25   0.629    1.000

# 相関
benchmark( cor(x), coop::pcor(x), replications=reps, columns=cols )
# 結果
#            test replications elapsed relative
# 1        cor(x)           25  36.178   50.247
# 2 coop::pcor(x)           25   0.720    1.000

ベクターの検証

▼サンプルスクリプト

# ベンチマークの比較項目
cols <- c( "test", "replications", "elapsed", "relative" )

# ベンチマークの繰り返し回数
reps <- 100

# 正規分布に従う乱数によるベクターの生成
x <- rnorm( 10000000 )
y <- rnorm( 10000000 )

# 共分散
benchmark( coop::covar(x, y), cov(x, y), columns=cols, replications=reps )
# 結果
#                test replications elapsed relative
# 1 coop::covar(x, y)          100   1.888    1.000
# 2         cov(x, y)          100  31.531   16.701

# 相関
benchmark( coop::pcor(x, y), cor(x, y), replications=reps, columns=cols )
# 結果
#               test replications elapsed relative
# 1 coop::pcor(x, y)          100  19.983    1.000
# 2        cor(x, y)          100  43.198    2.162

[参考]cov()とcovar()の処理の違いを見てみた

cov()のプロファイル

Rprof(filename = "cov.out", append = FALSE, interval = 0.01, memory.profiling = TRUE)
cov(x)
Rprof(NULL)
summaryRprof(filename = "cov.out", memory = "none")
# プロファイル結果
# $by.self
#      self.time self.pct total.time total.pct
# "cov"       1.4      100        1.4       100
#
# $by.total
#      total.time total.pct self.time self.pct
# "cov"        1.4       100       1.4      100
#
# $sample.interval
# [1] 0.01
#
# $sampling.time
# [1] 1.4

coop::covar()のプロファイル

Rprof(filename = "covar.out", append = FALSE, interval = 0.01, memory.profiling = TRUE)
covar(x)
Rprof(NULL)
summaryRprof(filename = "covar.out", memory = "none")
# プロファイル結果
# $by.self
#            self.time self.pct total.time total.pct
# "co_matrix"       0.1      100        0.1       100
#
# $by.total
#               total.time total.pct self.time self.pct
# "co_matrix"           0.1       100       0.1      100
# "covar"               0.1       100       0.0        0
# "covar.matrix"        0.1       100       0.0        0
#
# $sample.interval
# [1] 0.01
#
# $sampling.time
# [1] 0.1

cov()のシステムトレースの比較結果

coop::covar()の高速化のポイントはシステムコールの違いにあることが分かります。

# % time     seconds  usecs/call     calls    errors syscall
# ------ ----------- ----------- --------- --------- ----------------
#  95.89    0.000840           0    140003           write
#   4.11    0.000036          18         2           getrusage
#   0.00    0.000000           0         1           read
#   0.00    0.000000           0         1           mmap
#   0.00    0.000000           0        18           rt_sigaction
#   0.00    0.000000           0        11           rt_sigprocmask
#   0.00    0.000000           0         5           ioctl
#   0.00    0.000000           0         1           select
# ------ ----------- ----------- --------- --------- ----------------
# 100.00    0.000876                140042           total

# coop::covar()のシステムトレース結果
# % time     seconds  usecs/call     calls    errors syscall
# ------ ----------- ----------- --------- --------- ----------------
#  66.62    0.019210        9605         2           readlink
#  25.53    0.007363           0    140003           write
#   4.37    0.001261         631         2           munmap
#   1.68    0.000485          26        19           mmap
#   1.14    0.000328          55         6           mprotect
#   0.66    0.000189          47         4           sched_setaffinity
#   0.00    0.000000           0         9           read
#   0.00    0.000000           0         6           open
#   0.00    0.000000           0         6           close
#   0.00    0.000000           0         6           fstat
#   0.00    0.000000           0         1           brk
#   0.00    0.000000           0        18           rt_sigaction
#   0.00    0.000000           0        11           rt_sigprocmask
#   0.00    0.000000           0         5           ioctl
#   0.00    0.000000           0         1           select
#   0.00    0.000000           0         1           clone
#   0.00    0.000000           0         3           getcwd
#   0.00    0.000000           0         2           getrusage
#   0.00    0.000000           0         5           futex
#   0.00    0.000000           0         3           sched_getaffinity
# ------ ----------- ----------- --------- --------- ----------------
# 100.00    0.028836                140113           total

リファレンス

https://cran.r-project.org/web/packages/coop/vignettes/algos.html
https://github.com/wrathematics/coop/blob/master/README.md

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up