NOTE: If you are upgrading from a previous installation of TensorFlow < 0.7.1, you should uninstall the previous TensorFlow and protobuf using pip uninstall first to make sure you get a clean installation of the updated protobuf dependency.
# 削除したprotobufを再インストール
$ pip install protobuf
# OS XでCPU利用の場合
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.11.0rc1-py2-none-any.whl
$ sudo pip install --upgrade $TF_BINARY_URL
# {tensorflow}のインストール時に{tensorflow}で利用するPythonを指定するため、パスを確認しておく
$ which python
export PYTHON_PATH=`which python`
# Set the path of the Python that {tensorflow} should use as an environment variable, then install the package.
Sys.setenv(TENSORFLOW_PYTHON = "/usr/local/bin/python")
# Iteration count consumed by the training loop (`foreach::times(n = SET_SETP_NUM)`).
# NOTE(review): "SETP" is presumably a typo for "STEP"; renaming it requires updating every use.
SET_SETP_NUM <- 3000
#' Compute accuracy from a confusion matrix.
#'
#' @param confusion_mat A confusion matrix (matrix or two-way table) whose
#'   diagonal cells hold the correctly classified counts.
#' @return The sum of the diagonal divided by the sum of all cells.
calcAccuracy <- function(confusion_mat) {
  sum(diag(x = confusion_mat)) / sum(confusion_mat)
}
# Softmax regression model, following the "MNIST For ML Beginners" example.
# SET_DATA_PARAM (FEATURE_NUM, CLASS_NUM) is expected to be defined elsewhere.

# Weights: FEATURE_NUM x CLASS_NUM, initialised to zeros.
W <- tensorflow::tf$Variable(
  initial_value = tensorflow::tf$zeros(
    shape = tensorflow::shape(SET_DATA_PARAM$FEATURE_NUM, SET_DATA_PARAM$CLASS_NUM)
  )
)
# Input placeholder: unspecified number of rows, FEATURE_NUM columns.
x <- tensorflow::tf$placeholder(
  dtype = tensorflow::tf$float32,
  shape = tensorflow::shape(NULL, SET_DATA_PARAM$FEATURE_NUM)
)
# Bias: one value per class, initialised to zeros.
b <- tensorflow::tf$Variable(
  initial_value = tensorflow::tf$zeros(
    shape = tensorflow::shape(SET_DATA_PARAM$CLASS_NUM)
  )
)
# Model output: softmax(x %*% W + b).
y <- tensorflow::tf$nn$softmax(logits = tensorflow::tf$matmul(a = x, b = W) + b)
# Placeholder for the target matrix: unspecified rows, CLASS_NUM columns.
y_ <- tensorflow::tf$placeholder(
  dtype = tensorflow::tf$float32,
  shape = tensorflow::shape(NULL, SET_DATA_PARAM$CLASS_NUM)
)
# Loss function and optimizer.
# Loss: reduce_mean of the negated reduce_sum of y_ * log(y) along index 1.
cross_entropy <- tensorflow::tf$reduce_mean(
  input_tensor = - tensorflow::tf$reduce_sum(
    input_tensor = y_ * tensorflow::tf$log(x = y),
    reduction_indices = 1L
  )
)
# Gradient descent with learning rate 0.5; train_step is the op run per iteration.
optimizer <- tensorflow::tf$train$GradientDescentOptimizer(
  learning_rate = 0.5
)
train_step <- optimizer$minimize(
  loss = cross_entropy
)
# Evaluation ops.
# A row is correct when the argmax of the model output y equals the argmax of
# the target y_.
correct_prediction <- tensorflow::tf$equal(
  x = tensorflow::tf$argmax(input = y, dimension = 1L),
  y = tensorflow::tf$argmax(input = y_, dimension = 1L)
)
# Accuracy: mean of the correctness flags after casting them to float32.
accuracy <- tensorflow::tf$reduce_mean(
  input_tensor = tensorflow::tf$cast(x = correct_prediction, dtype = tensorflow::tf$float32)
)
# irisデータ
> iris %>%
head(n = 10)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
7 4.6 3.4 1.4 0.3 setosa
8 5.0 3.4 1.5 0.2 setosa
9 4.4 2.9 1.4 0.2 setosa
10 4.9 3.1 1.5 0.1 setosa
> iris %>%
dplyr::group_by(Species) %>%
dplyr::summarise_all(.fun = mean)
# A tibble: 3 × 5
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fctr> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.006 3.428 1.462 0.246
2 versicolor 5.936 2.770 4.260 1.326
3 virginica 6.588 2.974 5.552 2.026
# Fix the RNG seed, then draw (with replacement) one fold label from
# 1..SET_CV_NUM for every row of iris.
set.seed(seed = 71)
cv_number <- sample(
  x = seq(from = 1, to = SET_CV_NUM),
  size = nrow(x = iris),
  replace = TRUE
)
# N-fold cross-validation.
# For each fold: train the softmax model on the rows outside the fold, score
# the training rows (fit) and the held-out rows (predict), and fit a glmnet
# multinomial model on the same split for comparison.
# Returns a list with one element per fold, each a list of:
#   tf_fit_accuracy, fit_confusion_mat, tf_predict_accuracy,
#   predict_confusion_mat, glmnet_confusion_mat.
tf_result <- lapply(
  X = seq(from = 1, to = SET_CV_NUM),
  FUN = function(cv_counter) {
    # Fixed class levels so every confusion table is CLASS_NUM x CLASS_NUM
    # even when a class is never predicted (or absent) in a fold; otherwise
    # diag() in calcAccuracy would compare misaligned cells.
    class_levels <- seq_len(SET_DATA_PARAM$CLASS_NUM)
    to_class <- function(score_mat) {
      factor(
        x = apply(X = score_mat, MARGIN = 1, FUN = which.max),
        levels = class_levels
      )
    }

    # Training data.
    # NOTE(review): the trailing as.matrix() steps are reconstructed; the
    # original pipe tails were lost. Matrices are what the later row
    # indexing, apply() and glmnet calls need.
    trn_d <- iris %>%
      dplyr::filter(cv_number != cv_counter)
    trn_x <- trn_d %>%
      dplyr::select(-Species) %>%
      as.matrix()
    trn_y <- trn_d %>%
      caret::dummyVars(formula = ~ Species, sep = NULL) %>%
      predict(object = ., newdata = trn_d) %>%
      as.matrix()

    # Initialisation: new session per fold, variables reset.
    tf_session <- tensorflow::tf$Session()
    # Release the session when this fold finishes, even on error.
    on.exit(tf_session$close(), add = TRUE)
    tf_session$run(fetches = tensorflow::tf$initialize_all_variables())

    # Check that the parameters were re-initialised (both sums should be 0).
    print(
      stringr::str_c(
        stringr::str_c("CV:", cv_counter),
        stringr::str_c("W:", sum(tf_session$run(W))),
        stringr::str_c("b:", sum(tf_session$run(b))),
        sep = " "
      )
    )

    # Training: each step uses a random subset of rows (each row kept with
    # probability 0.5).
    foreach::times(n = SET_SETP_NUM) %do% {
      step_logic <- sample(
        x = c(FALSE, TRUE), size = nrow(x = trn_x),
        replace = TRUE, prob = c(0.5, 0.5)
      )
      tf_session$run(
        fetches = train_step,
        feed_dict = tensorflow::dict(
          x = trn_x[step_logic, , drop = FALSE],
          y_ = trn_y[step_logic, , drop = FALSE]
        )
      )
    }

    # Fit results on the training data.
    tf_fit_accuracy <- accuracy$eval(
      feed_dict = tensorflow::dict(x = trn_x, y_ = trn_y),
      session = tf_session
    )
    fit_confusion_mat <- table(
      predict = to_class(
        tf_session$run(fetches = y, feed_dict = tensorflow::dict(x = trn_x, y_ = trn_y))
      ),
      true = to_class(trn_y)
    )

    # Evaluation data (the held-out fold).
    tst_d <- iris %>%
      dplyr::filter(cv_number == cv_counter)
    tst_x <- tst_d %>%
      dplyr::select(-Species) %>%
      as.matrix()
    tst_y <- tst_d %>%
      caret::dummyVars(formula = ~ Species, sep = NULL) %>%
      predict(object = ., newdata = tst_d) %>%
      as.matrix()

    # Prediction results on the held-out fold.
    tf_predict_accuracy <- accuracy$eval(
      feed_dict = tensorflow::dict(x = tst_x, y_ = tst_y),
      session = tf_session
    )
    predict_confusion_mat <- table(
      predict = to_class(
        tf_session$run(
          fetches = y,
          feed_dict = tensorflow::dict(
            x = tst_x,
            # Dummy all-zero targets, as in the original call.
            y_ = matrix(data = 0, nrow = nrow(x = tst_x), ncol = SET_DATA_PARAM$CLASS_NUM)
          )
        )
      ),
      true = to_class(tst_y)
    )

    # Baseline for comparison: glmnet multinomial model on the same split.
    glmnet_mdl <- glmnet::glmnet(x = trn_x, y = trn_d$Species, family = "multinomial")
    glmnet_confusion_mat <- table(
      predict = factor(
        x = predict(object = glmnet_mdl, newx = tst_x, type = "class", s = 0.01)[, 1],
        levels = levels(tst_d$Species)
      ),
      true = tst_d$Species
    )

    list(
      tf_fit_accuracy = tf_fit_accuracy,
      fit_confusion_mat = fit_confusion_mat,
      tf_predict_accuracy = tf_predict_accuracy,
      predict_confusion_mat = predict_confusion_mat,
      glmnet_confusion_mat = glmnet_confusion_mat
    )
  }
)
[1] "CV:1 W:0 b:0"
[1] "CV:2 W:0 b:0"
[1] "CV:3 W:0 b:0"
[1] "CV:4 W:0 b:0"
[1] "CV:5 W:0 b:0"
# 当てはめ結果を確認
> sapply(X = tf_result, FUN = "[[", "tf_fit_accuracy")
[1] 0.9487180 0.9920000 0.9663866 0.9842520 0.9732143
# TensorFlow上で評価用に算出したaccuracyとほぼ同じ結果
> sapply(
X = lapply(X = tf_result, FUN = "[[", "fit_confusion_mat"),
FUN = calcAccuracy
)
[1] 0.9487179 0.9920000 0.9663866 0.9842520 0.9732143
# 当てはめ結果の平均
> mean(x = sapply(X = tf_result, FUN = "[[", "tf_fit_accuracy"))
[1] 0.9729141
# 当てはめ結果のconfusion matrix
> lapply(X = tf_result, FUN = "[[", "fit_confusion_mat")
predict 1 2 3
1 40 0 0
2 0 31 0
3 0 6 40
predict 1 2 3
1 39 0 0
2 0 41 0
3 0 1 44
predict 1 2 3
1 42 0 0
2 0 33 0
3 0 4 40
predict 1 2 3
1 38 0 0
2 0 43 1
3 0 1 44
predict 1 2 3
1 41 0 0
2 0 37 0
3 0 3 31
# 予測結果を確認
> sapply(X = tf_result, FUN = "[[", "tf_predict_accuracy")
[1] 0.9696970 0.9200000 0.9677419 0.9565217 0.9736842
> sapply(
X = lapply(X = tf_result, FUN = "[[", "predict_confusion_mat"),
FUN = calcAccuracy
)
[1] 0.9696970 0.9200000 0.9677419 0.9565217 0.9736842
# 予測結果の平均
> mean(x = sapply(X = tf_result, FUN = "[[", "tf_predict_accuracy"))
[1] 0.957529
# 予測結果のconfusion matrix
> lapply(X = tf_result, FUN = "[[", "predict_confusion_mat")
predict 1 2 3
1 10 0 0
2 0 12 0
3 0 1 10
predict 1 2 3
1 11 0 0
2 0 6 0
3 0 2 6
predict 1 2 3
1 8 0 0
2 0 12 0
3 0 1 10
predict 1 2 3
1 12 0 0
2 0 6 1
3 0 0 4
predict 1 2 3
1 9 0 0
2 0 9 0
3 0 1 19
# {glmnet}の予測結果
> glmnet_ev <- sapply(
X = lapply(X = tf_result, FUN = "[[", "glmnet_confusion_mat"),
FUN = calcAccuracy
) %>%
print
[1] 1.0000000 0.9200000 0.9677419 0.9130435 0.9736842
> mean(x = glmnet_ev)
[1] 0.9548939
{tensorflow}を用いて、分析業界の"Hello World"であるirisデータの分類を試しました。以前は{PythonInR}を使ってTensorFlowを呼び出しましたが、RStudio社が公開するパッケージということでこちらの方が安心感があります。今回はとりあえず動かしてみただけですので、モデルの改良などはもう少しお勉強してからにします。
