More than 3 years have passed since last update.

mlr3で「Rによる機械学習」：第5章「分割統治 --- 決定木と分類ルールを使った分類」

Posted at 2021-08-24

準備

必要なパッケージをインストールしておきます．

remotes::install_github("mlr-org/mlr3extralearners")
install.packages("C50")
install.packages("RWeka")

5.2 実例 --- C5.0決定木を使った銀行の信用評価モデル

データのダウンロードをします．

url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
credit <- read_delim(url,
                     delim = " ",
                     col_names = FALSE,
                     col_types = "cdffdccdffdfdffdfdffi") %>%
  select(checking_account = 1,
         months_loan_duaration = 2,
         credit_history = 3,
         purpose = 4,
         amount = 5,
         savings_account = 6,
         present_employment_since = 7,
         installment_rate_in_percentage_of_disposable_income = 8,
         personal_status_and_sex = 9,
         other_debtors = 10,
         present_residence_since = 11,
         property = 12,
         age = 13,
         other_installment_plans = 14,
         housing = 15,
         existing_credits = 16,
         job = 17,
         number_of_people_being_liable = 18,
         telephone = 19,
         foreign_worker = 20,
         default = 21) %>%
  mutate(
    checking_account = factor(checking_account, 
                              ordered = TRUE,
                              levels = sort(unique(checking_account)),
                              labels = c("< 0 DM",
                                         "1 - 200 DM",
                                         ">= 200 DM",
                                         "Unknown"))) %>%
  mutate(
    savings_account = factor(savings_account,
                             ordered = TRUE,
                             levels = sort(unique(savings_account)),
                             labels = c("< 100 DM",
                                        "100 - 500 DM",
                                        "500 - 1000 DM",
                                        ">= 1000 DM",
                                        "Unknown"))) %>%
  mutate(
    present_employment_since = factor(present_employment_since,
                                      ordered = TRUE,
                                      levels = sort(unique(present_employment_since)),
                                      labels = c("Unemployed",
                                                 "< 1 year",
                                                 "1 - 4 years",
                                                 "4 - 7 years",
                                                 ">= 7 years"))) %>%
  mutate(
    default = factor(default, labels = c("no", "yes"))
  )

checking_balanceとsavings_balanceの分布を確認します．

print(table(credit$checking_account))
## 
##     < 0 DM 1 - 200 DM  >= 200 DM    Unknown 
##        274        269         63        394

print(table(credit$savings_account))
## 
##      < 100 DM  100 - 500 DM 500 - 1000 DM    >= 1000 DM       Unknown 
##           603           103            63            48           183

months_loan_durationとamountの集計を確認します．

print(summary(credit$months_loan_duaration))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    12.0    18.0    20.9    24.0    72.0

print(summary(credit$amount))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

目的変数となるdefaultの分布を確認しておきます．

print(table(credit$default))
## 
##  no yes 
## 700 300

タスクを準備し，訓練データとテストデータに分割します．学習器はclassif.C50を使います．

set.seed(123)
task <- as_task_classif(credit,
                        target = "default")
learner_C50 <- lrn("classif.C50")
train_set <- sample(task$nrow, task$nrow * 0.9)
test_set <- setdiff(seq_len(task$nrow), train_set)

それぞれのデータセットの債務不履行の割合を確認します．

print(prop.table(table(credit[train_set,]$default)))
## 
##        no       yes 
## 0.7055556 0.2944444

print(prop.table(table(credit[test_set,]$default)))
## 
##   no  yes 
## 0.65 0.35

モデルの訓練を行います．

learner_C50$train(task = task, row_ids = train_set)

訓練結果を確認してみます．

print(learner_C50$model)
## 
## Call:
## C50::C5.0.formula(formula = f, data = data, control = c5control)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 58 
## 
## Non-standard options: attempt to group attributes

実際の木を見るためにsummary()を使います．

summary(learner_C50$model)
## 
## Call:
## C50::C5.0.formula(formula = f, data = data, control = c5control)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue Aug 24 16:22:30 2021
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_account in [>= 200 DM-Unknown]:
## :...other_installment_plans = A143:
## :   :...present_employment_since in [1 - 4 years->= 7 years]: no (283/20)
## :   :   present_employment_since in [Unemployed-< 1 year]:
## :   :   :...other_debtors = A103: no (0)
## :   :       other_debtors = A102: yes (3)
## :   :       other_debtors = A101:
## :   :       :...amount <= 7119: no (51/7)
## :   :           amount > 7119: yes (3)
## :   other_installment_plans in {A141,A142}:
## :   :...purpose in {A43,A46,A42,A41,A44,A45,A410,A48}: no (46/8)
## :       purpose = A49:
## :       :...present_employment_since in [Unemployed-1 - 4 years]: yes (7/1)
## :       :   present_employment_since in [4 - 7 years->= 7 years]: no (4)
## :       purpose = A40:
## :       :...months_loan_duaration <= 15: no (5)
## :           months_loan_duaration > 15:
## :           :...job in {A173,A172,A171}: yes (6)
## :               job = A174: no (4/1)
## checking_account in [< 0 DM-1 - 200 DM]:
## :...credit_history in {A30,A31}:
##     :...housing = A151: yes (16/1)
##     :   housing = A153:
##     :   :...other_debtors in {A101,A103}: yes (12/1)
##     :   :   other_debtors = A102: no (2)
##     :   housing = A152:
##     :   :...purpose in {A43,A46,A44,A45,A410}: yes (6/1)
##     :       purpose in {A41,A49,A48}: no (10/2)
##     :       purpose = A42:
##     :       :...other_installment_plans in {A143,A142}: no (4)
##     :       :   other_installment_plans = A141: yes (5/1)
##     :       purpose = A40:
##     :       :...months_loan_duaration <= 22: yes (6)
##     :           months_loan_duaration > 22: no (2)
##     credit_history in {A34,A32,A33}:
##     :...months_loan_duaration <= 15: no (180/45)
##         months_loan_duaration > 15:
##         :...savings_account in [>= 1000 DM-Unknown]:
##             :...credit_history in {A34,A33}: no (14)
##             :   credit_history = A32:
##             :   :...purpose in {A46,A49,A44,A45,A410,A48}: no (5)
##             :       purpose = A40: yes (7/1)
##             :       purpose = A43:
##             :       :...amount <= 6110: no (5)
##             :       :   amount > 6110: yes (2)
##             :       purpose = A42:
##             :       :...age <= 27: yes (2)
##             :       :   age > 27: no (5)
##             :       purpose = A41:
##             :       :...amount <= 6967: no (4)
##             :           amount > 6967: yes (2)
##             savings_account in [< 100 DM-500 - 1000 DM]:
##             :...months_loan_duaration > 47: yes (23/3)
##                 months_loan_duaration <= 47:
##                 :...present_employment_since in [Unemployed-1 - 4 years]:
##                     :...purpose in {A46,A44,A48}: yes (7/1)
##                     :   purpose in {A49,A410}: no (8/2)
##                     :   purpose = A42:
##                     :   :...property in {A121,A123}: yes (12/1)
##                     :   :   property in {A122,A124}: no (14/4)
##                     :   purpose = A41:
##                     :   :...amount <= 8086: no (8/1)
##                     :   :   amount > 8086: yes (3)
##                     :   purpose = A45:
##                     :   :...property in {A121,A122,A124}: yes (3)
##                     :   :   property = A123: no (2)
##                     :   purpose = A40:
##                     :   :...savings_account in [100 - 500 DM-500 - 1000 DM]: no (2)
##                     :   :   savings_account = < 100 DM: [S1]
##                     :   purpose = A43:
##                     :   :...existing_credits > 1: yes (7)
##                     :       existing_credits <= 1:
##                     :       :...number_of_people_being_liable > 1: yes (2)
##                     :           number_of_people_being_liable <= 1:
##                     :           :...job in {A174,A171}: no (4)
##                     :               job in {A173,A172}:
##                     :               :...telephone = A192: yes (5)
##                     :                   telephone = A191: [S2]
##                     present_employment_since in [4 - 7 years->= 7 years]:
##                     :...personal_status_and_sex = A91: yes (4)
##                         personal_status_and_sex in {A93,A92,A94}:
##                         :...savings_account in [100 - 500 DM-500 - 1000 DM]: no (12/1)
##                             savings_account = < 100 DM:
##                             :...job in {A172,A171}: no (9/1)
##                                 job = A174:
##                                 :...amount <= 11590: no (10/1)
##                                 :   amount > 11590: yes (2)
##                                 job = A173: [S3]
## 
## SubTree [S1]
## 
## installment_rate_in_percentage_of_disposable_income <= 2: no (5/1)
## installment_rate_in_percentage_of_disposable_income > 2: yes (13/1)
## 
## SubTree [S2]
## 
## installment_rate_in_percentage_of_disposable_income <= 3: no (5/1)
## installment_rate_in_percentage_of_disposable_income > 3: yes (11/2)
## 
## SubTree [S3]
## 
## other_installment_plans in {A141,A142}: yes (7)
## other_installment_plans = A143:
## :...present_employment_since = >= 7 years: no (9/1)
##     present_employment_since = 4 - 7 years:
##     :...number_of_people_being_liable > 1: no (2)
##         number_of_people_being_liable <= 1:
##         :...months_loan_duaration <= 22: no (3)
##             months_loan_duaration > 22: yes (7)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      57  110(12.2%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     621    14    (a): class no
##      96   169    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% checking_account
##   58.56% present_employment_since
##   54.22% credit_history
##   49.89% other_installment_plans
##   49.78% months_loan_duaration
##   27.56% purpose
##   27.22% savings_account
##   10.00% amount
##    9.33% job
##    7.89% other_debtors
##    7.22% personal_status_and_sex
##    7.00% housing
##    4.33% number_of_people_being_liable
##    3.78% existing_credits
##    3.78% installment_rate_in_percentage_of_disposable_income
##    3.44% property
##    2.33% telephone
##    0.78% age
## 
## 
## Time: 0.0 secs

決定木の訓練が終わったので，テストデータで予測してみます．

credit_pred <- learner_C50$predict(task, row_ids = test_set)
print(credit_pred)
## <PredictionClassif> for 100 observations:
##     row_ids truth response
##           1    no       no
##           3    no       no
##           7    no       no
## ---                       
##         993    no       no
##         995    no       no
##         997    no       no

混同行列を確認します．実際に債務不履行（yes）となった35件のうち，正しく予測できたのは14件でした．

print(credit_pred$confusion)
##         truth
## response no yes
##      no  58  21
##      yes  7  14

ブースティングによって決定木を改善していきます．
学習器を定義する際にtrials = 10を追加します．

learner_C50_10 <- lrn("classif.C50", trials = 10)
learner_C50_10$train(task = task, row_ids = train_set)
print(learner_C50_10$model)
## 
## Call:
## C50::C5.0.formula(formula = f, data = data, control = c5control, trials = 10L)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Number of boosting iterations: 10 
## Average tree size: 50 
## 
## Non-standard options: attempt to group attributes

summary()は長すぎるので省略して，テストデータで予測を行ってみます．

credit_10_pred <- learner_C50_10$predict(task, row_ids = test_set)
print(credit_10_pred)
## <PredictionClassif> for 100 observations:
##     row_ids truth response
##           1    no       no
##           3    no       no
##           7    no       no
## ---                       
##         993    no       no
##         995    no       no
##         997    no       no

混同行列は以下の通りです．債務不履行35件中18件を予測することができました．

print(credit_10_pred$confusion)
##         truth
## response no yes
##      no  55  17
##      yes 10  18

本来ならばこの後はコスト行列を指定して決定木を訓練するのですが，コスト行列をmlr3の学習器を通じて渡す方法がわからなかったためいったん保留とします．

5.4 実例 --- 分類ルール学習器を使った毒キノコの判定

データをダウンロードします．

url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushrooms <- read_csv(url,
                      col_names = FALSE,
                      col_types = "fffffffffffffffffffffff") %>%
  select(type = 1,
         cap_shape = 2,
         cap_surface = 3,
         cap_color = 4,
         bruises = 5,
         odor = 6,
         gill_attachment = 7,
         gill_spacing = 8,
         gill_size = 9,
         gill_color = 10,
         stalk_shape = 11,
         stalk_root = 12,
         stalk_surface_above_ring = 13,
         stalk_surface_below_ring = 14,
         stalk_color_above_ring = 15,
         stalk_color_below_ring = 16,
         veil_type = 17,
         veil_color = 18,
         ring_number = 19,
         ring_type = 20,
         spore_print_color = 21,
         population = 22,
         habitat = 23) %>%
  mutate(odor = factor(odor, 
                       levels = sort(levels(unique(odor))),
                       labels = c("almond",
                                  "creosote",
                                  "foul",
                                  "anise",
                                  "musty",
                                  "none",
                                  "pungent",
                                  "spicy",
                                  "sishy"))) %>%
  mutate(type = factor(type,
                       levels = sort(levels(unique(type))),
                       labels = c("edible", "poisonous")))

データフレームの中身を見てみます．

str(mushrooms)
## tibble [8,124 x 23] (S3: tbl_df/tbl/data.frame)
##  $ type                    : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
##  $ cap_shape               : Factor w/ 6 levels "x","b","s","f",..: 1 1 2 1 1 1 2 2 1 2 ...
##  $ cap_surface             : Factor w/ 4 levels "s","y","f","g": 1 1 1 2 1 2 1 2 2 1 ...
##  $ cap_color               : Factor w/ 10 levels "n","y","w","g",..: 1 2 3 3 4 2 3 3 3 2 ...
##  $ bruises                 : Factor w/ 2 levels "t","f": 1 1 1 1 2 1 1 1 1 1 ...
##  $ odor                    : Factor w/ 9 levels "almond","creosote",..: 7 1 4 7 6 1 1 4 7 1 ...
##  $ gill_attachment         : Factor w/ 2 levels "f","a": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gill_spacing            : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
##  $ gill_size               : Factor w/ 2 levels "n","b": 1 2 2 1 2 2 2 2 1 2 ...
##  $ gill_color              : Factor w/ 12 levels "k","n","g","p",..: 1 1 2 2 1 2 3 2 4 3 ...
##  $ stalk_shape             : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
##  $ stalk_root              : Factor w/ 5 levels "e","c","b","r",..: 1 2 2 1 1 2 2 2 1 2 ...
##  $ stalk_surface_above_ring: Factor w/ 4 levels "s","f","k","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stalk_surface_below_ring: Factor w/ 4 levels "s","f","y","k": 1 1 1 1 1 1 1 1 1 1 ...
##  $ stalk_color_above_ring  : Factor w/ 9 levels "w","g","p","n",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ stalk_color_below_ring  : Factor w/ 9 levels "w","p","g","b",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_type               : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
##  $ veil_color              : Factor w/ 4 levels "w","n","o","y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ring_number             : Factor w/ 3 levels "o","t","n": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ring_type               : Factor w/ 5 levels "p","e","l","f",..: 1 1 1 1 2 1 1 1 1 1 ...
##  $ spore_print_color       : Factor w/ 9 levels "k","n","u","h",..: 1 2 2 1 2 1 1 2 1 1 ...
##  $ population              : Factor w/ 6 levels "s","n","a","v",..: 1 2 2 1 3 2 2 1 4 1 ...
##  $ habitat                 : Factor w/ 7 levels "u","g","m","d",..: 1 2 3 1 2 2 3 3 2 3 ...

veil_typeが1レベルしか含まないため，これを削除しておきます．

mushrooms$veil_type <- NULL

typeの分布を確認します．

table(mushrooms$type)
## 
##    edible poisonous 
##      4208      3916

1R分類器で分類してみます．classif.OneRを使います．

task <- as_task_classif(mushrooms, target = "type")
learner_OneR <- lrn("classif.OneR")
learner_OneR$train(task)
learner_OneR$model
## odor:
##  almond  -> edible
##  creosote    -> poisonous
##  foul    -> poisonous
##  anise   -> edible
##  musty   -> poisonous
##  none    -> edible
##  pungent -> poisonous
##  spicy   -> poisonous
##  sishy   -> poisonous
## (8004/8124 instances correct)

残念ながらlearner_OneR$modelにsummary()は使えないようです．

summary(learner_OneR$model)
## Error: object of type 'closure' is not subsettable

次にRIPPERアルゴリズムによるclassif.JRipを使ってみます．

learner_JRip <- lrn("classif.JRip")
learner_JRip$train(task)
learner_JRip$model
## JRIP rules:
## ===========
## 
## (odor = foul) => type=poisonous (2160.0/0.0)
## (gill_size = n) and (gill_color = b) => type=poisonous (1152.0/0.0)
## (gill_size = n) and (odor = pungent) => type=poisonous (256.0/0.0)
## (odor = creosote) => type=poisonous (192.0/0.0)
## (spore_print_color = r) => type=poisonous (72.0/0.0)
## (stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=poisonous (68.0/0.0)
## (stalk_color_above_ring = y) => type=poisonous (8.0/0.0)
## (cap_surface = g) => type=poisonous (4.0/0.0)
## (habitat = l) and (bruises = t) => type=poisonous (4.0/0.0)
##  => type=edible (4208.0/0.0)
## 
## Number of Rules : 10

終わりに

mlr3経由だと本来使えるはずの機能が使えなかったり（見つけられなかったり）という点で苦労しますね．

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up