準備
必要なパッケージをインストールしておきます.
remotes::install_github("mlr-org/mlr3extralearners")
install.packages("C50")
install.packages("RWeka")
5.2 実例 --- C5.0決定木を使った銀行の信用評価モデル
データのダウンロードをします.
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
credit <- read_delim(url,
delim = " ",
col_names = FALSE,
col_types = "cdffdccdffdfdffdfdffi") %>%
select(checking_account = 1,
months_loan_duaration = 2,
credit_history = 3,
purpose = 4,
amount = 5,
savings_account = 6,
present_employment_since = 7,
installment_rate_in_percentage_of_disposable_income = 8,
personal_status_and_sex = 9,
other_debtors = 10,
present_residence_since = 11,
property = 12,
age = 13,
other_installment_plans = 14,
housing = 15,
existing_credits = 16,
job = 17,
number_of_people_being_liable = 18,
telephone = 19,
foreign_worker = 20,
default = 21) %>%
mutate(
checking_account = factor(checking_account,
ordered = TRUE,
levels = sort(unique(checking_account)),
labels = c("< 0 DM",
"1 - 200 DM",
">= 200 DM",
"Unknown"))) %>%
mutate(
savings_account = factor(savings_account,
ordered = TRUE,
levels = sort(unique(savings_account)),
labels = c("< 100 DM",
"100 - 500 DM",
"500 - 1000 DM",
">= 1000 DM",
"Unknown"))) %>%
mutate(
present_employment_since = factor(present_employment_since,
ordered = TRUE,
levels = sort(unique(present_employment_since)),
labels = c("Unemployed",
"< 1 year",
"1 - 4 years",
"4 - 7 years",
">= 7 years"))) %>%
mutate(
default = factor(default, labels = c("no", "yes"))
)
checking_balance
とsavings_balance
の分布を確認します.
print(table(credit$checking_account))
##
## < 0 DM 1 - 200 DM >= 200 DM Unknown
## 274 269 63 394
print(table(credit$savings_account))
##
## < 100 DM 100 - 500 DM 500 - 1000 DM >= 1000 DM Unknown
## 603 103 63 48 183
months_loan_duration
とamount
の集計を確認します.
print(summary(credit$months_loan_duaration))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 12.0 18.0 20.9 24.0 72.0
print(summary(credit$amount))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
目的変数となるdefault
の分布を確認しておきます.
print(table(credit$default))
##
## no yes
## 700 300
タスクを準備し,訓練データとテストデータに分割します.学習器はclassif.C50
を使います.
set.seed(123)
task <- as_task_classif(credit,
target = "default")
learner_C50 <- lrn("classif.C50")
train_set <- sample(task$nrow, task$nrow * 0.9)
test_set <- setdiff(seq_len(task$nrow), train_set)
それぞれのデータセットの債務不履行の割合を確認します.
print(prop.table(table(credit[train_set,]$default)))
##
## no yes
## 0.7055556 0.2944444
print(prop.table(table(credit[test_set,]$default)))
##
## no yes
## 0.65 0.35
モデルの訓練を行います.
learner_C50$train(task = task, row_ids = train_set)
訓練結果を確認してみます.
print(learner_C50$model)
##
## Call:
## C50::C5.0.formula(formula = f, data = data, control = c5control)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Tree size: 58
##
## Non-standard options: attempt to group attributes
実際の木を見るためにsummary()
を使います.
summary(learner_C50$model)
##
## Call:
## C50::C5.0.formula(formula = f, data = data, control = c5control)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Aug 24 16:22:30 2021
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## checking_account in [>= 200 DM-Unknown]:
## :...other_installment_plans = A143:
## : :...present_employment_since in [1 - 4 years->= 7 years]: no (283/20)
## : : present_employment_since in [Unemployed-< 1 year]:
## : : :...other_debtors = A103: no (0)
## : : other_debtors = A102: yes (3)
## : : other_debtors = A101:
## : : :...amount <= 7119: no (51/7)
## : : amount > 7119: yes (3)
## : other_installment_plans in {A141,A142}:
## : :...purpose in {A43,A46,A42,A41,A44,A45,A410,A48}: no (46/8)
## : purpose = A49:
## : :...present_employment_since in [Unemployed-1 - 4 years]: yes (7/1)
## : : present_employment_since in [4 - 7 years->= 7 years]: no (4)
## : purpose = A40:
## : :...months_loan_duaration <= 15: no (5)
## : months_loan_duaration > 15:
## : :...job in {A173,A172,A171}: yes (6)
## : job = A174: no (4/1)
## checking_account in [< 0 DM-1 - 200 DM]:
## :...credit_history in {A30,A31}:
## :...housing = A151: yes (16/1)
## : housing = A153:
## : :...other_debtors in {A101,A103}: yes (12/1)
## : : other_debtors = A102: no (2)
## : housing = A152:
## : :...purpose in {A43,A46,A44,A45,A410}: yes (6/1)
## : purpose in {A41,A49,A48}: no (10/2)
## : purpose = A42:
## : :...other_installment_plans in {A143,A142}: no (4)
## : : other_installment_plans = A141: yes (5/1)
## : purpose = A40:
## : :...months_loan_duaration <= 22: yes (6)
## : months_loan_duaration > 22: no (2)
## credit_history in {A34,A32,A33}:
## :...months_loan_duaration <= 15: no (180/45)
## months_loan_duaration > 15:
## :...savings_account in [>= 1000 DM-Unknown]:
## :...credit_history in {A34,A33}: no (14)
## : credit_history = A32:
## : :...purpose in {A46,A49,A44,A45,A410,A48}: no (5)
## : purpose = A40: yes (7/1)
## : purpose = A43:
## : :...amount <= 6110: no (5)
## : : amount > 6110: yes (2)
## : purpose = A42:
## : :...age <= 27: yes (2)
## : : age > 27: no (5)
## : purpose = A41:
## : :...amount <= 6967: no (4)
## : amount > 6967: yes (2)
## savings_account in [< 100 DM-500 - 1000 DM]:
## :...months_loan_duaration > 47: yes (23/3)
## months_loan_duaration <= 47:
## :...present_employment_since in [Unemployed-1 - 4 years]:
## :...purpose in {A46,A44,A48}: yes (7/1)
## : purpose in {A49,A410}: no (8/2)
## : purpose = A42:
## : :...property in {A121,A123}: yes (12/1)
## : : property in {A122,A124}: no (14/4)
## : purpose = A41:
## : :...amount <= 8086: no (8/1)
## : : amount > 8086: yes (3)
## : purpose = A45:
## : :...property in {A121,A122,A124}: yes (3)
## : : property = A123: no (2)
## : purpose = A40:
## : :...savings_account in [100 - 500 DM-500 - 1000 DM]: no (2)
## : : savings_account = < 100 DM: [S1]
## : purpose = A43:
## : :...existing_credits > 1: yes (7)
## : existing_credits <= 1:
## : :...number_of_people_being_liable > 1: yes (2)
## : number_of_people_being_liable <= 1:
## : :...job in {A174,A171}: no (4)
## : job in {A173,A172}:
## : :...telephone = A192: yes (5)
## : telephone = A191: [S2]
## present_employment_since in [4 - 7 years->= 7 years]:
## :...personal_status_and_sex = A91: yes (4)
## personal_status_and_sex in {A93,A92,A94}:
## :...savings_account in [100 - 500 DM-500 - 1000 DM]: no (12/1)
## savings_account = < 100 DM:
## :...job in {A172,A171}: no (9/1)
## job = A174:
## :...amount <= 11590: no (10/1)
## : amount > 11590: yes (2)
## job = A173: [S3]
##
## SubTree [S1]
##
## installment_rate_in_percentage_of_disposable_income <= 2: no (5/1)
## installment_rate_in_percentage_of_disposable_income > 2: yes (13/1)
##
## SubTree [S2]
##
## installment_rate_in_percentage_of_disposable_income <= 3: no (5/1)
## installment_rate_in_percentage_of_disposable_income > 3: yes (11/2)
##
## SubTree [S3]
##
## other_installment_plans in {A141,A142}: yes (7)
## other_installment_plans = A143:
## :...present_employment_since = >= 7 years: no (9/1)
## present_employment_since = 4 - 7 years:
## :...number_of_people_being_liable > 1: no (2)
## number_of_people_being_liable <= 1:
## :...months_loan_duaration <= 22: no (3)
## months_loan_duaration > 22: yes (7)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 57 110(12.2%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 621 14 (a): class no
## 96 169 (b): class yes
##
##
## Attribute usage:
##
## 100.00% checking_account
## 58.56% present_employment_since
## 54.22% credit_history
## 49.89% other_installment_plans
## 49.78% months_loan_duaration
## 27.56% purpose
## 27.22% savings_account
## 10.00% amount
## 9.33% job
## 7.89% other_debtors
## 7.22% personal_status_and_sex
## 7.00% housing
## 4.33% number_of_people_being_liable
## 3.78% existing_credits
## 3.78% installment_rate_in_percentage_of_disposable_income
## 3.44% property
## 2.33% telephone
## 0.78% age
##
##
## Time: 0.0 secs
決定木の訓練が終わったので,テストデータで予測してみます.
credit_pred <- learner_C50$predict(task, row_ids = test_set)
print(credit_pred)
## <PredictionClassif> for 100 observations:
## row_ids truth response
## 1 no no
## 3 no no
## 7 no no
## ---
## 993 no no
## 995 no no
## 997 no no
混同行列を確認します.実際に債務不履行(yes
)となった35件のうち,正しく予測できたのは14件でした.
print(credit_pred$confusion)
## truth
## response no yes
## no 58 21
## yes 7 14
ブースティングによって決定木を改善していきます.
学習器を定義する際にtrials = 10
を追加します.
learner_C50_10 <- lrn("classif.C50", trials = 10)
learner_C50_10$train(task = task, row_ids = train_set)
print(learner_C50_10$model)
##
## Call:
## C50::C5.0.formula(formula = f, data = data, control = c5control, trials = 10L)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 20
##
## Number of boosting iterations: 10
## Average tree size: 50
##
## Non-standard options: attempt to group attributes
summary()
は長すぎるので省略して,テストデータで予測を行ってみます.
credit_10_pred <- learner_C50_10$predict(task, row_ids = test_set)
print(credit_10_pred)
## <PredictionClassif> for 100 observations:
## row_ids truth response
## 1 no no
## 3 no no
## 7 no no
## ---
## 993 no no
## 995 no no
## 997 no no
混同行列は以下の通りです.債務不履行35件中18件を予測することができました.
print(credit_10_pred$confusion)
## truth
## response no yes
## no 55 17
## yes 10 18
本来ならばこの後はコスト行列を指定して決定木を訓練するのですが,コスト行列をmlr3
の学習器を通じて渡す方法がわからなかったためいったん保留とします.
5.4 実例 --- 分類ルール学習器を使った毒キノコの判定
データをダウンロードします.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
mushrooms <- read_csv(url,
col_names = FALSE,
col_types = "fffffffffffffffffffffff") %>%
select(type = 1,
cap_shape = 2,
cap_surface = 3,
cap_color = 4,
bruises = 5,
odor = 6,
gill_attachment = 7,
gill_spacing = 8,
gill_size = 9,
gill_color = 10,
stalk_shape = 11,
stalk_root = 12,
stalk_surface_above_ring = 13,
stalk_surface_below_ring = 14,
stalk_color_above_ring = 15,
stalk_color_below_ring = 16,
veil_type = 17,
veil_color = 18,
ring_number = 19,
ring_type = 20,
spore_print_color = 21,
population = 22,
habitat = 23) %>%
mutate(odor = factor(odor,
levels = sort(levels(unique(odor))),
labels = c("almond",
"creosote",
"foul",
"anise",
"musty",
"none",
"pungent",
"spicy",
"sishy"))) %>%
mutate(type = factor(type,
levels = sort(levels(unique(type))),
labels = c("edible", "poisonous")))
データフレームの中身を見てみます.
str(mushrooms)
## tibble [8,124 x 23] (S3: tbl_df/tbl/data.frame)
## $ type : Factor w/ 2 levels "edible","poisonous": 2 1 1 2 1 1 1 1 2 1 ...
## $ cap_shape : Factor w/ 6 levels "x","b","s","f",..: 1 1 2 1 1 1 2 2 1 2 ...
## $ cap_surface : Factor w/ 4 levels "s","y","f","g": 1 1 1 2 1 2 1 2 2 1 ...
## $ cap_color : Factor w/ 10 levels "n","y","w","g",..: 1 2 3 3 4 2 3 3 3 2 ...
## $ bruises : Factor w/ 2 levels "t","f": 1 1 1 1 2 1 1 1 1 1 ...
## $ odor : Factor w/ 9 levels "almond","creosote",..: 7 1 4 7 6 1 1 4 7 1 ...
## $ gill_attachment : Factor w/ 2 levels "f","a": 1 1 1 1 1 1 1 1 1 1 ...
## $ gill_spacing : Factor w/ 2 levels "c","w": 1 1 1 1 2 1 1 1 1 1 ...
## $ gill_size : Factor w/ 2 levels "n","b": 1 2 2 1 2 2 2 2 1 2 ...
## $ gill_color : Factor w/ 12 levels "k","n","g","p",..: 1 1 2 2 1 2 3 2 4 3 ...
## $ stalk_shape : Factor w/ 2 levels "e","t": 1 1 1 1 2 1 1 1 1 1 ...
## $ stalk_root : Factor w/ 5 levels "e","c","b","r",..: 1 2 2 1 1 2 2 2 1 2 ...
## $ stalk_surface_above_ring: Factor w/ 4 levels "s","f","k","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ stalk_surface_below_ring: Factor w/ 4 levels "s","f","y","k": 1 1 1 1 1 1 1 1 1 1 ...
## $ stalk_color_above_ring : Factor w/ 9 levels "w","g","p","n",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ stalk_color_below_ring : Factor w/ 9 levels "w","p","g","b",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ veil_type : Factor w/ 1 level "p": 1 1 1 1 1 1 1 1 1 1 ...
## $ veil_color : Factor w/ 4 levels "w","n","o","y": 1 1 1 1 1 1 1 1 1 1 ...
## $ ring_number : Factor w/ 3 levels "o","t","n": 1 1 1 1 1 1 1 1 1 1 ...
## $ ring_type : Factor w/ 5 levels "p","e","l","f",..: 1 1 1 1 2 1 1 1 1 1 ...
## $ spore_print_color : Factor w/ 9 levels "k","n","u","h",..: 1 2 2 1 2 1 1 2 1 1 ...
## $ population : Factor w/ 6 levels "s","n","a","v",..: 1 2 2 1 3 2 2 1 4 1 ...
## $ habitat : Factor w/ 7 levels "u","g","m","d",..: 1 2 3 1 2 2 3 3 2 3 ...
veil_type
が1レベルしか含まないため,これを削除しておきます.
mushrooms$veil_type <- NULL
type
の分布を確認します.
table(mushrooms$type)
##
## edible poisonous
## 4208 3916
1R分類器で分類してみます.classif.OneR
を使います.
task <- as_task_classif(mushrooms, target = "type")
learner_OneR <- lrn("classif.OneR")
learner_OneR$train(task)
learner_OneR$model
## odor:
## almond -> edible
## creosote -> poisonous
## foul -> poisonous
## anise -> edible
## musty -> poisonous
## none -> edible
## pungent -> poisonous
## spicy -> poisonous
## sishy -> poisonous
## (8004/8124 instances correct)
残念ながらlearner_OneR$model
にsummary()
は使えないようです.
summary(learner_OneR$model)
## Error: object of type 'closure' is not subsettable
次にRIPPERアルゴリズムによるclassif.JRip
を使ってみます.
learner_JRip <- lrn("classif.JRip")
learner_JRip$train(task)
learner_JRip$model
## JRIP rules:
## ===========
##
## (odor = foul) => type=poisonous (2160.0/0.0)
## (gill_size = n) and (gill_color = b) => type=poisonous (1152.0/0.0)
## (gill_size = n) and (odor = pungent) => type=poisonous (256.0/0.0)
## (odor = creosote) => type=poisonous (192.0/0.0)
## (spore_print_color = r) => type=poisonous (72.0/0.0)
## (stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=poisonous (68.0/0.0)
## (stalk_color_above_ring = y) => type=poisonous (8.0/0.0)
## (cap_surface = g) => type=poisonous (4.0/0.0)
## (habitat = l) and (bruises = t) => type=poisonous (4.0/0.0)
## => type=edible (4208.0/0.0)
##
## Number of Rules : 10
終わりに
mlr3
経由だと本来使えるはずの機能が使えなかったり(見つけられなかったり)という点で苦労しますね.