More than 1 year has passed since last update.
クラスタリング結果を用いた、別データの再分類

Last updated at 2024-03-19Posted at 2024-01-30
R言語でのクラスタリングサンプルソースと
クラスタリング結果から別データを参照して再分類するソース
################################################################################
################################################################################
# クラスタリング サンプルソース
# クラスタリングの結果から別データを分類する
################################################################################
Sys.setlocale(locale="Japanese")
library(caret)
library(reshape)
library(dplyr)


################################################################################
#
# 1.ターゲットデータの抽出
#
################################################################################

targetData <-read.csv("target.csv")
nostddf<-targetData

################################################################################

# 2.クラスタリングの実施

################################################################################
nostddfdataonly <- nostddf
#数値クラスタリングに不要なカラムの除外
rownames(nostddfdataonly)<-nostddfdataonly$会員番号
nostddfdataonly<-nostddf[,-c("会員番号")]
rownames(nostddfdataonly)<-nostddf$会員番号

#情報量が”０”に近いカラムを削除
names(nostddfdataonly)
nzv <- nearZeroVar(nostddfdataonly)
nostddfdataonly <- nostddfdataonly[,-nzv]
names(nostddfdataonly)

#高相関のカラムを削除
highcorColumn<-findCorrelation(cor(nostddfdataonly),cutoff=0.8)
nostddfdataonly<-nostddfdataonly[,-highcorColumn]
names(nostddfdataonly)

prcompresult<-prcomp(nostddfdataonly,scale=T)
prcompresultDF<-data.frame(prcompresult$x)

set.seed(1)
data <- prcompresultDF

# クラスタ数の範囲を指定
k_values <- 1:10

# 各クラスタ数でkmeansを実行し、クラスタ内の平方和を取得
wss <- sapply(k_values, function(k) {
  kmeans(data, centers = k)$tot.withinss
})

# エルボープロットの作成
plot(k_values, wss, type = "b", pch = 19, frame = FALSE, 
     xlab = "Number of Clusters", ylab = "Within groups sum of squares")

# エルボー法で最適なクラスタ数を選択
# グラフを見て「肘」（曲線の急激な変化が緩やかになる箇所）の位置を特定

km <- kmeans(prcompresultDF,9)
table(km$cluster)
names(writeDF) <- c(names(nostddf),"cluster")

###################################
#分析結果をもとに別の母集団でグルーピングする場合のソース
#元の分析で作ったobjデータを指定する
saveRDS(prcompresult,"clustering.obj")
#これは元の分類をした際に保存しておいたobjを採用すること！！！
#別データについてはスケールする必要なし！
prcompresult <- readRDS("clustering.obj")
#ターゲットデータを元データと同じcenterとscaleでスケール化
scaleresult <- scale(nostddfdataonly,center = prcompresult$center, scale = prcompresult$scale)
#ターゲットデータを元データと同じrotationで主成分分析
compresult <- scaleresult %*%  prcompresult$rotation
compresult <- data.frame(compresult)

#各グループごとの平均値をとる
clusterData <- writeDF
prcompresultDF<-data.frame(prcompresult$x)
prcompresultDF$会員番号 <- row.names(prcompresultDF)

s <- merge(prcompresultDF,clusterData[,c("会員番号","cluster")],by.x ="会員番号",by.y = "会員番号" )

s <- s %>%
  group_by(cluster) %>%
  summarize(
    PC1 = mean(PC1),
    PC2 = mean(PC2),
    PC3 = mean(PC3),
    PC4 = mean(PC4),
    PC5 = mean(PC5)
  )
saveRDS(s,"ave_cluster.obj")
s<-s[-c(1)]


#ユークリッド距離で最も近いグループを取得する
result <- data.frame()
for (i in 1:nrow(compresult)) {
  targetData <- compresult[i,]
  # alldist <- dist(rbind(s[,c(-1)],targetData[,c(-1)]),method = "euclidean")
  alldist <- dist(rbind(s,targetData),method = "euclidean")
  distDF <- melt(as.matrix(alldist), varname=c("source", "target"))
  check<-distDF[nrow(distDF),]$source
  temp <- distDF[distDF$source==check,]
  temp <- temp[temp$target!=check,]
  #予測クラスタ
  targetResult <- cbind(dfCity[i,1]
                        ,temp[temp$value==min(temp$value),]$target)
  names(targetResult) <- c("gid","cluster")
  result <- rbind(result,targetResult)
  print(i)
}
names(result) <- c("会員番号","cluster")
writeDF2 <- merge(clusterData,result,by.x ="会員番号", by.y ="会員番号" )

write.csv(writeDF2, paste0("clusteredData_", format(Sys.time(), "%Y%m%d%H%M"), ".csv") ,row.names=FALSE)
You get articles that match your needs
You can efficiently read back useful information
You can use dark theme
What you can do with signing up