packageを使用しないでk-平均法を実装してみました。
平均10と-10の正規乱数のサンプルが3000個ずつ発生し、その重心となる三次元の2つのクラスター(10,10,10),(-10,-10,-10)を計算します。表示されるLはコスト関数で、実際に動かすと、徐々に収束します。
#k-mean method
library(dplyr)
n=1000;
x=c(rnorm(n,mean=2,sd=2),rnorm(n,mean=-2,sd=2))
cluster_num=1
C=sample(x,cluster_num)
ite=20
Data=data.frame(num=0,C=t(C),L=0)
for(k in 1:ite){
X=data.frame(x=x,c=0,L=0)
for(j in 1:length(x)){
x_one=x[j]
c_x=data.frame(num=1:length(C),c=C) %>% dplyr::mutate(L=(x_one-C)^2)
X$c[j]=c_x$num[c_x$L==min(c_x$L)]
X$L[j]=min(c_x$L)
}
XX=X %>% group_by(c) %>% summarise(Center_of_gravity=mean(x),L=sum(L))
C=XX$Center_of_gravity
data=data.frame(num=k,C=t(C),L=sum(XX$L))
Data=rbind(Data,data)
print(paste0("C=",C))
print(paste0("L=",sum(XX$L)))
}
mean_sd_data=X %>% group_by(c) %>% summarise(ave=mean(x),sd=sd(x),n=n())
#k-mean method(multi dimension)
library(dplyr)
n=3000;dim=3
x=rbind(matrix(rnorm(n*dim,mean=10,sd=10),ncol=dim,nrow=n),matrix(rnorm(n*dim,mean=-10,sd=10),ncol=dim,nrow=n))
cluster_num=2
C=x[sample(1:n,cluster_num),]
Data=data.frame(num=rep(0,cluster_num))
for(j in 1:dim){
C_D=data.frame(C=C[,j])
colnames(C_D)=c(as.character(paste0("c.",j)))
Data=cbind(Data,C_D)
}
Data=Data %>% mutate(L=0)
ite=50
for(k in 1:ite){
X=data.frame(x=x,c=0,L=0)
for(j in 1:nrow(X)){
x_one=x[j,]
c_x=data.frame(num=1:nrow(C),c=C,L=0)
for(l in 1:nrow(C)){
c_x$L[l]=sum((x_one-C[l,])^2)
}
X$c[j]=c_x$num[c_x$L==min(c_x$L)]
X$L[j]=min(c_x$L)
}
XX=data.frame(c=1:cluster_num)
for(j in 1:dim){
x_c=data.frame(c=X$c,x=X[,j])
x_c=x_c %>% group_by(c) %>% summarise(center_of_gravity=mean(x))
colnames(x_c)=c("c",as.character(paste0("c.",j)))
XX=left_join(XX,x_c,by="c")
}
x_c=X %>% group_by(c) %>% summarise(L=sum(L))
XX=left_join(XX,x_c,by="c")
C=XX[,2:(1+ncol(C))]
data_c=data.frame(num=1:cluster_num)
for(j in 1:dim){
C_data=data.frame(num=1:cluster_num,c=C[,j])
colnames(C_data)=c("num",as.character(paste0("c.",j)))
data_c=left_join(data_c,C_data,by="num")
}
data=data_c %>% mutate(L=sum(XX$L))
data$num=k
Data=rbind(Data,data)
print(paste0("C=",C))
print(paste0("L=",sum(XX$L)))
}
library(scatterplot3d)
if(dim==3 & cluster_num==2){
c_1=X %>% dplyr::filter(c==1)
c_2=X %>% dplyr::filter(c!=1)
z1<-c_1$x.3;y1<-c_1$x.2;x1<-c_1$x.1
z2<-c_2$x.3;y2<-c_2$x.2;x2<-c_2$x.1
scatterplot3d(x1, y1, z1, col.axis="blue",col.grid="lightblue", main="k-means;c1_red,c2_green", pch=20,xlim=c(min(x1,x2),max(x1,x2)),ylim=c(min(y1,y2),max(y1,y2)),zlim=c(min(z1,z2)-10,max(z1,z2)+10),color=2,xlab="X",ylab="Y",zlab="Z")
par(new=T)
scatterplot3d(x2, y2, z2, col.axis="blue",col.grid="lightblue", main="k-means;c1_red,c2_green", pch=20,xlim=c(min(x1,x2),max(x1,x2)),ylim=c(min(y1,y2),max(y1,y2)),zlim=c(min(z1,z2)-10,max(z1,z2)+10),color=3,xlab="X",ylab="Y",zlab="Z")
}