LoginSignup
2
0

More than 3 years have passed since last update.

k-平均法

Last updated at Posted at 2018-01-23

packageを使用しないでk-平均法を実装してみました。
平均10と-10の正規乱数のサンプルが3000個ずつ発生し、その重心となる三次元の2つのクラスター(10,10,10),(-10,-10,-10)を計算します。表示されるLはコスト関数で、実際に動かすと、徐々に収束します。


#k-mean method

library(dplyr)

n=1000;
x=c(rnorm(n,mean=2,sd=2),rnorm(n,mean=-2,sd=2))

cluster_num=1

C=sample(x,cluster_num)

ite=20

Data=data.frame(num=0,C=t(C),L=0)
for(k in 1:ite){

 X=data.frame(x=x,c=0,L=0)

for(j in 1:length(x)){
  x_one=x[j]
  c_x=data.frame(num=1:length(C),c=C) %>% dplyr::mutate(L=(x_one-C)^2)
  X$c[j]=c_x$num[c_x$L==min(c_x$L)]
  X$L[j]=min(c_x$L)
}

XX=X %>% group_by(c) %>% summarise(Center_of_gravity=mean(x),L=sum(L)) 

C=XX$Center_of_gravity

data=data.frame(num=k,C=t(C),L=sum(XX$L))
Data=rbind(Data,data)
print(paste0("C=",C))
print(paste0("L=",sum(XX$L)))
}

mean_sd_data=X %>% group_by(c) %>% summarise(ave=mean(x),sd=sd(x),n=n())



#k-mean method(multi dimension)

library(dplyr)

n=3000;dim=3

x=rbind(matrix(rnorm(n*dim,mean=10,sd=10),ncol=dim,nrow=n),matrix(rnorm(n*dim,mean=-10,sd=10),ncol=dim,nrow=n))

cluster_num=2

C=x[sample(1:n,cluster_num),]

Data=data.frame(num=rep(0,cluster_num))

for(j in 1:dim){

  C_D=data.frame(C=C[,j])

  colnames(C_D)=c(as.character(paste0("c.",j)))

  Data=cbind(Data,C_D)

}

Data=Data %>% mutate(L=0)


ite=50

for(k in 1:ite){

 X=data.frame(x=x,c=0,L=0)

for(j in 1:nrow(X)){

  x_one=x[j,]

  c_x=data.frame(num=1:nrow(C),c=C,L=0) 

  for(l in 1:nrow(C)){

  c_x$L[l]=sum((x_one-C[l,])^2)

  }

  X$c[j]=c_x$num[c_x$L==min(c_x$L)]

  X$L[j]=min(c_x$L)

}


XX=data.frame(c=1:cluster_num)

for(j in 1:dim){

  x_c=data.frame(c=X$c,x=X[,j])

  x_c=x_c %>% group_by(c) %>% summarise(center_of_gravity=mean(x))

  colnames(x_c)=c("c",as.character(paste0("c.",j)))

  XX=left_join(XX,x_c,by="c")

} 

x_c=X %>% group_by(c) %>% summarise(L=sum(L))

XX=left_join(XX,x_c,by="c")

C=XX[,2:(1+ncol(C))]

data_c=data.frame(num=1:cluster_num)

for(j in 1:dim){

  C_data=data.frame(num=1:cluster_num,c=C[,j])

  colnames(C_data)=c("num",as.character(paste0("c.",j)))

  data_c=left_join(data_c,C_data,by="num")

}

data=data_c %>% mutate(L=sum(XX$L))

data$num=k

Data=rbind(Data,data)

print(paste0("C=",C))

print(paste0("L=",sum(XX$L)))

}

library(scatterplot3d)

if(dim==3 & cluster_num==2){
  c_1=X %>% dplyr::filter(c==1)
  c_2=X %>% dplyr::filter(c!=1)
  z1<-c_1$x.3;y1<-c_1$x.2;x1<-c_1$x.1
  z2<-c_2$x.3;y2<-c_2$x.2;x2<-c_2$x.1

  scatterplot3d(x1, y1, z1,  col.axis="blue",col.grid="lightblue", main="k-means;c1_red,c2_green", pch=20,xlim=c(min(x1,x2),max(x1,x2)),ylim=c(min(y1,y2),max(y1,y2)),zlim=c(min(z1,z2)-10,max(z1,z2)+10),color=2,xlab="X",ylab="Y",zlab="Z")

  par(new=T)

  scatterplot3d(x2, y2, z2,  col.axis="blue",col.grid="lightblue", main="k-means;c1_red,c2_green", pch=20,xlim=c(min(x1,x2),max(x1,x2)),ylim=c(min(y1,y2),max(y1,y2)),zlim=c(min(z1,z2)-10,max(z1,z2)+10),color=3,xlab="X",ylab="Y",zlab="Z")
}


2
0
4

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
0