life<-read.table("/Users/yasu/Desktop/maFall2009/Data/life.txt", header=TRUE) # # # life.mds<-cmdscale(as.matrix(dist(life)),k=5,eig=T) life.mds$eig # # Look if two dimensions are appropriate # sum(abs(life.mds$eig[1:2]))/sum(abs(life.mds$eig)) sum(life.mds$eig[1:2]^2)/sum(life.mds$eig^2) # # You may need to flip to obtain desired image # par(pty="s") plot(life.mds$points[,1],life.mds$points[,2],type="n",xlab="Coordinate 1",ylab="Coordinate 2", xlim=c(-50,30),ylim=c(-20,10)) text(life.mds$points[,1],life.mds$points[,2],labels=row.names(life)) # # # hierarchical clustering # # a series of partitioning step from a single cluster containing all members # to n clusters each containing a single individuals # # start with C_n each containing a single individual # find the nearest pair of distinct clusters, merge them, decrease the number of cluster by 1 # if the number of cluster becomes 1, stop, else continue merging # par(mfrow=c(1,3)) # single linkage = d_AB = min(d_ij) plclust(hclust(dist(life),method="single"),labels=row.names(life),ylab="Distance") title("(a) Single linkage") # complete linkage = d_AB = max(d_ij) plclust(hclust(dist(life),method="complete"),labels=row.names(life),ylab="Distance") title("(b) Complete linkage") # average linkage = d_AB = d_ij/(n_i*n_j) plclust(hclust(dist(life),method="average"),labels=row.names(life),ylab="Distance") title("(c) Average linkage") # five<-cutree(hclust(dist(life),method="complete"),h=21) # # country.clus<-lapply(1:5,function(nc) row.names(life)[five==nc]) country.mean<-lapply(1:5,function(nc) apply(life[five==nc,],2,mean)) country.mean country.clus # dev.off() pairs(life,panel=function(x,y) text(x,y,five)) # # # # # # K-means clustering # # Find some initial partitioning # Calculate the change in clustering criterion # (e.g., minimize within cluster sum of squares) # when each individual is moved to another cluster # Make the change that leads to the greatest improvement # in clustering criterion # Repeat until no move of an individual leads to an # improvement # # minimize within cluster sum of squares # n<-length(life[,1]) wss1<-(n-1)*sum(apply(life,2,var)) wss<-numeric(0) for(i in 2:6) { W<-sum(kmeans(life,i)$withinss) wss<-c(wss,W) } # wss<-c(wss1,wss) # # plot wss to see how many clusters # plot(1:6,wss,type="l",xlab="Number of groups",ylab="Within groups sum of squares",lwd=2) # life.kmean<-kmeans(life,3) life.kmean lapply(1:3,function(nc) row.names(life)[life.kmean$cluster==nc]) lapply(1:3,function(nc) apply(life[life.kmean$cluster==nc,],2,mean)) # # # # # Model-based clustering # # classification # # determine min and max number of clusters to consider # determine parameter candidates for the model # do expectation maximization (EM) for each parameter set and each number of clusters # compute bayesian information criterion (BIC) to decide model # library(mclust) life.clus<-Mclust(life) # lapply(1:2,function(nc) row.names(life)[life.clus$classification==nc]) lapply(1:2,function(nc) apply(life[life.clus$classification==nc,],2,mean)) life.clus$parameters$mean # plot(life.clus, life, what=c("BIC", "classification")) # # # food preference and attitude survey # temp <- scan("/Users/yasu/Desktop/ma/Data/data.txt", sep="\t", what = list('',0,'','')) name <- temp[[1]] complete <- table(name) sort(complete) item <- temp[[3]] resp <- temp[[4]] eggplant.perception <- tapply(as.numeric(resp[item=='eggplant.perception']), name[item=='eggplant.perception'], mean) eggplant.preference <- tapply(as.numeric(resp[item=='eggplant.preference']), name[item=='eggplant.preference'], mean)