########################################################## # Principal Components Analysis ########################################################## # # A way of finding patterns in data of high dimensions # # You compress the data by reducing the number of dimensions # without much loss of information # # The main goal is to describe the variation in a set of # correlated variables in terms of a new set of uncorrelated # variables # # The new variables are derived in decreasing order of # importance - principal components # # The first component provides a measure that maximally # discriminates individual data points # # The hope is that the first few components will account for # a big portion of the variation in the original variables # and serve as a lower-dimensional summary # # PCA is useful when you want to develop an index to # summarize complex data - rank students by examination scores, # rank cities by cost of living # # PCA is also useful for visualization when there are too many # explanatory variables relative to the number of observations # and when the explanatory variables are highly correlated # ########################################################## # # Step 1: Subtract the mean # Step 2: Calculate the covariance matrix (or correlation) # Step 3: Calculate the eigenvectors and eigenvalues # Step 4: Choose components and form a feature vector # Step 5: Derive the new data # ########################################################### # # var(x) = sum((x - mean(x))^2)/(length(x) - 1) # var(x) = sum((x - mean(x))*(x - mean(x)))/(length(x) - 1) # cov(x, y) = sum((x - mean(x))*(y - mean(y)))/(length(x) - 1) # # Like correlation # positive cov values - both dimensions increase together # negative cov values - when one increases, the other decreases # cov = 0 - two dimensions are independent # # cor(x, y) = sum(((x - mean(x))/sd(x))*((y - mean(y))/sd(y)))/(length(x) - 1) # ########################################################## # # Covariance matrix # For 3 dimensions x, y, and z # [ cov(x, x) cov(x, y) cov(x, z) ] # C = [ cov(y, x) cov(y, y) cov(y, z) ] # [ cov(z, x) cov(z, y) cov(z, z) ] # # Diagonal -> cov between one dim and itself = variance # C is symmetrical -> cov(x, y) = cov(y, x) # ########################################################## # # Eigenvector and eigenvalue # # Ax = lx # # A = a matrix # x = eigenvector # l = eigenvalue # # [ a11 a12 ] [ x ] [ x ] # [ a21 a22 ] [ y ] = l [ y ] # # [ 2 3 ] [ 3 ] [ 12 ] [ 3 ] # [ 2 1 ] [ 2 ] = [ 8 ] = 4 [ 2 ] # # [ 2 3 ] [ 3 ] # [ 2 1 ] = A [ 2 ] = x 4 = l # ########################################################## # an example with simple data ########################################################## # dimx <- c(2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2, 1, 1.5, 1.1) dimy <- c(2.4, 0.7, 2.9, 2.2, 3, 2.7, 1.6, 1.1, 1.6, 0.9) # ########################################################## # subtract the mean ########################################################## # x <- dimx - mean(dimx) y <- dimy - mean(dimy) # ########################################################## # calculate cov ########################################################## # c <- cov(cbind(x, y)) # ########################################################## # calculate the eigenvectors and eigenvalues of c ########################################################## # eig <- eigen(c) # ########################################################## # choose components # - enough components to explain 70 to 90% of total variance # - exclude components with eigenvalues below average # - when using cor, components with eigenvalues less than 1 (or .7) # - Scree diagram, look for elbow # # and form a feature vector ########################################################## # v <- eig$vectors[,1] v2 <- eig$vectors[,2] # ########################################################## # derive the new data ########################################################## # z <- as.vector(v%*%rbind(x,y)) # ########################################################## # plot ########################################################## # par(mfcol = c(1, 2)) plot(x, y, type='n') text(x, y, labels=c(1:length(x))) abline(0, v[1]/v[2]) abline(0, v2[1]/v2[2]) plot(rep(0, length(x)), z, type='n', xaxt='n', xlab='') text(rep(0, length(x)), z, labels=c(1:length(x))) # ########################################################## # PCA analysis of Digg influential ########################################################## # influential <- read.table("/Users/yasu/Desktop/influential.txt", header=T, sep=',') influential <- influential[1:50,] inf <- influential[,c(-1,-2)] n.dims <- dim(inf) xbars <- colMeans(inf) data <- inf - matrix(rep(xbars, n.dims[1]), nrow=n.dims[1], byrow=T) c <- cov(data) eig <- eigen(c) sqrt(eig$values*(n.dims[1]-1)/(n.dims[1])) pca <- princomp(inf, cor=F) summary(pca, loadings=T) plot(pca, type='line') screeplot(pca, type='line') pca$scores summary(lm(influential$Frontpage.Stories ~ pca$scores[,1] + pca$scores[,2] + pca$scores[,3])) par(mfrow=c(1,3), cex=1.3) plot(pca$scores[,1], influential$Frontpage.Stories, xlab='PC1') abline(lsfit(pca$scores[,1], influential$Frontpage.Stories)$coef,lwd=2) lines(lowess(pca$scores[,1], influential$Frontpage.Stories),lty=2,lwd=2) text(x=mean(range(pca$scores[,1])), y=max(influential$Frontpage.Stories), labels = paste('r =', round(cor(pca$scores[,1], influential$Frontpage.Stories), 2)), cex=1.5, col='red', pos=1) plot(pca$scores[,2], influential$Frontpage.Stories, xlab='PC2') abline(lsfit(pca$scores[,2], influential$Frontpage.Stories)$coef,lwd=2) lines(lowess(pca$scores[,2], influential$Frontpage.Stories),lty=2,lwd=2) text(x=mean(range(pca$scores[,2])), y=max(influential$Frontpage.Stories), labels = paste('r =', round(cor(pca$scores[,2], influential$Frontpage.Stories), 2)), cex=1.5, col='red', pos=1) plot(pca$scores[,3], influential$Frontpage.Stories, xlab='PC3') abline(lsfit(pca$scores[,3], influential$Frontpage.Stories)$coef,lwd=2) lines(lowess(pca$scores[,3], influential$Frontpage.Stories),lty=2,lwd=2) text(x=mean(range(pca$scores[,3])), y=max(influential$Frontpage.Stories), labels = paste('r =', round(cor(pca$scores[,3], influential$Frontpage.Stories), 2)), cex=1.5, col='red', pos=1) c <- cor(data) eig <- eigen(c) sqrt(eig$values) pca <- princomp(inf, cor=T) summary(pca, loadings=T) plot(pca, type='line') screeplot(pca, type='line') par(mfrow=c(1,3)) plot(pca$scores[,1], influential$Frontpage.Stories, xlab='PC1') abline(lsfit(pca$scores[,1], influential$Frontpage.Stories)$coef,lwd=2) lines(lowess(pca$scores[,1], influential$Frontpage.Stories),lty=2,lwd=2) text(x=mean(range(pca$scores[,1])), y=max(influential$Frontpage.Stories), labels = paste('r =', round(cor(pca$scores[,1], influential$Frontpage.Stories), 2)), cex=1.5, col='red', pos=1) plot(pca$scores[,2], influential$Frontpage.Stories, xlab='PC2') abline(lsfit(pca$scores[,2], influential$Frontpage.Stories)$coef,lwd=2) lines(lowess(pca$scores[,2], influential$Frontpage.Stories),lty=2,lwd=2) text(x=mean(range(pca$scores[,2])), y=max(influential$Frontpage.Stories), labels = paste('r =', round(cor(pca$scores[,2], influential$Frontpage.Stories), 2)), cex=1.5, col='red', pos=1) plot(pca$scores[,3], influential$Frontpage.Stories, xlab='PC3') abline(lsfit(pca$scores[,3], influential$Frontpage.Stories)$coef,lwd=2) lines(lowess(pca$scores[,3], influential$Frontpage.Stories),lty=2,lwd=2) text(x=mean(range(pca$scores[,3])), y=max(influential$Frontpage.Stories), labels = paste('r =', round(cor(pca$scores[,3], influential$Frontpage.Stories), 2)), cex=1.5, col='red', pos=1) ########################################################## # PCA analysis of usair data ########################################################## # usair.data <- read.table("/Users/yasu/Desktop/ma/Data/usair.txt", header=TRUE) usair <- usair.data[,-1] attach(usair) # n.dims <- dim(usair) xbars <- colMeans(usair) data <- usair - matrix(rep(xbars, n.dims[1]), nrow=n.dims[1], byrow=T) c <- cov(data) eig <- eigen(c) z <- t(t(eig$vectors)%*%t(data)) # # Easier way pca <- princomp(usair, cor=F) summary(pca, loadings=T) plot(pca) # In the PCA summary # Loadings are the same as eigenvectors calculated above eig # Standard deviation is the same as square root of eigenvalues (variances) sqrt(eig$values*(n.dims[1]-1)/(n.dims[1])) # # Using cov, component 1 accounts for 98% of the variance! Not so fun z pca$scores # # # Use cor c <- cor(data) eig <- eigen(c) z <- t(t(eig$vectors)%*%t(data)) # pca <- princomp(usair, cor=T) summary(pca, loadings=T) plot(pca, type='lines') eig sqrt(eig$values) z t(t(eig$vectors/matrix(rep(pca$scale, n.dims[2]), nrow=n.dims[2], byrow=F))%*%t(data)) pca$scores # # Plot # par(mfcol=c(1,3), pty='s') plot(pca$scores[,1], pca$scores[,2], xlab='pc1', ylab='pc2', type='n') text(pca$scores[,1], pca$scores[,2], labels=abbreviate(row.names(usair))) plot(pca$scores[,1], pca$scores[,3], xlab='pc1', ylab='pc3', type='n') text(pca$scores[,1], pca$scores[,3], labels=abbreviate(row.names(usair))) plot(pca$scores[,2], pca$scores[,3], xlab='pc2', ylab='pc3', type='n') text(pca$scores[,2], pca$scores[,3], labels=abbreviate(row.names(usair))) # # What characteristics of a city are predictive of its level of SO2? # par(mfrow=c(1,3)) plot(pca$scores[,1], usair.data$SO2, xlab='PC1') plot(pca$scores[,2], usair.data$SO2, xlab='PC2') plot(pca$scores[,3], usair.data$SO2, xlab='PC3') # # # summary(lm(usair.data$SO2 ~ pca$scores[,1] + pca$scores[,2] + pca$scores[,3])) # # # # Do a pca analysis on airpoll data to find out what characteristics predict mortality. # Before with correlation, nonwhite was most predictive. # Now we can look at combinations of variables. # Make plots and explain the analysis.