################################################## # Distribution # - Central limit theorem # - R programming # - Graphing histogram ################################################## outcome <- c(0,1) n <- 5 n.sampling <- 1000 xbar <- rep(NA, n.sampling) for (i in 1:n.sampling) { x <- sample(outcome, n, replace = T) # this is one experiment (sample) xbar[i] <- mean(x) } par(mfcol = c(2, 3), mai=c(1.3,1.3,1.3,1.3), cex=1.5) hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 400), main = paste('n =', n)) n <- 10 for (i in 1:n.sampling) { x <- sample(outcome, n, replace = T) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 400), main = paste('n =', n)) n <- 30 for (i in 1:n.sampling) { x <- sample(outcome, n, replace = T) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 400), main = paste('n =', n)) n <- 100 for (i in 1:n.sampling) { x <- sample(outcome, n, replace = T) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 400), main = paste('n =', n)) n <- 500 for (i in 1:n.sampling) { x <- sample(outcome, n, replace = T) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 400), main = paste('n =', n)) n <- 1000 for (i in 1:n.sampling) { x <- sample(outcome, n, replace = T) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 400), main = paste('n =', n)) sd(x) #standard deviation of the last set of observations sd(x)/sqrt(n) #standard error of the last set of observations sd(xbar) #standard error calculated by simulation population <- runif(1000) n <- 1 n.sampling <- 1000 xbar <- rep(NA, n.sampling) for (i in 1:n.sampling) { x <- sample(population, n) # this is one experiment (sample) xbar[i] <- mean(x) } par(mfcol = c(2, 2), mai=c(1.3,1.3,1.3,1.3), cex=1.5) hist(population, breaks = seq(0.0, 1.0, .01), ylim=c(0, 100), main = 'uniform distribution') hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 100), main = paste('n =', n)) n <- 2 for (i in 1:n.sampling) { x <- sample(population, n) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 100), main = paste('n =', n)) n <- 30 for (i in 1:n.sampling) { x <- sample(population, n) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 1.0, .01), ylim=c(0, 100), main = paste('n =', n)) population <- c(rnorm(500, mean=10, sd=1), rnorm(500, mean=20, sd=2)) n <- 1 n.sampling <- 1000 xbar <- rep(NA, n.sampling) for (i in 1:n.sampling) { x <- sample(population, n) # this is one experiment (sample) xbar[i] <- mean(x) } par(mfcol = c(2, 2), mai=c(1.3,1.3,1.3,1.3), cex=1.5) hist(population, breaks = seq(0.0, 30.0, .1), ylim=c(0, 50), main = 'bimodal distribution') hist(xbar, breaks = seq(0.0, 30.0, .1), ylim=c(0, 50), main = paste('n =', n)) n <- 2 for (i in 1:n.sampling) { x <- sample(population, n) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 30.0, .1), ylim=c(0, 50), main = paste('n =', n)) n <- 30 for (i in 1:n.sampling) { x <- sample(population, n) # this is one experiment (sample) xbar[i] <- mean(x) } hist(xbar, breaks = seq(0.0, 30.0, .1), ylim=c(0, 50), main = paste('n =', n)) ################################################## # Multivariate data # - Several variables are measured on each unit # - Variables are related ################################################## # Multivariate analysis # - All variables are analyzed simultaneously # - Descriptive and inferential methods # - Find signals in noisy data # - Modeling = describing the structure of data ################################################## # The aims of multivariate analysis # - Data exploration # + Finding useful nonrandom patterns # + Finding questions, generating hypothesis # + Visualization # + Statistical significance is not an issue # - Confirmatory analysis # + Once you have well-defined hypothesis # + Statistical significance test is considered # - Most studies require both ################################################## # Data analysis is an art # - Skillful interpretation of evidence # - Development of hunches # - Don't adhere rigidly to rules and criteria ################################################## # Visualizing data # - A well-chosen graph is powerful # - Graph helps discover unexpected patterns # - But we also see patterns when they are absent ################################################## # Exploring multivariate data # - Reveal associations between variables # - Scatterplot ################################################## # read in data (table format), you need to change the address where the file is located airpoll <- read.table("/Users/yasu/Desktop/ma/Data/airpollw.txt", header=TRUE) attach(airpoll) airpoll ################################################## # # Rainfall: mean annual precipitation in inches # Education: median school years completed for >25 # Popden: population density # Nonwhite: percentage of nonwhite # NOX: nitrogen oxide # SO2: sulfur dioxide # Mortality: total age-adjusted mortality rate # ################################################## # How is sulfur oxide related to mortality? ################################################## par(mfrow=c(1,3), mai=c(.8,.8,.8,.8)) par(pty="s") plot(SO2,Mortality,pch=1,lwd=2,cex.lab=2,cex.axis=2) title("(a)",cex.main=2) plot(SO2,Mortality,pch=1,lwd=2,cex.lab=2,cex.axis=2) abline(lm(Mortality~SO2),lwd=2) title("(b)",cex.main=2) plot(SO2,Mortality,pch=1,lwd=2,cex.lab=2,cex.axis=2) rug(jitter(SO2),side=1) rug(jitter(Mortality),side=2) title("(c)",cex.main=2) ################################################## # Points labeled using location names ################################################## names<-abbreviate(row.names(airpoll)) par(mfrow=c(1,1), cex=1.5) plot(SO2,Mortality,lwd=2,type="n") text(SO2,Mortality,labels=names,lwd=2) ################################################## # Scatterplot with distributional information # and regression lines ################################################## dev.off() # # set up plotting area for scatterplot par(fig=c(0,0.7,0,0.7)) plot(SO2,Mortality,lwd=2) # add regression line abline(lm(Mortality~SO2),lwd=2) # add locally weighted regression line lines(lowess(SO2,Mortality),lwd=2) # set up plotting area for distributional information par(fig=c(0,0.7,0.65,1),new=TRUE) # plot histogram of SO2 hist(SO2,lwd=2, xlab="", main="") par(fig=c(0.65,1,0,0.7),new=TRUE) # plot boxplot of SO2 boxplot(Mortality,lwd=2, main="") ################################################## # Outliers can distort correlation coefficient r # convex hull trimming ################################################## dev.off() # # find points defining convex hull hull<-chull(SO2,Mortality) plot(SO2,Mortality,pch=1,cex=1.5) # plot and shade convex hull polygon(SO2[hull],Mortality[hull],density=15,angle=30) points(SO2[hull],Mortality[hull],pch=19,col='red') cor(SO2,Mortality) # r before removal cor(SO2[-hull],Mortality[-hull]) # r after removal ################################################## # Test for independence using chiplot ################################################## dev.off() source("/Users/yasu/Desktop/maFall2009/functions.txt") # chiplot(SO2,Mortality,vlabs=c("SO2","Mortality")) # ################################################## # Distributional info using bivariate boxplot ################################################## dev.off() # bvbox(cbind(SO2,Mortality),xlab="SO2",ylab="Mortality") # bvbox(cbind(SO2,Mortality),xlab="SO2",ylab="Mortality",method="O") # # ################################################## # Two-dimensional histogram and perspective plot ################################################## dev.off() # h2d <- hist2d(SO2, Mortality) persp(h2d$x, h2d$y, h2d$counts, xlab='SO2', ylab='Mortality', zlab='Frequency', theta = -40, phi = 40, r=5) # den <- bivden(SO2, Mortality) persp(den$seqx, den$seqy, den$den, xlab='SO2', ylab='Mortality', zlab='Density', lwd=2, theta = -40, phi = 40, r=5) # plot(SO2, Mortality) contour(den$seqx, den$seqy, den$den, lwd=2, nlevels=20, add=T, labcex=1, col='brown') ################################################## # Representing other variables ################################################## plot(SO2,Mortality,pch=1,lwd=2,ylim=c(700,1200),xlim=c(-5,300)) symbols(SO2,Mortality,circles=Rainfall,inches=0.4,add=TRUE,lwd=2, fg='blue') # ################################################## # Scatterplot matrix ################################################## pairs(airpoll) # pairs(airpoll,panel=function(x,y) {abline(lsfit(x,y)$coef,lwd=2) lines(lowess(x,y),lty=2,lwd=2) points(x,y)}) # ################################################## # Three-dimensional plots ################################################## # scatterplot3d(SO2, NOX, Mortality, highlight.3d=TRUE, type='h') # ################################################## # Conditioning plot ################################################## coplot(Mortality~SO2|Popden) # coplot(Mortality~SO2|Popden,panel=function(x,y,col,pch) panel.smooth(x,y,span=1)) #