##########################################################################
# Correlation coefficient
##########################################################################
# The correlation coefficient r is a measure of the strength and direction
# of the linear relationship between two continuous variables.
#   r = sum( ((x - xbar)/s_x) * ((y - ybar)/s_y) ) / (n-1)
#   -1 <= r <= 1
# r = -1 when the data lie exactly on a line with a negative slope.
# r = 1 when the data lie exactly on a line with a positive slope.
# r = 0 means there is no linear relationship.
# The correlation coefficient measures only the strength of the linear relationship.
# r = 0 does not by itself imply that two variables are unrelated.
# It could be the case that there is a strong nonlinear relationship between the variables.
# The value of r^2 is often used as a summary statistic.
# r^2 is the proportion of the variability in y that is explained by the regression line.
# When r=-1, or r=1, r^2=1 and all of the variability in y is explained by x.
# In other words, there is no variability around the regression line.
# A large r^2 value implies that the data is more tightly clustered around a line.
# Correlation and causation: a strong correlation alone does not imply causation.

##########################################################################
# Residuals
##########################################################################
# We will draw the response variable (dependent variable) on the y axis
# and the explanatory variable (independent variable) on the x axis.
# For any line drawn through the points, the vertical distance from a
# point to the line is called a residual.
# Residuals are positive for points above the line
# and negative for points below the line.

##########################################################################
# Least Squares line
##########################################################################
# Choose a line that minimizes the sum of squared residuals.
# The least squares line is
#   y = a + bx
# where a is the intercept and b is the slope.
#   b = r * (s_y / s_x)
#   a = ybar - (b * xbar)
# Simple linear regression uses the least squares line to predict y for each x.
# Plot the data to check whether a linear fit is appropriate.

##########################################################################
# Regression diagnostics
##########################################################################
# Regression framework assumes that the variance is constant
# for different values of the explanatory x variable.
# A plot of residuals versus the fitted values (or the original x values)
# makes it easier to see these potential problems.
# Patterns in a residual plot indicate nonlinearity.
# q-q plot is a quantile-quantile plot.
# q-q plot compares the distributions of two data sets.
##########################################################################

# Read in table data.
bp.obese <- read.table("/Users/yasu/Desktop/ma/Data/bpobese.txt", header = TRUE)
# The first column is sex (0 = Male, 1 = Female).
# The second column is measurement of obesity.
# The third column is measurement on blood pressure.

# Any relationship between obesity and blood pressure?
# Plot first. Columns are referenced through `data =` / `with()` rather than
# attach(), so nothing is placed on the search path where it could mask
# (or be masked by) other objects.
plot(obese ~ bp, data = bp.obese)
# Some degree of correlation between the two variables.

# Pearson's product moment correlation.
r <- cor(bp.obese$bp, bp.obese$obese)

# Test the hypothesis that the true correlation is zero.
#   t(n-2) = r * sqrt((n - 2)/(1 - (r^2)))
# The degrees of freedom are derived from the data instead of being typed in.
df.t <- length(bp.obese$bp) - 2
# NOTE: the name `t` is kept from the original script; calls to the base
# function t() still work because R skips non-function objects when looking
# up a function.
t <- r * sqrt(df.t / (1 - (r^2)))
t
# Two equivalent ways of computing the two-sided p-value. abs() keeps the
# result valid when the correlation (and therefore t) is negative.
(1 - pt(abs(t), df.t)) * 2
pt(abs(t), df.t, lower.tail = FALSE) * 2
cor.test(~ bp + obese, data = bp.obese)

# Any linear relationship between x (obese) and y (bp)?
#   y_i = a + bx_i + e_i
# a is the intercept (constant term)
# b is the slope (coefficient)
# e is normal error with mean = 0 and sd = sd_y|x
lr <- lm(bp ~ obese, data = bp.obese)
summary(lr)
# A five number summary for the residuals.
# A table of the regression estimate.
# The first row corresponds to the intercept or constant term (a in the model).
# Test the hypothesis that H_0: a = 0 by computing the t-statistic t=a/SEa.
# This statistic has a t-distribution with n-2 degrees of freedom.
# The last column entry gives the p-value for this test.
# The second row is for the coefficient of the variable obese (b in the model).
# The null hypothesis H_0: b = 0.
# The test for no slope is the same as the test for no correlation.
# The Multiple R-squared is simply the square of the correlation.
# R-squared is the proportion of variability accounted for by the model.
# The Adjusted R-squared adjusts for the number of explanatory variables.
# An Adjusted R-squared value close to 1 implies a very good fit.
# The F-statistic is used to test the hypothesis that all the coefficients are zero.
# In the case of simple linear regression, H_0: b = 0.

# Do the residuals and the fitted values behave as expected?
# Remember the previous layout so it can be restored at the end of the analysis.
old.par <- par(mfcol = c(1, 2))
plot(bp ~ obese, data = bp.obese)
lines(bp.obese$obese, fitted(lr))
plot(bp ~ obese, data = bp.obese)
abline(lr)

# Several equivalent ways of looking at the fitted values.
fitted(lr)
names(lr)
lr$fitted.values
predict(lr)

fit <- fitted(lr)
res <- residuals(lr)

par(mfcol = c(2, 3))
plot(bp ~ obese, data = bp.obese)
lines(bp.obese$obese, fitted(lr))
plot(fit, res, xlab = "fitted", ylab = "residual")
# The residual vs. fitted plot looks quite random, but for a few outliers.

# A QQ-plot of the residuals to see if they look normal.
qqnorm(res)
qqline(res)
# A clear departure from normality is visible.
# The distribution of residuals is skewed to the right.
hist(res)
boxplot(res, horizontal = TRUE)

# Confidence interval.
# Find the t value associated with 95%, using the model's residual degrees
# of freedom (n - 2 for simple linear regression).
tval <- qt(.975, df.residual(lr))
summary(lr)
# Build the intervals from the fitted model's coefficient table, so the
# numbers cannot drift from what summary(lr) reports.
lr.coefs <- coef(summary(lr))
intercept.ci <- lr.coefs["(Intercept)", "Estimate"] +
  c(-1, 1) * tval * lr.coefs["(Intercept)", "Std. Error"]
slope.ci <- lr.coefs["obese", "Estimate"] +
  c(-1, 1) * tval * lr.coefs["obese", "Std. Error"]
# confint() should agree with the hand-computed intervals above.
confint(lr)

# Fitting more than one variable.
lr2 <- lm(bp ~ obese + sex, data = bp.obese)
summary(lr2)
coplot(bp ~ obese | sex, data = bp.obese)
# Same axis limits for both sexes so the two scatter plots are comparable.
with(bp.obese, plot(obese[sex == 0], bp[sex == 0],
                    xlim = c(0.8, 2.5), ylim = c(80, 200)))
with(bp.obese, plot(obese[sex == 1], bp[sex == 1],
                    xlim = c(0.8, 2.5), ylim = c(80, 200)))
names(lr2)

# Restore the plotting layout that was in effect before this analysis.
par(old.par)

# Any interesting relationships between expenditures and other variables?
# Load the AFL club data from a table-formatted file with a header row.
afl <- read.table(
  file = "/Users/yasu/Desktop/ma/Data/aflspend.dat",
  header = TRUE
)
# Columns:
#   Club:        The name of the AFL club
#   Spent:       Total football expenditure, in millions of Australian dollars
#   Wins:        The total number of wins from 2003 to 2007
#   GrandFinals: The total number of Grand Final appearances from 2003 to 2007
#   Place3to4:   The total number of times placed 3 or 4
#   Place5to8:   The total number of times placed from 5 to 8
#   Top8:        The total number of times placed in the Top 8

# Decide what data you want to use, then do some correlation and regression
# analyses.