##This is a Lecture for Logistic regression ## CSDATA (Appendix D-2, from Moore and Mc Cabe: Introduction to the practice of statistics) ## In this problem the authors of the book define a HighGPA variable (HGPA) to be 1 ## if the respective student's GPA is higher than 3.0 and 0 otherwise. ## hsm, hss and hse are high school grades in respectivelly math sciences and english ## satm and satv are SAT scores in math and verbal ## sex is 1 for men and 2 for women GPA.data=read.csv("LogisticRegEXMooreMcCabe.csv",header=TRUE) is.factor(GPA.data$higpa) is.factor(GPA.data$sex) ## We need to make them factors ## I am using nonumerical varables for this it makes absolutelly no difference GPA.data$higpa=factor(GPA.data$higpa, labels = c("low","high")) GPA.data$sex = factor(GPA.data$sex, labels = c("men","women")) ##For logistic regression we could use glm as bellow or the function lrm from the package "Design" ## glm analysis (do not forget to specify family=binomial otherwise you are doing regular regression. GPA.logistic.reg=glm(higpa~hsm+hss+hse, data=GPA.data, family=binomial) summary(GPA.logistic.reg) anova(GPA.logistic.reg) plot(GPA.logistic.reg,which=1) ## tells R to only plot the first image hist(GPA.logistic.reg$residuals, breaks=seq(min(GPA.logistic.reg$residuals),max(GPA.logistic.reg$residuals)+1,by=0.5),xlim=c(-10,10), probability=T, col='light blue') points(density(GPA.logistic.reg$residuals, bw=.5), type='l', lwd=3, col='red') ### Comparing Models. The problem is that they have to be nested r0= glm(higpa~1, data=GPA.data, family=binomial) r1m= glm(higpa~hsm, data=GPA.data, family=binomial) r1s= glm(higpa~hss, data=GPA.data, family=binomial) r1e= glm(higpa~hse, data=GPA.data, family=binomial) r2ms= glm(higpa~hsm+hss, data=GPA.data, family=binomial) r2me= glm(higpa~hsm+hse, data=GPA.data, family=binomial) r2se= glm(higpa~hss+hse, data=GPA.data, family=binomial) r3mse= glm(higpa~hsm+hss+hse, data=GPA.data, family=binomial) lr.test <- function (r.small,r.big) { ## function to test differences between models pchisq(r.small$deviance - r.big$deviance, df=1, lower.tail=F) } ## test: are all the coeff equal to zero: lr.test(r0,r3mse) ## no they are not lr.test(r0,r1m) lr.test(r0,r1s) lr.test(r0,r1e) ## all three are signifficant for explaining GPA. #Note that based on the p-values alone the math score is the more signifficant # So we will put that in the model and check what should be put next. lr.test(r1m,r2ms) lr.test(r1m,r2me) ## Better to add the science score. lr.test(r2ms,r3mse) ## the english score is not signifficant (recall CS majors) #What about the other stuff (SAT scores): r2sat= glm(higpa~satm+satv, data=GPA.data, family=binomial) summary(r2sat) anova(r2sat) r1satm= glm(higpa~satm, data=GPA.data, family=binomial) r1satv= glm(higpa~satv, data=GPA.data, family=binomial) lr.test(r0,r1satm) lr.test(r0,r1satv) ## Both about the same lr.test(r1satm,r2sat) lr.test(r1satv,r2sat) #Borderline signifficant. So we keep them. #Now let us look to combine the models r5hssat=glm(higpa~hsm+hss+hse+satm+satv, data=GPA.data, family=binomial) summary(r5hssat) r6full= glm(higpa~hsm+hss+hse+satm+satv+sex, data=GPA.data, family=binomial) summary(r6full) # Now on with the construction: r3hssatm= glm(higpa~hsm+hss+satm, data=GPA.data, family=binomial) r3hssatv= glm(higpa~hsm+hss+satv, data=GPA.data, family=binomial) lr.test(r2ms , r3hssatm) lr.test(r2ms , r3hssatv) #Maybe add the verbal (one explanation could be that the satmath is already explained by the other 2 variables already in the model) r3hssex= glm(higpa~hsm+hss+sex, data=GPA.data, family=binomial) lr.test(r2ms , r3hssex) ## not important r4hssat=glm(higpa~hsm+hss+satv+satm, data=GPA.data, family=binomial) lr.test(r3hssatv,r4hssat) #ALL RIGHT: So we found out that the best variables for predicting CS GPA are the high school scores for MATH SCIENCE and the verbal part of the SAT #The model is: r3hssatv= glm(higpa~hsm+hss+satv, data=GPA.data, family=binomial) summary(r3hssatv) par(mfrow=c(2,2)) plot(r3hssatv,ask=F) par(mfrow=c(1,1)) # latex(r3hssatv) ##IN CONTRAST Using the Design library and the function lrm library(Design) lrm(higpa~hsm+hss+satv, data=GPA.data) r <- lrm(higpa~hsm+hss+satv, data=GPA.data, y=T,x=T) P <- resid(r,"gof")['P'] resid(r,"partial",pl=T) title(signif(P)) latex(r) ## Only if you have latex installed on your system ### Finally to predict probabilities: GPA.data[1:10,] predict(r3hssatv,GPA.data[1:10,],type="response")