##This is a Lecture for Logistic regression
## CSDATA (Appendix D-2, from Moore and Mc Cabe: Introduction to the practice of statistics)
## In this problem the authors of the book define a HighGPA variable (HGPA) to be 1 
## if the respective student's GPA is higher than 3.0 and 0 otherwise.
## hsm, hss and hse are high school grades in respectivelly math sciences and english
## satm and satv are SAT scores in math and verbal 
## sex is 1 for men and 2 for women

GPA.data=read.csv("LogisticRegEXMooreMcCabe.csv",header=TRUE)

is.factor(GPA.data$higpa)
is.factor(GPA.data$sex)

## We need to make them factors
## I am using nonumerical varables for this it makes absolutelly no difference

GPA.data$higpa=factor(GPA.data$higpa, labels = c("low","high"))
GPA.data$sex = factor(GPA.data$sex, labels = c("men","women"))

##For logistic regression we could use glm as bellow or the function lrm from the package "Design"
## glm analysis (do not forget to specify family=binomial otherwise you are doing regular regression.

GPA.logistic.reg=glm(higpa~hsm+hss+hse, data=GPA.data, family=binomial)
summary(GPA.logistic.reg)
anova(GPA.logistic.reg)

plot(GPA.logistic.reg,which=1)  ## tells R to only plot the first image

hist(GPA.logistic.reg$residuals, breaks=seq(min(GPA.logistic.reg$residuals),max(GPA.logistic.reg$residuals)+1,by=0.5),xlim=c(-10,10), probability=T, col='light blue')
points(density(GPA.logistic.reg$residuals, bw=.5), type='l', lwd=3, col='red')


### Comparing Models. The problem is that they have to be nested 
r0= glm(higpa~1, data=GPA.data, family=binomial)
r1m= glm(higpa~hsm, data=GPA.data, family=binomial)
r1s= glm(higpa~hss, data=GPA.data, family=binomial)
r1e= glm(higpa~hse, data=GPA.data, family=binomial)
r2ms= glm(higpa~hsm+hss, data=GPA.data, family=binomial)
r2me= glm(higpa~hsm+hse, data=GPA.data, family=binomial)
r2se= glm(higpa~hss+hse, data=GPA.data, family=binomial)
r3mse= glm(higpa~hsm+hss+hse, data=GPA.data, family=binomial)

lr.test <- function (r.small,r.big) {                           ## function to test differences between models
  pchisq(r.small$deviance - r.big$deviance, df=1, lower.tail=F)
}

## test: are all the coeff equal to zero:
lr.test(r0,r3mse)    ## no they are not


lr.test(r0,r1m)
lr.test(r0,r1s)
lr.test(r0,r1e)    
## all three are signifficant for explaining GPA. 
#Note that based on the p-values alone the math score is the more signifficant
# So we will put that in the model and check what should be put next.

lr.test(r1m,r2ms)
lr.test(r1m,r2me)
## Better to add the science score.
lr.test(r2ms,r3mse)
## the english score is not signifficant (recall CS majors)

#What about the other stuff (SAT scores):
r2sat= glm(higpa~satm+satv, data=GPA.data, family=binomial)
summary(r2sat)
anova(r2sat)
r1satm= glm(higpa~satm, data=GPA.data, family=binomial)
r1satv= glm(higpa~satv, data=GPA.data, family=binomial)

lr.test(r0,r1satm)
lr.test(r0,r1satv)
## Both about the same
lr.test(r1satm,r2sat)
lr.test(r1satv,r2sat)
#Borderline signifficant. So we keep them.

#Now let us look to combine the models
r5hssat=glm(higpa~hsm+hss+hse+satm+satv, data=GPA.data, family=binomial)
summary(r5hssat)

r6full= glm(higpa~hsm+hss+hse+satm+satv+sex, data=GPA.data, family=binomial)
summary(r6full)

# Now on with the construction:
r3hssatm= glm(higpa~hsm+hss+satm, data=GPA.data, family=binomial)
r3hssatv= glm(higpa~hsm+hss+satv, data=GPA.data, family=binomial)
lr.test(r2ms , r3hssatm)
lr.test(r2ms , r3hssatv)

#Maybe add the verbal (one explanation could be that the satmath is already explained by the other 2 variables already in the model)
r3hssex= glm(higpa~hsm+hss+sex, data=GPA.data, family=binomial)
lr.test(r2ms , r3hssex)    ## not important

r4hssat=glm(higpa~hsm+hss+satv+satm, data=GPA.data, family=binomial)
lr.test(r3hssatv,r4hssat)

#ALL RIGHT: So we found out that the best variables for predicting CS GPA are the high school scores for MATH SCIENCE and the verbal part of the SAT 
#The model is:  r3hssatv= glm(higpa~hsm+hss+satv, data=GPA.data, family=binomial)

summary(r3hssatv)
par(mfrow=c(2,2))
plot(r3hssatv,ask=F)
par(mfrow=c(1,1))

# latex(r3hssatv)

##IN CONTRAST Using the Design library and the function lrm

library(Design)

lrm(higpa~hsm+hss+satv, data=GPA.data)
r <- lrm(higpa~hsm+hss+satv, data=GPA.data, y=T,x=T)
P <- resid(r,"gof")['P']
resid(r,"partial",pl=T)
title(signif(P))

latex(r)      ## Only if you have latex installed on your system

### Finally to predict probabilities: 
GPA.data[1:10,]
predict(r3hssatv,GPA.data[1:10,],type="response")