# Logistic regression # binomial: for binary or dichotomous response variable, 0 and 1, such as presence and absence, success and failure, etc.. # multinomial: for more than two categories. # Depression level (0 or 1) # People's occupational choices # Brand choices based on gender and age. # A recent finding of a market research group claims that among digital camera choices, women prefer Kodak more than men and men prefer Canon more than women. # As in linear regression, we are looking for a relationship between our response variable and a set of independent variables, which are often called covariates. # We can transform the output of a linear regression to be suitable for probabilities # logit(p) = log(o) = log(p/(1-p)) = b0 + b1x1 + b2x2 + ... # The odds are simply the ratio of the chance an event will occur, divided by the chance it will not. # p =.8, then the odds = .8 / .2 = 4 to 1 - 4 times as likely # exp(logit(p)) = exp(b0 + b1x1 + b2x2 + ...) # exp(log(p/(1-p))) = p/(1-p) = exp(b0)exp(b1x1)exp(b2x2)... # exp(log(3)) = 3 # The inverse of the logit function is the logistic function. # if logit(p) = z, then p = exp(z) / (1 + exp(z)) # odds = p / (1 - p) # p = odds / (1 + odds) # the intuition is that a unit change in the value of the variable should change the odds by a constant multiplicative amount x <- seq(-10, 10, by=1) plot(x, exp(x)/(1+exp(x)),type="l", main="Logistic curve") depress <- read.table("/Users/yasu/Desktop/ma/Data/depress.txt", header=FALSE) colnames(depress) <- c('id', 'sex', 'age', 'marital', 'education', 'employment', 'income', 'religion', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17', 'c18', 'c19', 'c20', 'cesd', 'cases', 'drink', 'health', 'regulardoc', 'treat', 'beddays', 'acuteill', 'chronicill') attach(depress) # Classification of individuals by depression level and sex. table1 <- table(sex, cases) print(table1) # Regressing depression level on sex. # Note: We are using sex01 which is a (0, 1) variable rather than sex which is a (1, 2) variable. sex01 <- sex - 1 logit1 <- glm(cases ~ sex01, depress, family=binomial) summary(logit1) # A Wald test is used to test the statistical significance of each coefficient (b) in the model. # A Wald test calculates a Z statistic, which is b/se. # This z value is then squared, yielding a Wald statistic with a chi-square distribution. # Akaike information criterion is a measure of the goodness of fit of an estimated statistical model. # AIC offers a measure of the information lost when a given model is used (entropy or uncertainty associated with a model). # AIC considers precision and complexity of the model - number of parameters and fit. # AIC is not a test on the model in the sense of hypothesis testing. # It is a tool for model selection. # Given a data set, several competing models may be ranked according to their AIC, with the one having the lowest AIC being the best. # Regressing depression levels on age, sex and income. logit2 <- glm(cases ~ age+income+sex01, depress, family=binomial) summary(logit2) # We can test if we can profitably drop a factor with the anova function to give an analysis of deviance table # or the drop1 function to try dropping each factor. anova(logit2, test="Chisq") drop1(logit2, test="Chisq") add1(logit2, test="Chisq") anova(logit1, logit2, test="Chisq") # Regressing depression levels on age and income. logit3 <- glm(cases ~ age+income, depress, family=binomial) summary(logit3) detach(depress) # Multinominal Logit Model mydata <- read.table("/Users/yasu/Desktop/ma/Data/brand.txt", header=T) attach(mydata) names(mydata) table(brand) table(female) summary(age) xtabs(~ female + brand) # Multiple logistic regression analyses, one for each pair of outcomes: One problem with this approach is that each analysis is run on a different sample. # Or use the Multinomial Logit Model. library(VGAM) mlogit<- vglm(brand~female+age, family=multinomial(), na.action=na.pass) summary(mlogit) # log(P(brand=1)/P(brand=3)) = b_10 + b_11*female + b_12*age # log(P(brand=2)/P(brand=3)) = b_20 + b_21*female + b_22*age # for one unit change in the variable age, the log of the ratio of the two probabilities, # P(brand=1)/P(brand=3), will decrease by 0.696, and the log of the ratio of the two probabilities, # P(brand=2)/P(brand=3), will decrease by 0.318. # We can say that older people tend to prefer brand 3. # Create predicted probabilities for female vs. male female <- c(0,1) age <- c(mean(mydata$age)) newdata1 <- data.frame(female,age) newdata1 newdata1$predicted <-predict(mlogit,newdata=newdata1,type="response") newdata1 # Create predicted probabilities for age newdata2<-data.frame(female=mean(female),age=seq(24,38,1)) predicted<-predict(mlogit,newdata=newdata2,type="response") cbind(age=newdata2$age,predicted) # Create predicted probabilities for female vs. male and age newdata3<-data.frame(female=0,age=seq(24,38,1)) male_p<-predict(mlogit,newdata=newdata3,type="response") newdata4<-data.frame(female=1,age=seq(24,38,1)) female_p<-predict(mlogit,newdata=newdata4,type="response") age<-newdata3$age par(mfcol=c(1,3), cex=1.5, oma=c(2,2,1,1)) plot(male_p[,1]~age,type="l",col="blue",lwd=1, ylab="Predicted Probability for Brand = 1", xlab="Age", ylim=c(min(female_p), max(female_p))) lines(female_p[,1]~age,col="red",lwd=1) legend(30,.93,c("Males","Females"),col=c("blue","red"),lwd=c(1,1)) plot(male_p[,2]~age,type="l",col="blue",lwd=1, ylab="Predicted Probability for Brand = 2", xlab="Age", ylim=c(min(female_p), max(female_p))) lines(female_p[,2]~age,col="red",lwd=1) plot(male_p[,3]~age,type="l",col="blue",lwd=1, ylab="Predicted Probability for Brand = 3", xlab="Age", ylim=c(min(female_p), max(female_p))) lines(female_p[,3]~age,col="red",lwd=1)