This notebook contains R code supporting the lecture on binomial/logistic regression from STA303, Summer 2018. Two datasets are analyzed: the classic orings data, which introduces binomial regression, and the Wisconsin Breast Cancer data, which introduces binary logistic regression. The orings example focuses on inference (likelihood ratio tests, confidence intervals); the breast cancer example focuses on prediction (predicted probabilities, true/false positive rates, and ROC curves).

library(tidyverse)
library(faraway)


# Load and plot the orings data

data(orings)
orings_tbl <- as_data_frame(orings) %>%
  mutate(prop_damaged = damage / 6) # Observed proportion: the MLE of each mission's damage probability

glimpse(orings_tbl)
## Observations: 23
## Variables: 3
## $ temp         <dbl> 53, 57, 58, 63, 66, 67, 67, 67, 68, 69, 70, 70, 7...
## $ damage       <dbl> 5, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0...
## $ prop_damaged <dbl> 0.8333333, 0.1666667, 0.1666667, 0.1666667, 0.000...
orings_tbl %>%
  ggplot(aes(x = temp,y = prop_damaged)) +
  theme_classic() +
  geom_point(pch=21) +
  geom_smooth(method = "lm",se = FALSE,colour="blue") +
  labs(title = "Orings Data",
       subtitle = "Probability of o-ring failure for 23 space shuttle missions as a function of temperature on launch day",
       x = "Temperature",
       y = "Probability of Damage") +
  scale_y_continuous(labels = scales::percent_format())

# Fit a binomial GLM
# family = binomial ==> binomial regression
# family = poisson ==> Poisson regression
# family = gaussian ==> same as lm()
# 
# Standard errors come from asymptotic likelihood theory.
# They are NOT exact.
# They are unreliable in the same cases where the deviance is unreliable...
# ...including binomial regression when the group sizes m_i are small.
# That is the case here (m_i = 6).
# 
# Residual deviance: deviance as defined in lecture,
# i.e. -2 log(likelihood ratio) between the fitted and SATURATED models.
# 
# Null deviance: deviance of the null (intercept-only) model,
# i.e. -2 log(likelihood ratio) between the null and saturated models.
#
# Goal: find a model that fits "as well as" the
# saturated model, or at least better than the null.
# So look at (null deviance) - (residual deviance),
# the "amount of deviance explained" by the fitted model.
#
# How big is big enough?
#
# (null) - (residual) is approximately chi-squared, with
# df = (null df) - (residual df).
# Here that equals 22 - 21 = 1,
# which matches the fact that we fit a model with one parameter
# more than the intercept.
#
# A chisquare(n) has mean n and variance 2n.
# So a quick check: if the model fits as well as the saturated model,
# the residual deviance won't be more than 2*sqrt(2*df) away from its df;
# and if it fits better than the null, then (null - residual) will be
# more than 2*sqrt(2*df) above its df.

glm1 <- glm(cbind(damage,6-damage) ~ temp,data=orings_tbl,family=binomial)
summary(glm1)
## 
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ temp, family = binomial, 
##     data = orings_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9529  -0.7345  -0.4393  -0.2079   1.9565  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 11.66299    3.29626   3.538 0.000403 ***
## temp        -0.21623    0.05318  -4.066 4.78e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.898  on 22  degrees of freedom
## Residual deviance: 16.912  on 21  degrees of freedom
## AIC: 33.675
## 
## Number of Fisher Scoring iterations: 6
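# A numeric sketch of the rule-of-thumb check described in the comments above
# (an informal diagnostic, not a formal test):
df_res <- df.residual(glm1) # residual df: 21
c(dev = deviance(glm1), df = df_res, slack = 2 * sqrt(2 * df_res))
# 16.9 is within about 13 of 21, so no evidence of lack of fit
# relative to the saturated model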
# Don't believe me? Verify that the null deviance really is the deviance
# of the null model by fitting that model directly:
glm1null <- glm(cbind(damage,6-damage) ~ 1,data=orings_tbl,family=binomial)
summary(glm1null)
## 
## Call:
## glm(formula = cbind(damage, 6 - damage) ~ 1, family = binomial, 
##     data = orings_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9984  -0.9984  -0.9984   0.6947   4.4781  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.4463     0.3143  -7.783 7.06e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 38.898  on 22  degrees of freedom
## Residual deviance: 38.898  on 22  degrees of freedom
## AIC: 53.66
## 
## Number of Fisher Scoring iterations: 4
# Plot the fitted curve
orings_tbl %>%
  mutate(predicted_prob = ilogit(coef(glm1)[1] + coef(glm1)[2]*temp)) %>%
  ggplot(aes(x = temp,y = prop_damaged)) +
  theme_classic() +
  geom_point(pch=21) +
  geom_line(aes(y = predicted_prob),colour="blue",size = 1) +
  labs(title = "Orings Data - Fitted Binomial Regression Model",
       subtitle = "Probability of o-ring failure for 23 space shuttle missions as a function of temperature on launch day",
       x = "Temperature",
       y = "Probability of Damage") +
  scale_y_continuous(labels = scales::percent_format())

# Looks better. Can we extrapolate below the range of observed data?

orings_tbl %>%
  ggplot(aes(x = temp,y = prop_damaged)) +
  theme_classic() +
  geom_point(pch=21) +
  geom_line(data = data_frame(
    temp = seq(25,90,by=0.1),
    prop_damaged = ilogit(coef(glm1)[1] + coef(glm1)[2]*temp)
    ),colour="blue",size = 1) +
  labs(title = "Orings Data - Fitted Binomial Regression Model",
       subtitle = "Probability of o-ring failure for 23 space shuttle missions as a function of temperature on launch day",
       x = "Temperature",
       y = "Probability of Damage") +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_x_continuous(breaks = seq(30,80,by=10))

# Test the hypothesis that temperature has no effect on the probability of failure
# by comparing the model deviance with the null deviance
# 
# In R, type str(glm1) to see all the stuff available
# in the object glm1

D_temp <- glm1$null.deviance - deviance(glm1)
D_temp
## [1] 21.98538
# Compare to a chisq(2 - 1) distribution
1 - pchisq(D_temp,1)
## [1] 2.747351e-06
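# The same test using the built-in anova() method; this should reproduce
# the deviance difference and p-value computed by hand above
anova(glm1null, glm1, test = "Chisq")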
# Confidence interval for beta(temp)
betatemp_se <- sqrt(diag(vcov(glm1)))[2]
cint_temp <- c(coef(glm1)[2] - 1.96 * betatemp_se,coef(glm1)[2] + 1.96 * betatemp_se)
cint_temp
##       temp       temp 
## -0.3204606 -0.1120067
# What about a confidence interval for the factor by which the odds change for a one-unit increase in temp?
# That factor is exp(beta), by the way
exp(cint_temp)
##      temp      temp 
## 0.7258146 0.8940383
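# For comparison, the built-in Wald interval via confint.default()
# (plain confint() on a glm profiles the likelihood instead, so its
# answer would differ slightly from the hand-computed interval above):
confint.default(glm1, "temp")
exp(confint.default(glm1, "temp"))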
# Predict at a new temp
# E.g. 31F, the launch temperature on the day Challenger exploded

xnew <- cbind(c(1,31)) # column vector: intercept and temp = 31
etapred <- t(xnew) %*% cbind(coef(glm1))
# What is that? That's eta, the linear predictor!
etapred
##          [,1]
## [1,] 4.959746
# Predicted probability:
ilogit(etapred)
##           [,1]
## [1,] 0.9930342
# About 99%. Yikes.
# Confidence interval for the prediction?
etapred_sd <- sqrt(t(xnew) %*% vcov(glm1) %*% xnew)
etapred_sd
##         [,1]
## [1,] 1.66731
eta_confint <- c(etapred - 1.96 * etapred_sd,etapred + 1.96 * etapred_sd)
eta_confint
## [1] 1.691818 8.227674
ilogit(eta_confint)
## [1] 0.8444631 0.9997329
# Compare this with the output of the predict() function
predict(glm1,newdata = data_frame(temp = 31),se.fit = TRUE,type = "link")
## $fit
##        1 
## 4.959746 
## 
## $se.fit
## [1] 1.66731
## 
## $residual.scale
## [1] 1
predict(glm1,newdata = data_frame(temp = 31),se.fit = TRUE,type = "response")
## $fit
##         1 
## 0.9930342 
## 
## $se.fit
##          1 
## 0.01153332 
## 
## $residual.scale
## [1] 1
### Binary logistic regression: complete example ###

# We'll analyze the Wisconsin Breast Cancer Data, exercise 2 of chapter 2, page 58 of ELMR
# Note the dataset name is misspelled in the faraway package
data(wbca) # Called wbcd in the book
wbca_tbl <- as_data_frame(wbca)

glimpse(wbca_tbl)
## Observations: 681
## Variables: 10
## $ Class <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,...
## $ Adhes <int> 1, 5, 1, 1, 3, 8, 1, 1, 1, 1, 1, 1, 3, 1, 10, 4, 1, 1, 6...
## $ BNucl <int> 1, 10, 2, 4, 1, 10, 10, 1, 1, 1, 1, 1, 3, 3, 9, 1, 1, 1,...
## $ Chrom <int> 3, 3, 3, 3, 3, 9, 3, 3, 1, 2, 3, 2, 4, 3, 5, 4, 2, 3, 4,...
## $ Epith <int> 2, 7, 2, 3, 2, 7, 2, 2, 2, 2, 1, 2, 2, 2, 7, 6, 2, 2, 4,...
## $ Mitos <int> 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2,...
## $ NNucl <int> 1, 2, 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 4, 1, 5, 3, 1, 1, 1,...
## $ Thick <int> 5, 5, 3, 6, 4, 8, 1, 2, 2, 4, 1, 2, 5, 1, 8, 7, 4, 4, 10...
## $ UShap <int> 1, 4, 1, 8, 1, 10, 1, 2, 1, 1, 1, 1, 3, 1, 5, 6, 1, 1, 7...
## $ USize <int> 1, 4, 1, 8, 1, 10, 1, 1, 1, 2, 1, 1, 3, 1, 7, 4, 1, 1, 7...
# First step: take a look at the response, Class, which indicates whether a
# tumour is benign (Class == 1) or malignant (Class == 0)
with(wbca_tbl,table(Class))
## Class
##   0   1 
## 238 443
443 / (443 + 238)
## [1] 0.650514
238 / (443 + 238)
## [1] 0.349486
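# Equivalently, using base R's prop.table() in one call:
with(wbca_tbl, prop.table(table(Class)))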
# We have 9 predictors, so it's feasible to plot the data separately for each predictor
# We'll do this in a clever, "tidy" way

wbca_tbl %>%
  gather(variable,value,Adhes:USize) %>%
  ggplot(aes(x = value,y = Class)) +
  theme_classic() +
  facet_wrap(~variable) +
  geom_jitter(width=0.5,height=0.1,alpha = 0.3) +
  scale_y_continuous(breaks = c(0,1),labels = c("Malignant","Benign")) +
  labs(title = "Malignancy of Tumours, by Predictor",
       subtitle = "WBCA Data",
       x = "Predictor Value",
       y = "Malignant or Benign?")

# We have several predictors, so it is a good idea to look at their correlation matrix
round(cor(wbca_tbl %>% dplyr::select(-Class)),2)
##       Adhes BNucl Chrom Epith Mitos NNucl Thick UShap USize
## Adhes  1.00  0.68  0.67  0.59  0.42  0.60  0.49  0.68  0.70
## BNucl  0.68  1.00  0.68  0.59  0.34  0.58  0.59  0.72  0.70
## Chrom  0.67  0.68  1.00  0.62  0.35  0.66  0.55  0.73  0.75
## Epith  0.59  0.59  0.62  1.00  0.48  0.63  0.52  0.72  0.75
## Mitos  0.42  0.34  0.35  0.48  1.00  0.43  0.35  0.44  0.46
## NNucl  0.60  0.58  0.66  0.63  0.43  1.00  0.53  0.72  0.72
## Thick  0.49  0.59  0.55  0.52  0.35  0.53  1.00  0.66  0.64
## UShap  0.68  0.72  0.73  0.72  0.44  0.72  0.66  1.00  0.91
## USize  0.70  0.70  0.75  0.75  0.46  0.72  0.64  0.91  1.00
corrplot::corrplot(cor(wbca_tbl %>% dplyr::select(-Class)),order="AOE")

# There are some highly correlated variables, especially UShap and USize

# Fit an initial binary logistic regression model
wbca_glm1 <- glm(Class ~ .,data = wbca_tbl,family = binomial)
summary(wbca_glm1)
## 
## Call:
## glm(formula = Class ~ ., family = binomial, data = wbca_tbl)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.48282  -0.01179   0.04739   0.09678   3.06425  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 11.16678    1.41491   7.892 2.97e-15 ***
## Adhes       -0.39681    0.13384  -2.965  0.00303 ** 
## BNucl       -0.41478    0.10230  -4.055 5.02e-05 ***
## Chrom       -0.56456    0.18728  -3.014  0.00257 ** 
## Epith       -0.06440    0.16595  -0.388  0.69795    
## Mitos       -0.65713    0.36764  -1.787  0.07387 .  
## NNucl       -0.28659    0.12620  -2.271  0.02315 *  
## Thick       -0.62675    0.15890  -3.944 8.01e-05 ***
## UShap       -0.28011    0.25235  -1.110  0.26699    
## USize        0.05718    0.23271   0.246  0.80589    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 881.388  on 680  degrees of freedom
## Residual deviance:  89.464  on 671  degrees of freedom
## AIC: 109.46
## 
## Number of Fisher Scoring iterations: 8
# The coefficients all look reasonable
# What about the residual deviance?
# ALL THESE QUANTITIES ARE USELESS FOR BINARY REGRESSION!
# You can still compute them... you just shouldn't trust them
Dstat <- wbca_glm1$null.deviance - deviance(wbca_glm1)
Dstat
## [1] 791.924
# Compare to a chisq(680 - 671) distribution
1 - pchisq(Dstat,9)
## [1] 0
# So the model fits better than the null model
# NO! CAN'T SAY THAT
# Compare to the saturated model
# But wait... what is the log likelihood of the saturated model for BINARY logistic regression?
y <- wbca_tbl %>% pull(Class)
ll_sat <- sum( y*log(y) + (1 - y)*log(1 - y))
ll_sat
## [1] NaN
# What's the problem? R evaluates 0*log(0) as NaN, but its limiting value
# (and the correct contribution to the log likelihood) is 0, as the
# sketch below verifies.
# 
# So take dev_sat = 0 (exactly, for binary data)
# Then a test of whether the model fits as well as the saturated model is simply
# a test comparing the residual deviance to its degrees of freedom
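# A sketch of the corrected computation, replacing the NaN-producing
# terms with their limiting value of 0 (assumes y contains only 0s and 1s):
ll_sat_fixed <- sum(ifelse(y == 0, 0, y * log(y)) +
                    ifelse(y == 1, 0, (1 - y) * log(1 - y)))
ll_sat_fixed
# 0, so dev_sat = -2 * ll_sat_fixed = 0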
deviance(wbca_glm1)
## [1] 89.4642
1 - pchisq(deviance(wbca_glm1),671)
## [1] 1
# Small residual deviance ==> model fits the data "as well" as the saturated model... usually.
# But here we cannot use the residual deviance this way, because for binary data it
# depends only on the fitted probabilities p-hat, not on the observed responses.
# Note also that the chisquare approximation is terrible for binary data, and there
# are other problems with using the deviance for binary responses.
# Let's make sure the null model doesn't also fit the data as well as the saturated model
1 - pchisq(wbca_glm1$null.deviance,680)
## [1] 2.625263e-07
# But don't trust these comparisons, for BINARY data
# The above would be a good thing to do for binomial data with bigger n per observation
# Or for count data
# But NOT for BINARY data
# 
# The only reason I showed you any of this in this example is to show you the R code
# for calculating these quantities.
# 
# Now: can we get a simpler model that fits just as well? Again, stepwise selection based on AIC
# is not a great thing for binary regression, but here's how to do it in R. Let's see what happens:

# Note: step()'s search space is set via its scope argument, which takes formulas,
# not fitted models; by default the lower scope is the intercept-only model and,
# when scope is missing, the direction defaults to backward elimination
wbca_glm2 <- step(wbca_glm1)
## Start:  AIC=109.46
## Class ~ Adhes + BNucl + Chrom + Epith + Mitos + NNucl + Thick + 
##     UShap + USize
## 
##         Df Deviance    AIC
## - USize  1   89.523 107.52
## - Epith  1   89.613 107.61
## - UShap  1   90.627 108.63
## <none>       89.464 109.46
## - Mitos  1   93.551 111.55
## - NNucl  1   95.204 113.20
## - Adhes  1   98.844 116.84
## - Chrom  1   99.841 117.84
## - BNucl  1  109.000 127.00
## - Thick  1  110.239 128.24
## 
## Step:  AIC=107.52
## Class ~ Adhes + BNucl + Chrom + Epith + Mitos + NNucl + Thick + 
##     UShap
## 
##         Df Deviance    AIC
## - Epith  1   89.662 105.66
## - UShap  1   91.355 107.36
## <none>       89.523 107.52
## - Mitos  1   93.552 109.55
## - NNucl  1   95.231 111.23
## - Adhes  1   99.042 115.04
## - Chrom  1  100.153 116.15
## - BNucl  1  109.064 125.06
## - Thick  1  110.465 126.47
## 
## Step:  AIC=105.66
## Class ~ Adhes + BNucl + Chrom + Mitos + NNucl + Thick + UShap
## 
##         Df Deviance    AIC
## <none>       89.662 105.66
## - UShap  1   91.884 105.88
## - Mitos  1   93.714 107.71
## - NNucl  1   95.853 109.85
## - Adhes  1  100.126 114.13
## - Chrom  1  100.844 114.84
## - BNucl  1  109.762 123.76
## - Thick  1  110.632 124.63
summary(wbca_glm2)
## 
## Call:
## glm(formula = Class ~ Adhes + BNucl + Chrom + Mitos + NNucl + 
##     Thick + UShap, family = binomial, data = wbca_tbl)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.44161  -0.01119   0.04962   0.09741   3.08205  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  11.0333     1.3632   8.094 5.79e-16 ***
## Adhes        -0.3984     0.1294  -3.080  0.00207 ** 
## BNucl        -0.4192     0.1020  -4.111 3.93e-05 ***
## Chrom        -0.5679     0.1840  -3.085  0.00203 ** 
## Mitos        -0.6456     0.3634  -1.777  0.07561 .  
## NNucl        -0.2915     0.1236  -2.358  0.01837 *  
## Thick        -0.6216     0.1579  -3.937 8.27e-05 ***
## UShap        -0.2541     0.1785  -1.423  0.15461    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 881.388  on 680  degrees of freedom
## Residual deviance:  89.662  on 673  degrees of freedom
## AIC: 105.66
## 
## Number of Fisher Scoring iterations: 8
# We removed Epith and USize. Recall that USize was highly correlated with UShap,
# so dropping one of the pair makes sense. Either one would likely have been fine
# to keep; if there are business/scientific reasons for preferring one over the
# other, you could reasonably choose that one.
# 
# What next? Let's look at the actual predicted probabilities. The variables in the model were all pretty skewed, so what do
# we expect the distribution of predicted probabilities to look like?

# FYI, you should be able to compute the below from scratch, i.e. without using predict();
# a sketch of that computation follows the call below.
wbca_predicted_probs <- predict(wbca_glm2,type="response")
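# A sketch of the from-scratch computation (it should agree with predict() above):
eta_by_hand <- as.numeric(model.matrix(wbca_glm2) %*% coef(wbca_glm2)) # linear predictor
prob_by_hand <- ilogit(eta_by_hand) # inverse logit gives the probabilities
all.equal(unname(prob_by_hand), unname(wbca_predicted_probs)) # expect TRUE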

data_frame(x = wbca_predicted_probs) %>%
  ggplot(aes(x = x)) +
  theme_classic() +
  geom_histogram(bins = 100,colour = "black",fill = "orange") +
  labs(title = "Histogram of Predicted Probabilities",
       subtitle = "Predicted probability of tumour being benign, logistic regression on WBCA data",
       x = "Predicted Probability",
       y = "# of tumours") +
  scale_x_continuous(labels = scales::percent_format())

# Most of the predicted probabilities are near 0, or near 1
# How to make a hard 0/1 prediction for each case? The textbook suggests using a cutoff of 0.5.
# Let's look at the classification error if we do that

wbca_tbl %>%
  dplyr::select(Class) %>%
  bind_cols(data_frame(predprob = wbca_predicted_probs)) %>%
  mutate(ypred = as.numeric(predprob > .5)) %>%
  group_by(Class,ypred) %>%
  summarize(cnt = n(),
            pct = scales::percent(cnt / nrow(wbca_tbl)))
## # A tibble: 4 x 4
## # Groups:   Class [?]
##   Class ypred   cnt pct  
##   <int> <dbl> <int> <chr>
## 1     0     0   227 33.3%
## 2     0     1    11 1.62%
## 3     1     0     9 1.32%
## 4     1     1   434 63.7%
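# The same counts, shown as a standard 2x2 confusion matrix using base table():
table(actual = wbca_tbl$Class,
      predicted = as.numeric(wbca_predicted_probs > 0.5))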
# What about using .9 as a cutoff?

wbca_tbl %>%
  dplyr::select(Class) %>%
  bind_cols(data_frame(predprob = wbca_predicted_probs)) %>%
  mutate(ypred = as.numeric(predprob > .9)) %>%
  group_by(Class,ypred) %>%
  summarize(cnt = n(),
            pct = scales::percent(cnt / nrow(wbca_tbl)))
## # A tibble: 4 x 4
## # Groups:   Class [?]
##   Class ypred   cnt pct   
##   <int> <dbl> <int> <chr> 
## 1     0     0   237 34.8% 
## 2     0     1     1 0.147%
## 3     1     0    16 2.35% 
## 4     1     1   427 62.7%
# False Positive: ACTUAL negative, PREDICT positive
# False Negative: ACTUAL positive, PREDICT negative
# True Positive: ACTUAL positive, PREDICT positive
# True Negative: ACTUAL negative, PREDICT negative
# 
# False positive rate: FP / (All actual negatives)
# = FP / (FP + TN)
# True positive rate: TP / (All actual positives)
# = TP / (TP + FN)
# 
# Strongly recommend you read the Wikipedia article too.
# And then copy this out 100 times.
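# A worked check using the 0.5-cutoff counts above (recall a "positive"
# here means predicted benign):
11 / (11 + 227) # FPR = FP / (FP + TN), about 4.6%
434 / (434 + 9) # TPR = TP / (TP + FN), about 98%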

# We see there is a tradeoff: when we increase the cutoff, we get fewer false positives
# and more false negatives.
# Since a positive here means benign, a false positive is classifying a malignant tumour
# as benign, which seems like the worse error.
# We can plot a parametric curve of (FPR, TPR) pairs for cutoffs ranging from 0 to 1:
# the ROC curve.
# An ROC curve that bows far towards the top-left corner means favourable cutoffs exist,
# i.e. the model discriminates well.
# 
# I won't require you to be able to create the below from scratch like I am here
# But you do need to know how to interpret the below graph.

# Helper functions: false positive rate and true positive rate for 0/1 vectors
fpr <- function(y,ypred) sum(ypred==1 & y==0) / sum(y==0)
tpr <- function(y,ypred) sum(ypred==1 & y==1) / sum(y==1)

wbca_with_predprob <- wbca_tbl %>%
  dplyr::select(Class) %>%
  bind_cols(data_frame(predprob = wbca_predicted_probs))

# Build the ROC data: for each candidate cutoff h, predict "benign" iff
# predprob >= h, and record the resulting TPR and FPR
roc_data <- wbca_with_predprob %>%
  arrange(predprob) %>%
  pull(predprob) %>%
  round(3) %>%
  unique() %>% # every distinct predicted probability is a candidate cutoff
  map(~c(.x,
         tpr(y = wbca_with_predprob$Class,ypred = as.numeric(wbca_with_predprob$predprob >= .x)),
         fpr(y = wbca_with_predprob$Class,ypred = as.numeric(wbca_with_predprob$predprob >= .x)))) %>%
  map(~.x %>% as_data_frame() %>% t() %>% as_data_frame()) %>% # each (h, tpr, fpr) triple -> a one-row data frame
  reduce(bind_rows) %>% # stack the rows
  rename(h=V1,tpr=V2,fpr=V3) # h is the cutoff

roc_data %>%
  ggplot(aes(x=fpr,y=tpr)) +
  theme_light() +
  geom_path(colour="#195FFF",size=1) +
  scale_x_continuous(labels=scales::percent_format()) +
  scale_y_continuous(labels=scales::percent_format()) +
  labs(title="ROC Curve",
       subtitle = "Wisconsin Breast Cancer Data - Logistic Regression Model",
       x="FPR",
       y="TPR") +
  geom_line(data = data_frame(x = c(0,1),y = c(0,1)),aes(x=x,y=y),linetype="dashed",colour="red")

# Every point on this curve is an (FPR, TPR)
# combination for a given cutoff.
# The curve answers: can we pick a cutoff that separates the 0s and 1s?
# And: what FPR/TPR correspond to a given cutoff?
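# Aside: packages can automate the ROC/AUC computation. A sketch, assuming the
# pROC package is installed (it is not used elsewhere in this lecture):
# roc_obj <- pROC::roc(response = wbca_tbl$Class, predictor = wbca_predicted_probs)
# pROC::auc(roc_obj) # the area under the curve summarizes the tradeoff in one number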