Supplementary R code for lecture 5, STA303 summer 2018

This notebook contains R code supporting the lecture on count (Poisson) regression from STA303, Summer 2018. The Galapagos Islands Species data are analyzed with a discussion on model assumptions, transformations, overdispersion and offsets. A dose-response model is fit to the salmonella dataset, with a discussion of assumptions, predictor transformations, and overdispersion. Both analyses follow but expand upon those from the Count Regression chapter in Faraway: Extending the Linear Model with R

library(tidyverse)
library(faraway)
library(MASS)

# Load the Galapagos Islands data

data(gala)
gala_tbl <- gala %>%
  as_data_frame() %>%
  dplyr::select(-Endemics) # Alternative response variable; remove from analysis

glimpse(gala_tbl)

## Observations: 30
## Variables: 6
## $ Species   <dbl> 58, 31, 3, 25, 2, 18, 24, 10, 8, 2, 97, 93, 58, 5, 4...
## $ Area      <dbl> 25.09, 1.24, 0.21, 0.10, 0.05, 0.34, 0.08, 2.33, 0.0...
## $ Elevation <dbl> 346, 109, 114, 46, 77, 119, 93, 168, 71, 112, 198, 1...
## $ Nearest   <dbl> 0.6, 0.6, 2.8, 1.9, 1.9, 8.0, 6.0, 34.1, 0.4, 2.6, 1...
## $ Scruz     <dbl> 0.6, 26.3, 58.7, 47.4, 1.9, 8.0, 12.0, 290.2, 0.4, 5...
## $ Adjacent  <dbl> 1.84, 572.33, 0.78, 0.18, 903.82, 1.84, 0.34, 2.85, ...

# Want to model species as a function of the other variables
# Look at its distribution
gala_tbl %>%
  ggplot(aes(x = Species)) +
  theme_classic() +
  geom_histogram(bins = 10,colour = "black",fill = "orange") +
  labs(title = "Histogram of Species",
       subtitle = "Galapagos Islands plant species",
       x = "Species",
       y = "# Islands")

# Definitely not normal
# Some high counts, but a lot of low ones
# What happens if we do a normal linear regression?

gala_lm1 <- lm(Species ~ .,data=gala_tbl)
summary(gala_lm1)

## 
## Call:
## lm(formula = Species ~ ., data = gala_tbl)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -111.679  -34.898   -7.862   33.460  182.584 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.068221  19.154198   0.369 0.715351    
## Area        -0.023938   0.022422  -1.068 0.296318    
## Elevation    0.319465   0.053663   5.953 3.82e-06 ***
## Nearest      0.009144   1.054136   0.009 0.993151    
## Scruz       -0.240524   0.215402  -1.117 0.275208    
## Adjacent    -0.074805   0.017700  -4.226 0.000297 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.98 on 24 degrees of freedom
## Multiple R-squared:  0.7658, Adjusted R-squared:  0.7171 
## F-statistic:  15.7 on 5 and 24 DF,  p-value: 6.838e-07

# The fit isn't even that bad
# But the diagnostics...

data_frame(x = residuals(gala_lm1)) %>%
  mutate_at("x",funs( (. - mean(.)) / sd(.))) %>%
  arrange(x) %>%
  mutate(q = qnorm(seq(1,length(residuals(gala_lm1))) / (1+length(residuals(gala_lm1))))) %>%
  ggplot(aes(x = q,y = x)) +
  theme_classic() +
  geom_point() +
  geom_abline(slope = 1,intercept = 0) +
  labs(title = "Normal QQ-Plot of Residuals",
       subtitle = "Normal linear model for Galapagos Islands plant species data",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles")

# Not actually that bad. A lot of the counts are pretty large

data_frame(resid = residuals(gala_lm1),
           fitted = fitted(gala_lm1)) %>%
  mutate_at("resid",funs( (. - mean(.)) / sd(.))) %>%
  ggplot(aes(x = fitted,y = resid)) +
  theme_classic() +
  geom_point() +
  #geom_smooth() +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = -2,linetype="dashed",colour="red") +
  geom_hline(yintercept = 2,linetype="dashed",colour="red") +
  labs(title = "Residuals vs Fitted Values",
       subtitle = "Normal linear model for Galapagos Islands plant species data",
       x = "Fitted Values",
       y = "Residuals")

# Do you see the problem? This is classic
# Transformation? We have the variance stabilizing transformation. Let's try the boxcox one too and compare them
# 
gala_boxcox <- boxcox(gala_lm1)

# 0.5 is within the range of "good" values. So, let's try a square root transformation
# Boxcox actually should be (y^lambda - 1)/lambda
# Try "variance-stabilizing transformation":
gala_lm2 <- lm(sqrt(Species) ~ .,data=gala_tbl)
summary(gala_lm2)

## 
## Call:
## lm(formula = sqrt(Species) ~ ., data = gala_tbl)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5572 -1.4969 -0.3031  1.3527  5.2110 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.3919243  0.8712678   3.893 0.000690 ***
## Area        -0.0019718  0.0010199  -1.933 0.065080 .  
## Elevation    0.0164784  0.0024410   6.751 5.55e-07 ***
## Nearest      0.0249326  0.0479495   0.520 0.607844    
## Scruz       -0.0134826  0.0097980  -1.376 0.181509    
## Adjacent    -0.0033669  0.0008051  -4.182 0.000333 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.774 on 24 degrees of freedom
## Multiple R-squared:  0.7827, Adjusted R-squared:  0.7374 
## F-statistic: 17.29 on 5 and 24 DF,  p-value: 2.874e-07

# Fit is reasonable. Diagnostics?

data_frame(x = residuals(gala_lm2)) %>%
  mutate_at("x",funs( (. - mean(.)) / sd(.))) %>%
  arrange(x) %>%
  mutate(q = qnorm(seq(1,length(residuals(gala_lm2))) / (1+length(residuals(gala_lm2))))) %>%
  ggplot(aes(x = q,y = x)) +
  theme_classic() +
  geom_point() +
  geom_abline(slope = 1,intercept = 0) +
  labs(title = "Normal QQ-Plot of Residuals",
       subtitle = "Normal linear model for Galapagos Islands plant species data with square-root transformation",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles")

data_frame(resid = residuals(gala_lm2),
           fitted = fitted(gala_lm2)) %>%
  mutate_at("resid",funs( (. - mean(.)) / sd(.))) %>%
  ggplot(aes(x = fitted,y = resid)) +
  theme_classic() +
  geom_point() +
  geom_hline(yintercept = 0) +
  geom_hline(yintercept = -2,linetype="dashed",colour="red") +
  geom_hline(yintercept = 2,linetype="dashed",colour="red") +
  labs(title = "Residuals vs Fitted Values",
       subtitle = "Normal linear model for Galapagos Islands plant species data with square-root transformation",
       x = "Fitted Values",
       y = "Residuals")

# Much better.
# 
# Before we do any modelling, really, we should at least take a look at the relationship
# between the response and each predictor

plot_gala <- function(x) {
  gala_tbl %>%
    mutate(Species = log(Species)) %>%
    ggplot(aes_string(x = x,y = "Species")) +
    theme_classic() +
    geom_point() +
    labs(subtitle = "Galapagos Island Species Data",
         title = stringr::str_c("Species vs ",x),
         x = x,
         y = "log(Species)") +
    scale_x_continuous(labels = scales::comma_format())
  
}

cowplot::plot_grid(
  plot_gala("Area"),
  plot_gala("Elevation"),
  plot_gala("Nearest"),
  plot_gala("Scruz"),
  plot_gala("Adjacent"),
  ncol = 3
)

# There are several variables that clearly have nonlinear relationships with log(Species)
# Let's investigate the Poisson GLM

### Poisson GLM ###

gala_glm1 <- glm(Species ~ .,data=gala_tbl,family=poisson)
summary(gala_glm1)

## 
## Call:
## glm(formula = Species ~ ., family = poisson, data = gala_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.155e+00  5.175e-02  60.963  < 2e-16 ***
## Area        -5.799e-04  2.627e-05 -22.074  < 2e-16 ***
## Elevation    3.541e-03  8.741e-05  40.507  < 2e-16 ***
## Nearest      8.826e-03  1.821e-03   4.846 1.26e-06 ***
## Scruz       -5.709e-03  6.256e-04  -9.126  < 2e-16 ***
## Adjacent    -6.630e-04  2.933e-05 -22.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5

# Coefficients are all small, but remember their effect is multiplicative now, not additive
# That residual deviance is a bit ridiculous. Is the fit really that bad?
# It could be caused by an outlier. But note that the null deviance is also huge.
# What proportion of the null deviance is explained by the model?
(gala_glm1$null.deviance - deviance(gala_glm1)) / gala_glm1$null.deviance

## [1] 0.7958128

# So the model does explain much of the deviance
# 
# While in the normal linear model we assumed constant variance, 
# here was are assuming that Var(Y) = E(Y)
# This is a model assumption, and needs to be checked
# Note that the graph in Faraway is labelled incorrectly- it's a log-log plot

data_frame(muhat = predict(gala_glm1,type="response"),
           varhat = (gala_tbl$Species - muhat)^2) %>%
  mutate_all(log) %>%
  ggplot(aes(x = muhat,y = varhat)) +
  theme_classic() +
  geom_point() +
  geom_abline(slope = 1,intercept = 0,colour = "red") +
  labs(title = "Mean-Variance Relationship",
       subtitle = "Poisson GLM, Galapagos Islands species data",
       x = "Predicted Mean",
       y = "Estimated Variance at Predicted Mean")

# It looks like we have overdispersion
# 
# Estimate dispersion
n <- nrow(gala_tbl)
p <- length(coef(gala_glm1))
phi <- data_frame(muhat = predict(gala_glm1,type="response"),
                  varhat = (gala_tbl$Species - muhat)^2) %>%
  summarize(phi = sum(varhat / muhat) / (n - p)) %>%
  pull(phi)

phi

## [1] 31.74914

# Yes, lots of overdispersion
# Compute the new standard errors

summary(gala_glm1,dispersion = phi)

## 
## Call:
## glm(formula = Species ~ ., family = poisson, data = gala_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.1548079  0.2915897  10.819  < 2e-16 ***
## Area        -0.0005799  0.0001480  -3.918 8.95e-05 ***
## Elevation    0.0035406  0.0004925   7.189 6.53e-13 ***
## Nearest      0.0088256  0.0102621   0.860    0.390    
## Scruz       -0.0057094  0.0035251  -1.620    0.105    
## Adjacent    -0.0006630  0.0001653  -4.012 6.01e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 31.74914)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5

summary(gala_glm1)

## 
## Call:
## glm(formula = Species ~ ., family = poisson, data = gala_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.2752  -4.4966  -0.9443   1.9168  10.1849  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.155e+00  5.175e-02  60.963  < 2e-16 ***
## Area        -5.799e-04  2.627e-05 -22.074  < 2e-16 ***
## Elevation    3.541e-03  8.741e-05  40.507  < 2e-16 ***
## Nearest      8.826e-03  1.821e-03   4.846 1.26e-06 ***
## Scruz       -5.709e-03  6.256e-04  -9.126  < 2e-16 ***
## Adjacent    -6.630e-04  2.933e-05 -22.608  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  716.85  on 24  degrees of freedom
## AIC: 889.68
## 
## Number of Fisher Scoring iterations: 5

sqrt(diag(vcov(gala_glm1)))

##  (Intercept)         Area    Elevation      Nearest        Scruz 
## 5.174952e-02 2.627298e-05 8.740704e-05 1.821260e-03 6.256200e-04 
##     Adjacent 
## 2.932754e-05

sqrt(diag(vcov(gala_glm1)) * phi)

##  (Intercept)         Area    Elevation      Nearest        Scruz 
## 0.2915897482 0.0001480387 0.0004925069 0.0102621392 0.0035251417 
##     Adjacent 
## 0.0001652500

# Offset. Area is potentially an offset
# First, try a model with log(Area)
# 
gala_glm2 <- glm(Species ~ log(Area) + Elevation + Nearest + Scruz + Adjacent,
                 data = gala_tbl,
                 family = poisson)
summary(gala_glm2)

## 
## Call:
## glm(formula = Species ~ log(Area) + Elevation + Nearest + Scruz + 
##     Adjacent, family = poisson, data = gala_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.5525  -2.9223  -0.8698   2.8767   7.6345  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.518e+00  5.105e-02  68.900  < 2e-16 ***
## log(Area)    3.994e-01  1.524e-02  26.214  < 2e-16 ***
## Elevation   -3.804e-04  8.844e-05  -4.301 1.70e-05 ***
## Nearest     -4.152e-03  1.729e-03  -2.402   0.0163 *  
## Scruz       -3.118e-03  6.050e-04  -5.154 2.55e-07 ***
## Adjacent    -1.671e-04  2.964e-05  -5.636 1.74e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3510.73  on 29  degrees of freedom
## Residual deviance:  411.46  on 24  degrees of freedom
## AIC: 584.29
## 
## Number of Fisher Scoring iterations: 5

# Coefficient for log(Area) is about 0.4
# This is probably too small to consider to be "close to 1"
# Especially relative to its own standard error of 0.015
# Probably in practice, would NOT fix it to be 1
# But, let's see what happens when we do

gala_glm3 <- glm(Species ~ offset(log(Area)) + Elevation + Nearest + Scruz + Adjacent,
                 data = gala_tbl,
                 family = poisson)
summary(gala_glm3)

## 
## Call:
## glm(formula = Species ~ offset(log(Area)) + Elevation + Nearest + 
##     Scruz + Adjacent, family = poisson, data = gala_tbl)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -14.2091   -0.6762    3.4247    6.9750   14.4016  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.155e+00  6.222e-02  34.642  < 2e-16 ***
## Elevation   -2.966e-03  5.913e-05 -50.153  < 2e-16 ***
## Nearest     -1.674e-02  1.553e-03 -10.780  < 2e-16 ***
## Scruz       -1.078e-03  7.384e-04  -1.460    0.144    
## Adjacent     1.568e-04  2.808e-05   5.582 2.37e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 5766.9  on 29  degrees of freedom
## Residual deviance: 1565.0  on 25  degrees of freedom
## AIC: 1735.9
## 
## Number of Fisher Scoring iterations: 6

# Exercise: can you try transformations (e.g. log) on other predictors
# to get a better model?
# Or interactions?
# If you're super keen, try to get a significantly better model than me,
# and show me!

### Example ###

# Use the salmonella data, exercise 2 Faraway chapter 3 page 73
# Data represents number of colonies (a count) for levels of dose of quinoline
# 
# Load the data and do some initial data analysis

data(salmonella)
salmonella_tbl <- as_data_frame(salmonella)

glimpse(salmonella_tbl)

## Observations: 18
## Variables: 2
## $ colonies <int> 15, 21, 29, 16, 18, 21, 16, 26, 33, 27, 41, 60, 33, 3...
## $ dose     <int> 0, 0, 0, 10, 10, 10, 33, 33, 33, 100, 100, 100, 333, ...

# Plot it. Technically dose is continuous, but there are only 6 unique values,
# so it would be better to do boxplots. Except for the additional fact that there are only
# 3 datapoints per level.
# 
# So let's just plot them.
salmonella_tbl %>%
  ggplot(aes(x = dose,y = colonies)) +
  theme_classic() +
  geom_point() +
  labs(title = "Salmonella Data",
       subtitle = "# of colonies for each observed dose of quinoline",
       x = "Dose",
       y = "# colonies")

# That doesn't tell us much. Dose increases exponentially; let's log it
salmonella_tbl %>%
  mutate_at("dose",funs(log(. + 1))) %>%
  ggplot(aes(x = dose,y = colonies)) +
  theme_classic() +
  geom_point() +
  labs(title = "Salmonella Data",
       subtitle = "# of colonies for each observed dose of quinoline",
       x = "log(Dose+1)",
       y = "# colonies")

# Hmmm... maybe the boxplots were a better idea
# We could treat dose as categorical in the model

salmonella_tbl %>%
  mutate_at("dose",as.factor) %>%
  ggplot(aes(x = dose,y = colonies)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Salmonella Data",
       subtitle = "Boxplot of # of colonies for each observed dose of quinoline",
       x = "Dose",
       y = "# colonies")

# That's better!
# 
# Fit an initial glm
dose_glm1 <- glm(colonies ~ dose,data=salmonella_tbl,family = poisson)
summary(dose_glm1)

## 
## Call:
## glm(formula = colonies ~ dose, family = poisson, data = salmonella_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6482  -1.8225  -0.2993   1.2917   5.1861  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 3.3219950  0.0540292  61.485   <2e-16 ***
## dose        0.0001901  0.0001172   1.622    0.105    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 78.358  on 17  degrees of freedom
## Residual deviance: 75.806  on 16  degrees of freedom
## AIC: 172.34
## 
## Number of Fisher Scoring iterations: 4

# Not a good fit. Why? There could be reasons other than overdispersion
# Let's look at the fitted model
curve_data <- data_frame(x = seq(1,1000,by=0.1),
                         y = exp(coef(dose_glm1)[1] + x*coef(dose_glm1)[2]))

salmonella_tbl %>%
  ggplot(aes(x = dose,y = colonies)) +
  theme_classic() +
  geom_point() +
  geom_line(data = curve_data,aes(x = x,y = y)) +
  labs(title = "Salmonella Data - Fitted Poisson GLM",
       subtitle = "# of colonies for each observed dose of quinoline",
       x = "Dose",
       y = "# colonies")

# It's more likely here that we got the functional form of the model wrong
# We already said that dose was increasing exponentially, so let's log it in the model
dose_glm2 <- glm(colonies ~ log(dose+1),data=salmonella_tbl,family = poisson)
summary(dose_glm2)

## 
## Call:
## glm(formula = colonies ~ log(dose + 1), family = poisson, data = salmonella_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0764  -1.4488  -0.2306   0.9259   4.7212  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.01989    0.09712  31.095  < 2e-16 ***
## log(dose + 1)  0.08585    0.02018   4.255 2.09e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 78.358  on 17  degrees of freedom
## Residual deviance: 59.629  on 16  degrees of freedom
## AIC: 156.17
## 
## Number of Fisher Scoring iterations: 4

# Check out the residual deviance- it's not great, but it's better than it was
curve_data <- data_frame(x = seq(1,1000,by=0.1),
                         y = exp(coef(dose_glm2)[1] + log(x+1)*coef(dose_glm2)[2]))

salmonella_tbl %>%
  ggplot(aes(x = dose,y = colonies)) +
  theme_classic() +
  geom_point() +
  geom_line(data = curve_data,aes(x = x,y = y)) +
  labs(title = "Salmonella Data - Fitted Poisson GLM",
       subtitle = "# of colonies for each observed dose of quinoline",
       x = "Dose",
       y = "# colonies")

# Better. Can we do any better than this?
# From before, it looked like there might be some sort of a quadratic relationship
# between log dose and colonies
dose_glm3 <- glm(colonies ~ log(dose+1) + I(log(dose+1)^2),data=salmonella_tbl,family = poisson)
summary(dose_glm3)

## 
## Call:
## glm(formula = colonies ~ log(dose + 1) + I(log(dose + 1)^2), 
##     family = poisson, data = salmonella_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7345  -1.2974  -0.3907   1.0539   4.4299  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         2.922028   0.128564  22.728   <2e-16 ***
## log(dose + 1)       0.169533   0.071061   2.386    0.017 *  
## I(log(dose + 1)^2) -0.011367   0.009193  -1.237    0.216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 78.358  on 17  degrees of freedom
## Residual deviance: 58.082  on 15  degrees of freedom
## AIC: 156.62
## 
## Number of Fisher Scoring iterations: 4

# Nah, the residual deviance didn't really change
# 
# Now estimate the overdispersion from the better model

n <- nrow(salmonella_tbl)
p <- length(coef(dose_glm2))

phi <- data_frame(muhat = predict(dose_glm2,type="response"),
                  varhat = (salmonella_tbl$colonies - muhat)^2) %>%
  summarize(phi = sum(varhat / muhat) / (n - p)) %>%
  pull(phi)
phi

## [1] 3.95387

summary(dose_glm2,dispersion = phi)

## 
## Call:
## glm(formula = colonies ~ log(dose + 1), family = poisson, data = salmonella_tbl)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0764  -1.4488  -0.2306   0.9259   4.7212  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.01989    0.19311   15.64   <2e-16 ***
## log(dose + 1)  0.08585    0.04012    2.14   0.0324 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 3.95387)
## 
##     Null deviance: 78.358  on 17  degrees of freedom
## Residual deviance: 59.629  on 16  degrees of freedom
## AIC: 156.17
## 
## Number of Fisher Scoring iterations: 4

Supplementary R code for lecture 5, STA303 summer 2018

Alex Stringer

Fri Jul 27 14:09:32 2018