Supplementary R code for lectures 2 and 3, STA303 summer 2018

This notebook is the result of the R code presented during the lectures on one- and two-way ANOVA for STA303 (Methods of Data Analysis II), Summer 2018.

This contains code for analyzing two datasets from Faraway, Linear Models with R (chapters 14 and 15), including creating ANOVA tables from scratch, looking at design matrices, etc. as well as using R’s standard modelling API. Also includes many examples of using ggplot, including making QQ plots from scratch

library(tidyverse)
library(faraway)

# Analyze the "pvc" data from chapter 15
data(pvc)

# Put it in a dplyr tbl
pvc_t <- as_data_frame(pvc)

glimpse(pvc_t)

## Observations: 48
## Variables: 3
## $ psize    <dbl> 36.2, 36.3, 35.3, 35.0, 30.8, 30.6, 29.8, 29.6, 32.0,...
## $ operator <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,...
## $ resin    <fct> 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 1,...

summary(pvc$psize)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.10   30.18   31.85   32.35   35.02   37.10

# Pairwise bloxplot
pvc_t %>%
  ggplot(aes(x = operator,y = psize)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of Particle Size by Operator",
       x = "Operator",
       y = "Particle Size"
  )

# Summary statistics

# Grand mean and standard deviation
grand_mean <- pvc_t %>%
  summarize(grand_mean = mean(psize),grand_sd = sd(psize))
grand_mean

## # A tibble: 1 x 2
##   grand_mean grand_sd
##        <dbl>    <dbl>
## 1       32.4     2.75

# Operator means and standard deviations
group_means <- pvc_t %>%
  group_by(operator) %>%
  summarize(group_mean = mean(psize),
            group_median = median(psize),
            group_sd = sd(psize),
            group_size = n())
group_means

## # A tibble: 3 x 5
##   operator group_mean group_median group_sd group_size
##   <fct>         <dbl>        <dbl>    <dbl>      <int>
## 1 1              32.9         32.2     2.72         16
## 2 2              32.7         31.9     2.63         16
## 3 3              31.4         31.0     2.82         16

# Sums of squares
# Here it gets a bit tricky
sums_of_squares <- pvc_t %>%
  # Add in the group mean
  left_join(group_means,by="operator") %>%
  summarize(total = sum( (psize - grand_mean$grand_mean)^2 ),
            error = sum( (psize - group_mean)^2  ),
            model = total - error
  ) %>%
  gather(type,SS,total:model)
sums_of_squares

## # A tibble: 3 x 2
##   type     SS
##   <chr> <dbl>
## 1 total 354. 
## 2 error 334. 
## 3 model  20.7

# Create the ANOVA table
m <- pvc_t %>% pull(operator) %>% unique() %>% length()
n <- nrow(pvc_t)
sums_of_squares$df <- c(n-1,n-m,m-1)

anova_table <- sums_of_squares %>%
  mutate(MS = SS / df)
anova_table

## # A tibble: 3 x 4
##   type     SS    df    MS
##   <chr> <dbl> <dbl> <dbl>
## 1 total 354.     47  7.54
## 2 error 334.     45  7.42
## 3 model  20.7     2 10.4

Fstat <- (anova_table %>% filter(type == 'model') %>% pull(MS)) / (anova_table %>% filter(type == 'error') %>% pull(MS))
Fstat

## [1] 1.396666

critval <- qf(.95,m-1,n-m)
critval

## [1] 3.204317

# For a dataset of this size, we'd need to see an average between group sum-of-squares about 3.2 times as large as the average
# within-group sum-of-squares in order to conclude that the group means really were different

pval <- 1 - pf(Fstat,m-1,n-m)
pval

## [1] 0.257939

# ...or instead of all that, we can just use the aov function in R

pvc_anova <- aov(psize ~ operator,data = pvc_t)
summary(pvc_anova)

##             Df Sum Sq Mean Sq F value Pr(>F)
## operator     2   20.7  10.359   1.397  0.258
## Residuals   45  333.8   7.417

# Total sum of squares:
sum( (pvc_t$psize - mean(pvc_t$psize))^2 )

## [1] 354.4792

var(pvc_t$psize) * (nrow(pvc_t) - 1)

## [1] 354.4792

# Homework: calculate Df column yourself
# Then calculate Mean Sq and F value yourself


### Assumptions
### 

# Already looked at standard deviations of groups, and the boxplot
# 
# Normality:
# 
# Two random variables are equal in distribution
# if and only if their cumulative distribution functions
# are equal.
# Easier to compare their INVERSE CDFs- the so-called 
# "quantile functions"
pvc_t %>%
  mutate_at("psize",funs( (. - mean(.)) / sd(.))) %>%
  arrange(psize) %>%
  mutate(q = qnorm(1:n() / (n() + 1))) %>%
  ggplot(aes(x = q,y = psize)) +
  theme_classic() +
  geom_point() +
  geom_abline(slope = 1,intercept = 0,colour = "red") +
  labs(title = "Normal QQ-plot",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles")

# Some minor deviations, but nothing alarming
# We care more about deviations in the tails than
# in the centre, for testing normality of the residuals (why?)
#
# To get better understanding: recreate the above, but for data
# that is definitely not normally distributed
# E.g. rexp(100)
# or rpois(100)
# or even better: rt(100,df = 5)
# 

### Using linear regression
### 

# Create the model matrix
# Specify the contrasts
# 
# contr.treatment = "Effects coding" from lecture
# This is actually the default
# This happens automatically when you include an intercept in the model
# ...and including an intercept is the default.
X1 <- model.matrix(psize ~ operator,data=pvc_t,contrasts = "contr.treatment")
#View(X1)

t(X1) %*% X1 # Not orthogonal

##             (Intercept) operator2 operator3
## (Intercept)          48        16        16
## operator2            16        16         0
## operator3            16         0        16

# To get cell means coding, remove the intercept
# Add a -1 into the formula
X2 <- model.matrix(psize ~ operator -1,data=pvc_t)
#View(X2)

t(X2) %*% X2 # Orthogonal

##           operator1 operator2 operator3
## operator1        16         0         0
## operator2         0        16         0
## operator3         0         0        16

# Define a standard linear model
# 
lm1 <- lm(psize ~ operator,data=pvc_t)
summary(lm1)

## 
## Call:
## lm(formula = psize ~ operator, data = pvc_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3375 -2.2813 -0.6094  2.3719  4.5625 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.9438     0.6809  48.386   <2e-16 ***
## operator2    -0.2625     0.9629  -0.273    0.786    
## operator3    -1.5063     0.9629  -1.564    0.125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.723 on 45 degrees of freedom
## Multiple R-squared:  0.05845,    Adjusted R-squared:  0.0166 
## F-statistic: 1.397 on 2 and 45 DF,  p-value: 0.2579

# What are those parameters estimating?
coef(lm1)[1]

## (Intercept) 
##    32.94375

coef(lm1)[1] + coef(lm1)[2]

## (Intercept) 
##    32.68125

coef(lm1)[1] + coef(lm1)[3]

## (Intercept) 
##     31.4375

group_means

## # A tibble: 3 x 5
##   operator group_mean group_median group_sd group_size
##   <fct>         <dbl>        <dbl>    <dbl>      <int>
## 1 1              32.9         32.2     2.72         16
## 2 2              32.7         31.9     2.63         16
## 3 3              31.4         31.0     2.82         16

# Now fit the null model, and compare them
lmnull <- lm(psize ~ 1,data=pvc_t)
summary(lmnull)

## 
## Call:
## lm(formula = psize ~ 1, data = pvc_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2542 -2.1792 -0.5042  2.6708  4.7458 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.3542     0.3964   81.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.746 on 47 degrees of freedom

# What is that parameter estimating?
coef(lmnull)

## (Intercept) 
##    32.35417

grand_mean$grand_mean

## [1] 32.35417

# Compare them. VERY CONFUSING: in R, the anova() function is used to perform a likelihood ratio test
# that two models fit the data equally well. For a standard linear regression (models of class "lm"), it goes one step further
# and performs an F test, accounting for sigma being estimated (this is review from STA302). The F-test for
# comparison of nested linear regression models, when all variables are categorical, is the same as the F-test
# performed in the ANOVA table output by aov()
# 
anova(lmnull,lm1)

## Analysis of Variance Table
## 
## Model 1: psize ~ 1
## Model 2: psize ~ operator
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     47 354.48                           
## 2     45 333.76  2    20.718 1.3967 0.2579

# Same F statistic! This is provably true


### Two-Way ANOVA ###

# The PVC data has two factors: operator and resin
# Want to estimate group means of operator, group means of resin, and the interaction between the two
# 
# First need to check: are there data for all combinations of levels?
# Need that to be true in order for the sum of squares decomposition
# as learned in class to hold
# Also, we need replication- more than one datapoint for each combination

pvc_t %>%
  group_by(operator,resin) %>%
  summarize(count = n()) %>%
  print(n = Inf) # Prints the whole table

## # A tibble: 24 x 3
## # Groups:   operator [?]
##    operator resin count
##    <fct>    <fct> <int>
##  1 1        1         2
##  2 1        2         2
##  3 1        3         2
##  4 1        4         2
##  5 1        5         2
##  6 1        6         2
##  7 1        7         2
##  8 1        8         2
##  9 2        1         2
## 10 2        2         2
## 11 2        3         2
## 12 2        4         2
## 13 2        5         2
## 14 2        6         2
## 15 2        7         2
## 16 2        8         2
## 17 3        1         2
## 18 3        2         2
## 19 3        3         2
## 20 3        4         2
## 21 3        5         2
## 22 3        6         2
## 23 3        7         2
## 24 3        8         2

# What about just by resin?
pvc_t %>%
  group_by(resin) %>%
  summarize(count = n(),stddev = sd(psize)) %>%
  print(n = Inf) # Prints the whole table

## # A tibble: 8 x 3
##   resin count stddev
##   <fct> <int>  <dbl>
## 1 1         6  0.692
## 2 2         6  1.03 
## 3 3         6  1.57 
## 4 4         6  1.08 
## 5 5         6  1.29 
## 6 6         6  0.853
## 7 7         6  1.76 
## 8 8         6  1.85

# Yes, and furthermore, the design is balanced- each possible combination of factors has an equal number of reps

# Start in the same way, with boxplots, this time of both factors
boxplt1 <- pvc_t %>%
  ggplot(aes(x = operator,y = psize)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of Particle Size by Operator",
       x = "Operator",
       y = "Particle Size"
  )

boxplt2 <- pvc_t %>%
  ggplot(aes(x = resin,y = psize)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of Particle Size by Resin",
       x = "Resin",
       y = "Particle Size"
  )

# Plot these plots on a grid, together
# We use the plot_grid function in the cowplot package
# This is the only function we use from this package,
# so don't load the whole package with library()
# Just reference a single function, with cowplot::
# Still need to install with install.packages("cowplot")
cowplot::plot_grid(boxplt1,boxplt2,nrow=1)

# Unfortunately, only 6 observations per resin level
# Not enough to accurately estimate a variance
# Constant variance assumption? INCONCLUSIVE
# 


# Interaction plots
# You can do these two ways...
# 
# Note: if you're trying to plot a line in ggplot
# and you get an error about the "group aesthetic",
# try setting group=1 in aes()
plt1 <- pvc_t %>%
  group_by(operator,resin) %>%
  summarize(psize = mean(psize)) %>%
  ggplot(aes(x = operator,y = psize,group=resin)) +
  facet_grid(~resin) + 
  theme_classic() +
  geom_point() +
  geom_path() +
  labs(title = "Interaction Plot",
       subtitle = "Facetted by levels of resin",
       x = "Operator",
       y = "Particle Size")

plt1

plt2 <- pvc_t %>%
  group_by(operator,resin) %>%
  summarize(psize = mean(psize)) %>%
  ggplot(aes(x = resin,y = psize,group=operator)) +
  facet_grid(~operator) + 
  theme_classic() +
  geom_point() +
  geom_path() +
  labs(title = "Interaction Plot",
       subtitle = "Facetted by levels of operator",
       x = "Resin",
       y = "Particle Size")

plt2

cowplot::plot_grid(plt1,plt2,ncol=1)

# Does the mean of operator differ among levels of resin?
# Does the mean of resin differ among levels of operator?
# 
# For marginal mean: look at slope of line (different from zero?)
# and whether lines have same vertical location across plots
# 
# For interactions, look at shape of lines. Same across plots?
#
# Even with practice, it is very difficult to accurately assess all three things
# (two marginal means and their interaction) simultaneously, so be careful!


# Do the ANOVA using aov()
twoway_anova <- aov(psize ~ operator + resin + operator:resin,data=pvc_t)
summary(twoway_anova)

##                Df Sum Sq Mean Sq F value   Pr(>F)    
## operator        2  20.72   10.36   7.007  0.00401 ** 
## resin           7 283.95   40.56  27.439 5.66e-10 ***
## operator:resin 14  14.34    1.02   0.693  0.75987    
## Residuals      24  35.48    1.48                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Or, more common:
twoway_anova <- aov(psize ~ operator * resin,data=pvc_t)
summary(twoway_anova)

##                Df Sum Sq Mean Sq F value   Pr(>F)    
## operator        2  20.72   10.36   7.007  0.00401 ** 
## resin           7 283.95   40.56  27.439 5.66e-10 ***
## operator:resin 14  14.34    1.02   0.693  0.75987    
## Residuals      24  35.48    1.48                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Compare the sums of squares to the ANOVA without interactions
twoway_anova_mainonly <- aov(psize ~ operator + resin,data=pvc_t)
summary(twoway_anova_mainonly)

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## operator     2  20.72   10.36   7.902  0.00135 ** 
## resin        7 283.95   40.56  30.943 8.11e-14 ***
## Residuals   38  49.82    1.31                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The sums of squares are the same, but the MSE went down, which makes the F-stats for these
# effects larger, and the corresponding p-values smaller

# Equivalent regression model
lm2 <- lm(psize ~ operator + resin + operator:resin,data=pvc_t)
summary(lm2)

## 
## Call:
## lm(formula = psize ~ operator + resin + operator:resin, data = pvc_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7000 -0.3625  0.0000  0.3625  2.7000 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       36.2500     0.8598  42.164  < 2e-16 ***
## operator2         -0.8500     1.2159  -0.699 0.491216    
## operator3         -0.9500     1.2159  -0.781 0.442245    
## resin2            -1.1000     1.2159  -0.905 0.374615    
## resin3            -5.5500     1.2159  -4.565 0.000126 ***
## resin4            -6.5500     1.2159  -5.387 1.56e-05 ***
## resin5            -4.4000     1.2159  -3.619 0.001372 ** 
## resin6            -6.0500     1.2159  -4.976 4.42e-05 ***
## resin7            -3.3500     1.2159  -2.755 0.011014 *  
## resin8             0.5500     1.2159   0.452 0.655078    
## operator2:resin2   1.0500     1.7195   0.611 0.547175    
## operator3:resin2  -0.8500     1.7195  -0.494 0.625567    
## operator2:resin3  -0.2000     1.7195  -0.116 0.908372    
## operator3:resin3  -0.5500     1.7195  -0.320 0.751842    
## operator2:resin4   1.2000     1.7195   0.698 0.491960    
## operator3:resin4  -0.1000     1.7195  -0.058 0.954105    
## operator2:resin5   0.4000     1.7195   0.233 0.818024    
## operator3:resin5  -1.6000     1.7195  -0.931 0.361376    
## operator2:resin6   1.3000     1.7195   0.756 0.456985    
## operator3:resin6   0.5000     1.7195   0.291 0.773715    
## operator2:resin7   0.4500     1.7195   0.262 0.795782    
## operator3:resin7   0.8500     1.7195   0.494 0.625567    
## operator2:resin8   0.5000     1.7195   0.291 0.773715    
## operator3:resin8  -2.7000     1.7195  -1.570 0.129454    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.216 on 24 degrees of freedom
## Multiple R-squared:  0.8999, Adjusted R-squared:  0.804 
## F-statistic: 9.382 on 23 and 24 DF,  p-value: 3.447e-07

# What are all these parameters estimating?
group_means_int <- pvc_t %>%
  group_by(operator,resin) %>%
  summarize(psize = mean(psize))
group_means_int

## # A tibble: 24 x 3
## # Groups:   operator [?]
##    operator resin psize
##    <fct>    <fct> <dbl>
##  1 1        1      36.2
##  2 1        2      35.2
##  3 1        3      30.7
##  4 1        4      29.7
##  5 1        5      31.8
##  6 1        6      30.2
##  7 1        7      32.9
##  8 1        8      36.8
##  9 2        1      35.4
## 10 2        2      35.4
## # ... with 14 more rows

# That's the mean of each group. Now, the mean should equal regression intercept plus main effect plus interaction
# For example...
# Mean of operator 1, resin 1 is 36.25
coef(lm2)['(Intercept)'] # Operator 1 and Resin 1 are the reference levels

## (Intercept) 
##       36.25

# Mean of operator 2, resin 2 is 35.35
coef(lm2)['(Intercept)'] + coef(lm2)['operator2'] + coef(lm2)['resin2'] + coef(lm2)['operator2:resin2']

## (Intercept) 
##       35.35

# ...and so on

# Likelihood ratio test
# We can test any hypothesis we want by specifying full model and reduced model.
# Let's first test the interaction hypothesis
# This corresponds to the gamma_ij = 0 for all ij in the linear model we wrote down
# In the equivalent linear model fit here, this corresponds to jointly testing all the operator:resin interaction
# coefficients equalling 0

lm3 <- lm(psize ~ operator + resin,data=pvc_t)
summary(lm3)

## 
## Call:
## lm(formula = psize ~ operator + resin, data = pvc_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9500 -0.6125 -0.0167  0.4063  3.6833 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.2396     0.5226  69.345  < 2e-16 ***
## operator2    -0.2625     0.4048  -0.648  0.52059    
## operator3    -1.5063     0.4048  -3.721  0.00064 ***
## resin2       -1.0333     0.6610  -1.563  0.12630    
## resin3       -5.8000     0.6610  -8.774 1.13e-10 ***
## resin4       -6.1833     0.6610  -9.354 2.11e-11 ***
## resin5       -4.8000     0.6610  -7.261 1.09e-08 ***
## resin6       -5.4500     0.6610  -8.245 5.46e-10 ***
## resin7       -2.9167     0.6610  -4.412 8.16e-05 ***
## resin8       -0.1833     0.6610  -0.277  0.78302    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.145 on 38 degrees of freedom
## Multiple R-squared:  0.8595, Adjusted R-squared:  0.8262 
## F-statistic: 25.82 on 9 and 38 DF,  p-value: 1.474e-13

anova(lm3,lm2)

## Analysis of Variance Table
## 
## Model 1: psize ~ operator + resin
## Model 2: psize ~ operator + resin + operator:resin
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     38 49.815                           
## 2     24 35.480 14    14.335 0.6926 0.7599

# Sum of squares, F and p all the same
#
# Let's test whether resin is useful at all- jointly test main effect of resin = 0 and interaction = 0

lm4 <- lm(psize ~ operator,data=pvc_t)
summary(lm4)

## 
## Call:
## lm(formula = psize ~ operator, data = pvc_t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3375 -2.2813 -0.6094  2.3719  4.5625 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  32.9438     0.6809  48.386   <2e-16 ***
## operator2    -0.2625     0.9629  -0.273    0.786    
## operator3    -1.5063     0.9629  -1.564    0.125    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.723 on 45 degrees of freedom
## Multiple R-squared:  0.05845,    Adjusted R-squared:  0.0166 
## F-statistic: 1.397 on 2 and 45 DF,  p-value: 0.2579

anova(lm4,lm2)

## Analysis of Variance Table
## 
## Model 1: psize ~ operator
## Model 2: psize ~ operator + resin + operator:resin
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1     45 333.76                                 
## 2     24  35.48 21    298.28 9.608 3.437e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

### Full Example ###
#
# The warpbreaks data (Faraway LMR, chapter 15, exercise 1) contains data on breaks in yarn woven
# by 9 looms, for each combination of levels of 2 factors: wool type (2 levels) and tension level (3 levels)
# 

# To clear environment from previous analysis:
rm(list = ls())

# Step 1: load the data

data(warpbreaks)
warpbreaks_tbl <- dplyr::as_data_frame(warpbreaks)

glimpse(warpbreaks_tbl)

## Observations: 54
## Variables: 3
## $ breaks  <dbl> 26, 30, 54, 25, 70, 52, 51, 26, 67, 18, 21, 29, 17, 12...
## $ wool    <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...
## $ tension <fct> L, L, L, L, L, L, L, L, L, M, M, M, M, M, M, M, M, M, ...

# Check the observation counts in each level of the factors
warpbreaks_tbl %>%
  group_by(wool,tension) %>%
  summarize(cnt = n())

## # A tibble: 6 x 3
## # Groups:   wool [?]
##   wool  tension   cnt
##   <fct> <fct>   <int>
## 1 A     L           9
## 2 A     M           9
## 3 A     H           9
## 4 B     L           9
## 5 B     M           9
## 6 B     H           9

# Plot the data. We're looking for
# 1) Main effects, and equality of variance- pairwise boxplots
# 2) Interaction effects- interaction plots
# 3) Normality of the raw data

# 1)
warpbreaks_boxplot_1 <- warpbreaks_tbl %>%
  ggplot(aes(x = wool,y = breaks)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of breaks by wool type",
       x = "Wool Type",
       y = "Breaks")

warpbreaks_boxplot_2 <- warpbreaks_tbl %>%
  ggplot(aes(x = tension,y = breaks)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of breaks by level of tension",
       x = "Tension Level",
       y = "Breaks")

cowplot::plot_grid(warpbreaks_boxplot_1,warpbreaks_boxplot_2,nrow=1)

# Comments?

# 2)

warpbreaks_interactionplot_1 <- warpbreaks_tbl %>%
  group_by(wool,tension) %>%
  summarize(group_mean = mean(breaks)) %>%
  ggplot(aes(x = wool,y = group_mean,group = tension)) +
  theme_classic() +
  facet_grid(~tension) +
  geom_point() +
  geom_line() +
  labs(title = "Interaction Plot, wool x tension",
       x = "Wool",
       y = "Mean # of breaks")

warpbreaks_interactionplot_2 <- warpbreaks_tbl %>%
  group_by(wool,tension) %>%
  summarize(group_mean = mean(breaks)) %>%
  ggplot(aes(x = tension,y = group_mean,group = wool)) +
  theme_classic() +
  facet_grid(~wool) +
  geom_point() +
  geom_line() +
  labs(title = "Interaction Plot, tension x wool",
       x = "Tension",
       y = "Mean # of breaks")

cowplot::plot_grid(warpbreaks_interactionplot_1,warpbreaks_interactionplot_2)

# Do you think there is a significant interaction?
# Does the relationship between the mean # of breaks for wool A and wool B differ across different levels of tension?
# Does the relationship between the mean # of breaks between low, medium and high tensions differ for each wool type?

# 3) 
warpbreaks_tbl %>%
  mutate_at("breaks",funs( (. - mean(.)) / sd(.))) %>%
  arrange(breaks) %>%
  mutate(q = qnorm(1:n() / (n() + 1))) %>%
  ggplot(aes(x = q,y = breaks)) +
  theme_classic() +
  geom_point() +
  geom_abline(slope = 1,intercept = 0) +
  labs(title = "Normal QQ-plot",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles")

# That looks problematic. What to do?
# We need to find a suitable transformation that makes the response look closer to normally distributed
# Look closer

warpbreaks_tbl %>%
  ggplot(aes(x = breaks)) +
  theme_classic() +
  geom_histogram(aes(y = ..density..),colour="black",fill="#E89A3D",bins = 15) +
  geom_density(colour = "#A200E4") +
  labs(title = "Histogram and density for observed breaks data",
       x = "Breaks",
       y = "Density")

# The problem appears to be a long right tail.
# The response is positive- use a box-cox transformation

library(MASS)

breaks_boxcox <- boxcox(breaks ~ 1,data=warpbreaks_tbl)

breaks_boxcox$x[which(breaks_boxcox$y == max(breaks_boxcox$y))]

## [1] -0.2222222

# Indicates that lambda = -0.2222... is the best value for a power transformation, but that pretty much any value
# between around -.75 and .5 are acceptable.
# Lambda = 0 corresponds to a log transformation, so let's do that, since the result remains interpretable

warpbreaks_tbl_transform <- warpbreaks_tbl %>%
  mutate(log_breaks = log(breaks))

# If you get stupid ggplot errors like "invalid graphics state", whatever
# that means, try typing dev.off() to reset the internal graphics
# display device

warpbreaks_tbl_transform %>%
  ggplot(aes(x = log_breaks)) +
  theme_classic() +
  geom_histogram(aes(y = ..density..),colour="black",fill="#E89A3D",bins = 15) +
  geom_density(colour = "#A200E4") +
  labs(title = "Histogram and density for log-transformed observed breaks data",
       x = "Breaks",
       y = "Density")

# I mean, it looks better...
warpbreaks_tbl_transform %>%
  mutate_at("log_breaks",funs( (. - mean(.)) / sd(.))) %>%
  arrange(log_breaks) %>%
  mutate(q = qnorm(1:n() / (n() + 1))) %>%
  ggplot(aes(x = q,y = log_breaks)) +
  theme_classic() +
  geom_point() +
  geom_abline(slope = 1,intercept = 0) +
  labs(title = "Normal QQ-plot",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles")

# That looks WAY better!
# But now, we have to re-do the original plots, don't we?

# 1)
warpbreaks_transformed_boxplot_1 <- warpbreaks_tbl_transform %>%
  ggplot(aes(x = wool,y = log_breaks)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of log(breaks) by wool type",
       x = "Wool Type",
       y = "log(Breaks)")

warpbreaks_transformed_boxplot_2 <- warpbreaks_tbl_transform %>%
  ggplot(aes(x = tension,y = log_breaks)) +
  theme_classic() +
  geom_boxplot() +
  labs(title = "Boxplot of log(breaks) by level of tension",
       x = "Tension Level",
       y = "log(Breaks)")

cowplot::plot_grid(warpbreaks_transformed_boxplot_1,warpbreaks_transformed_boxplot_2,nrow=1)

# Comments?
# The outliers disappeared, and now equality of variance looks more reasonable

# 2)

warpbreaks_transform_interactionplot_1 <- warpbreaks_tbl_transform %>%
  group_by(wool,tension) %>%
  summarize(group_mean = mean(log_breaks)) %>%
  ggplot(aes(x = wool,y = group_mean,group = tension)) +
  theme_classic() +
  facet_grid(~tension) +
  geom_point() +
  geom_line() +
  labs(title = "Interaction Plot, wool x tension",
       x = "Wool",
       y = "Mean log(# of breaks)")

warpbreaks_transform_interactionplot_2 <- warpbreaks_tbl_transform %>%
  group_by(wool,tension) %>%
  summarize(group_mean = mean(log_breaks)) %>%
  ggplot(aes(x = tension,y = group_mean,group = wool)) +
  theme_classic() +
  facet_grid(~wool) +
  geom_point() +
  geom_line() +
  labs(title = "Interaction Plot, tension x wool",
       x = "Tension",
       y = "Mean log(# of breaks)")

cowplot::plot_grid(warpbreaks_transform_interactionplot_1,warpbreaks_transform_interactionplot_2)

# Looks similar, but note the scale of the y-axis
# 
# We can now perform an ANOVA in order to test hypotheses of interest.
# There are two possible main effects, and one interaction, that we may wish to test
# We can first test the interaction, then test each of the main effects in question
# 
# First let's look at the ANOVA table

warpbreaks_aov1 <- aov(log_breaks ~ wool*tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_aov1)

##              Df Sum Sq Mean Sq F value  Pr(>F)   
## wool          1  0.313  0.3125   2.234 0.14151   
## tension       2  2.176  1.0881   7.779 0.00118 **
## wool:tension  2  0.913  0.4566   3.264 0.04686 * 
## Residuals    48  6.714  0.1399                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# We have p-values on the cusp of the arbitrary .05 significance level, so we need to be more careful than usual

warpbreaks_lm1 <- lm(log_breaks ~ wool*tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm1)

## 
## Call:
## lm(formula = log_breaks ~ wool * tension, data = warpbreaks_tbl_transform)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81504 -0.27885  0.04042  0.27319  0.64358 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.7179     0.1247  29.824  < 2e-16 ***
## woolB           -0.4356     0.1763  -2.471  0.01709 *  
## tensionM        -0.6012     0.1763  -3.410  0.00133 ** 
## tensionH        -0.6003     0.1763  -3.405  0.00134 ** 
## woolB:tensionM   0.6281     0.2493   2.519  0.01514 *  
## woolB:tensionH   0.2221     0.2493   0.891  0.37749    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.374 on 48 degrees of freedom
## Multiple R-squared:  0.3363, Adjusted R-squared:  0.2672 
## F-statistic: 4.864 on 5 and 48 DF,  p-value: 0.001116

warpbreaks_lm2 <- lm(log_breaks ~ wool + tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm2)

## 
## Call:
## lm(formula = log_breaks ~ wool + tension, data = warpbreaks_tbl_transform)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80421 -0.29975 -0.01627  0.28367  0.67424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.5762     0.1063  33.644  < 2e-16 ***
## woolB        -0.1522     0.1063  -1.431 0.158540    
## tensionM     -0.2871     0.1302  -2.205 0.032048 *  
## tensionH     -0.4893     0.1302  -3.758 0.000448 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3906 on 50 degrees of freedom
## Multiple R-squared:  0.246,  Adjusted R-squared:  0.2008 
## F-statistic: 5.438 on 3 and 50 DF,  p-value: 0.002576

warpbreaks_lm3 <- lm(log_breaks ~ wool,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm3)

## 
## Call:
## lm(formula = log_breaks ~ wool, data = warpbreaks_tbl_transform)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01485 -0.31728 -0.02329  0.25904  0.93106 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.31744    0.08356  39.701   <2e-16 ***
## woolB       -0.15215    0.11817  -1.288    0.204    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4342 on 52 degrees of freedom
## Multiple R-squared:  0.0309, Adjusted R-squared:  0.01226 
## F-statistic: 1.658 on 1 and 52 DF,  p-value: 0.2036

warpbreaks_lm4 <- lm(log_breaks ~ tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm4)

## 
## Call:
## lm(formula = log_breaks ~ tension, data = warpbreaks_tbl_transform)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86110 -0.27811 -0.04066  0.31199  0.75031 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.5002     0.0930  37.637  < 2e-16 ***
## tensionM     -0.2871     0.1315  -2.183 0.033654 *  
## tensionH     -0.4893     0.1315  -3.720 0.000497 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3946 on 51 degrees of freedom
## Multiple R-squared:  0.2151, Adjusted R-squared:  0.1843 
## F-statistic: 6.989 on 2 and 51 DF,  p-value: 0.002077

# Test the interaction term
anova(warpbreaks_lm2,warpbreaks_lm1)

## Analysis of Variance Table
## 
## Model 1: log_breaks ~ wool + tension
## Model 2: log_breaks ~ wool * tension
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     50 7.6270                              
## 2     48 6.7138  2   0.91315 3.2642 0.04686 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# It's on the edge. Let's test whether a model with only each single main effect fits the data as well
# as a model with both main effects and their interaction
anova(warpbreaks_lm1,warpbreaks_lm3)

## Analysis of Variance Table
## 
## Model 1: log_breaks ~ wool * tension
## Model 2: log_breaks ~ wool
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     48 6.7138                                  
## 2     52 9.8032 -4   -3.0893 5.5217 0.0009709 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(warpbreaks_lm1,warpbreaks_lm4)

## Analysis of Variance Table
## 
## Model 1: log_breaks ~ wool * tension
## Model 2: log_breaks ~ tension
##   Res.Df    RSS Df Sum of Sq     F  Pr(>F)  
## 1     48 6.7138                             
## 2     51 7.9395 -3   -1.2257 2.921 0.04339 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# What do we conclude?

Supplementary R code for lectures 2 and 3, STA303 summer 2018

Alex Stringer

Fri Jul 27 13:06:25 2018