This notebook is the result of the R code presented during the lectures on one- and two-way ANOVA for STA303 (Methods of Data Analysis II), Summer 2018.
This contains code for analyzing two datasets from Faraway, Linear Models with R (chapters 14 and 15), including creating ANOVA tables from scratch, looking at design matrices, etc. as well as using R’s standard modelling API. Also includes many examples of using ggplot, including making QQ plots from scratch
library(tidyverse)
library(faraway)
# Analyze the "pvc" data from chapter 15
data(pvc)
# Put it in a dplyr tbl
pvc_t <- as_data_frame(pvc)
glimpse(pvc_t)
## Observations: 48
## Variables: 3
## $ psize <dbl> 36.2, 36.3, 35.3, 35.0, 30.8, 30.6, 29.8, 29.6, 32.0,...
## $ operator <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,...
## $ resin <fct> 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 1, 1,...
summary(pvc$psize)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.10 30.18 31.85 32.35 35.02 37.10
# Pairwise bloxplot
pvc_t %>%
ggplot(aes(x = operator,y = psize)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of Particle Size by Operator",
x = "Operator",
y = "Particle Size"
)
# Summary statistics
# Grand mean and standard deviation
grand_mean <- pvc_t %>%
summarize(grand_mean = mean(psize),grand_sd = sd(psize))
grand_mean
## # A tibble: 1 x 2
## grand_mean grand_sd
## <dbl> <dbl>
## 1 32.4 2.75
# Operator means and standard deviations
group_means <- pvc_t %>%
group_by(operator) %>%
summarize(group_mean = mean(psize),
group_median = median(psize),
group_sd = sd(psize),
group_size = n())
group_means
## # A tibble: 3 x 5
## operator group_mean group_median group_sd group_size
## <fct> <dbl> <dbl> <dbl> <int>
## 1 1 32.9 32.2 2.72 16
## 2 2 32.7 31.9 2.63 16
## 3 3 31.4 31.0 2.82 16
# Sums of squares
# Here it gets a bit tricky
sums_of_squares <- pvc_t %>%
# Add in the group mean
left_join(group_means,by="operator") %>%
summarize(total = sum( (psize - grand_mean$grand_mean)^2 ),
error = sum( (psize - group_mean)^2 ),
model = total - error
) %>%
gather(type,SS,total:model)
sums_of_squares
## # A tibble: 3 x 2
## type SS
## <chr> <dbl>
## 1 total 354.
## 2 error 334.
## 3 model 20.7
# Create the ANOVA table
m <- pvc_t %>% pull(operator) %>% unique() %>% length()
n <- nrow(pvc_t)
sums_of_squares$df <- c(n-1,n-m,m-1)
anova_table <- sums_of_squares %>%
mutate(MS = SS / df)
anova_table
## # A tibble: 3 x 4
## type SS df MS
## <chr> <dbl> <dbl> <dbl>
## 1 total 354. 47 7.54
## 2 error 334. 45 7.42
## 3 model 20.7 2 10.4
Fstat <- (anova_table %>% filter(type == 'model') %>% pull(MS)) / (anova_table %>% filter(type == 'error') %>% pull(MS))
Fstat
## [1] 1.396666
critval <- qf(.95,m-1,n-m)
critval
## [1] 3.204317
# For a dataset of this size, we'd need to see an average between group sum-of-squares about 3.2 times as large as the average
# within-group sum-of-squares in order to conclude that the group means really were different
pval <- 1 - pf(Fstat,m-1,n-m)
pval
## [1] 0.257939
# ...or instead of all that, we can just use the aov function in R
pvc_anova <- aov(psize ~ operator,data = pvc_t)
summary(pvc_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## operator 2 20.7 10.359 1.397 0.258
## Residuals 45 333.8 7.417
# Total sum of squares:
sum( (pvc_t$psize - mean(pvc_t$psize))^2 )
## [1] 354.4792
var(pvc_t$psize) * (nrow(pvc_t) - 1)
## [1] 354.4792
# Homework: calculate Df column yourself
# Then calculate Mean Sq and F value yourself
### Assumptions
###
# Already looked at standard deviations of groups, and the boxplot
#
# Normality:
#
# Two random variables are equal in distribution
# if and only if their cumulative distribution functions
# are equal.
# Easier to compare their INVERSE CDFs- the so-called
# "quantile functions"
pvc_t %>%
mutate_at("psize",funs( (. - mean(.)) / sd(.))) %>%
arrange(psize) %>%
mutate(q = qnorm(1:n() / (n() + 1))) %>%
ggplot(aes(x = q,y = psize)) +
theme_classic() +
geom_point() +
geom_abline(slope = 1,intercept = 0,colour = "red") +
labs(title = "Normal QQ-plot",
x = "Theoretical Quantiles",
y = "Sample Quantiles")
# Some minor deviations, but nothing alarming
# We care more about deviations in the tails than
# in the centre, for testing normality of the residuals (why?)
#
# To get better understanding: recreate the above, but for data
# that is definitely not normally distributed
# E.g. rexp(100)
# or rpois(100)
# or even better: rt(100,df = 5)
#
### Using linear regression
###
# Create the model matrix
# Specify the contrasts
#
# contr.treatment = "Effects coding" from lecture
# This is actually the default
# This happens automatically when you include an intercept in the model
# ...and including an intercept is the default.
X1 <- model.matrix(psize ~ operator,data=pvc_t,contrasts = "contr.treatment")
#View(X1)
t(X1) %*% X1 # Not orthogonal
## (Intercept) operator2 operator3
## (Intercept) 48 16 16
## operator2 16 16 0
## operator3 16 0 16
# To get cell means coding, remove the intercept
# Add a -1 into the formula
X2 <- model.matrix(psize ~ operator -1,data=pvc_t)
#View(X2)
t(X2) %*% X2 # Orthogonal
## operator1 operator2 operator3
## operator1 16 0 0
## operator2 0 16 0
## operator3 0 0 16
# Define a standard linear model
#
lm1 <- lm(psize ~ operator,data=pvc_t)
summary(lm1)
##
## Call:
## lm(formula = psize ~ operator, data = pvc_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3375 -2.2813 -0.6094 2.3719 4.5625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.9438 0.6809 48.386 <2e-16 ***
## operator2 -0.2625 0.9629 -0.273 0.786
## operator3 -1.5063 0.9629 -1.564 0.125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.723 on 45 degrees of freedom
## Multiple R-squared: 0.05845, Adjusted R-squared: 0.0166
## F-statistic: 1.397 on 2 and 45 DF, p-value: 0.2579
# What are those parameters estimating?
coef(lm1)[1]
## (Intercept)
## 32.94375
coef(lm1)[1] + coef(lm1)[2]
## (Intercept)
## 32.68125
coef(lm1)[1] + coef(lm1)[3]
## (Intercept)
## 31.4375
group_means
## # A tibble: 3 x 5
## operator group_mean group_median group_sd group_size
## <fct> <dbl> <dbl> <dbl> <int>
## 1 1 32.9 32.2 2.72 16
## 2 2 32.7 31.9 2.63 16
## 3 3 31.4 31.0 2.82 16
# Now fit the null model, and compare them
lmnull <- lm(psize ~ 1,data=pvc_t)
summary(lmnull)
##
## Call:
## lm(formula = psize ~ 1, data = pvc_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2542 -2.1792 -0.5042 2.6708 4.7458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.3542 0.3964 81.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.746 on 47 degrees of freedom
# What is that parameter estimating?
coef(lmnull)
## (Intercept)
## 32.35417
grand_mean$grand_mean
## [1] 32.35417
# Compare them. VERY CONFUSING: in R, the anova() function is used to perform a likelihood ratio test
# that two models fit the data equally well. For a standard linear regression (models of class "lm"), it goes one step further
# and performs an F test, accounting for sigma being estimated (this is review from STA302). The F-test for
# comparison of nested linear regression models, when all variables are categorical, is the same as the F-test
# performed in the ANOVA table output by aov()
#
anova(lmnull,lm1)
## Analysis of Variance Table
##
## Model 1: psize ~ 1
## Model 2: psize ~ operator
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 47 354.48
## 2 45 333.76 2 20.718 1.3967 0.2579
# Same F statistic! This is provably true
### Two-Way ANOVA ###
# The PVC data has two factors: operator and resin
# Want to estimate group means of operator, group means of resin, and the interaction between the two
#
# First need to check: are there data for all combinations of levels?
# Need that to be true in order for the sum of squares decomposition
# as learned in class to hold
# Also, we need replication- more than one datapoint for each combination
pvc_t %>%
group_by(operator,resin) %>%
summarize(count = n()) %>%
print(n = Inf) # Prints the whole table
## # A tibble: 24 x 3
## # Groups: operator [?]
## operator resin count
## <fct> <fct> <int>
## 1 1 1 2
## 2 1 2 2
## 3 1 3 2
## 4 1 4 2
## 5 1 5 2
## 6 1 6 2
## 7 1 7 2
## 8 1 8 2
## 9 2 1 2
## 10 2 2 2
## 11 2 3 2
## 12 2 4 2
## 13 2 5 2
## 14 2 6 2
## 15 2 7 2
## 16 2 8 2
## 17 3 1 2
## 18 3 2 2
## 19 3 3 2
## 20 3 4 2
## 21 3 5 2
## 22 3 6 2
## 23 3 7 2
## 24 3 8 2
# What about just by resin?
pvc_t %>%
group_by(resin) %>%
summarize(count = n(),stddev = sd(psize)) %>%
print(n = Inf) # Prints the whole table
## # A tibble: 8 x 3
## resin count stddev
## <fct> <int> <dbl>
## 1 1 6 0.692
## 2 2 6 1.03
## 3 3 6 1.57
## 4 4 6 1.08
## 5 5 6 1.29
## 6 6 6 0.853
## 7 7 6 1.76
## 8 8 6 1.85
# Yes, and furthermore, the design is balanced- each possible combination of factors has an equal number of reps
# Start in the same way, with boxplots, this time of both factors
boxplt1 <- pvc_t %>%
ggplot(aes(x = operator,y = psize)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of Particle Size by Operator",
x = "Operator",
y = "Particle Size"
)
boxplt2 <- pvc_t %>%
ggplot(aes(x = resin,y = psize)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of Particle Size by Resin",
x = "Resin",
y = "Particle Size"
)
# Plot these plots on a grid, together
# We use the plot_grid function in the cowplot package
# This is the only function we use from this package,
# so don't load the whole package with library()
# Just reference a single function, with cowplot::
# Still need to install with install.packages("cowplot")
cowplot::plot_grid(boxplt1,boxplt2,nrow=1)
# Unfortunately, only 6 observations per resin level
# Not enough to accurately estimate a variance
# Constant variance assumption? INCONCLUSIVE
#
# Interaction plots
# You can do these two ways...
#
# Note: if you're trying to plot a line in ggplot
# and you get an error about the "group aesthetic",
# try setting group=1 in aes()
plt1 <- pvc_t %>%
group_by(operator,resin) %>%
summarize(psize = mean(psize)) %>%
ggplot(aes(x = operator,y = psize,group=resin)) +
facet_grid(~resin) +
theme_classic() +
geom_point() +
geom_path() +
labs(title = "Interaction Plot",
subtitle = "Facetted by levels of resin",
x = "Operator",
y = "Particle Size")
plt1
plt2 <- pvc_t %>%
group_by(operator,resin) %>%
summarize(psize = mean(psize)) %>%
ggplot(aes(x = resin,y = psize,group=operator)) +
facet_grid(~operator) +
theme_classic() +
geom_point() +
geom_path() +
labs(title = "Interaction Plot",
subtitle = "Facetted by levels of operator",
x = "Resin",
y = "Particle Size")
plt2
cowplot::plot_grid(plt1,plt2,ncol=1)
# Does the mean of operator differ among levels of resin?
# Does the mean of resin differ among levels of operator?
#
# For marginal mean: look at slope of line (different from zero?)
# and whether lines have same vertical location across plots
#
# For interactions, look at shape of lines. Same across plots?
#
# Even with practice, it is very difficult to accurately assess all three things
# (two marginal means and their interaction) simultaneously, so be careful!
# Do the ANOVA using aov()
twoway_anova <- aov(psize ~ operator + resin + operator:resin,data=pvc_t)
summary(twoway_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## operator 2 20.72 10.36 7.007 0.00401 **
## resin 7 283.95 40.56 27.439 5.66e-10 ***
## operator:resin 14 14.34 1.02 0.693 0.75987
## Residuals 24 35.48 1.48
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Or, more common:
twoway_anova <- aov(psize ~ operator * resin,data=pvc_t)
summary(twoway_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## operator 2 20.72 10.36 7.007 0.00401 **
## resin 7 283.95 40.56 27.439 5.66e-10 ***
## operator:resin 14 14.34 1.02 0.693 0.75987
## Residuals 24 35.48 1.48
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compare the sums of squares to the ANOVA without interactions
twoway_anova_mainonly <- aov(psize ~ operator + resin,data=pvc_t)
summary(twoway_anova_mainonly)
## Df Sum Sq Mean Sq F value Pr(>F)
## operator 2 20.72 10.36 7.902 0.00135 **
## resin 7 283.95 40.56 30.943 8.11e-14 ***
## Residuals 38 49.82 1.31
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The sums of squares are the same, but the MSE went down, which makes the F-stats for these
# effects larger, and the corresponding p-values smaller
# Equivalent regression model
lm2 <- lm(psize ~ operator + resin + operator:resin,data=pvc_t)
summary(lm2)
##
## Call:
## lm(formula = psize ~ operator + resin + operator:resin, data = pvc_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7000 -0.3625 0.0000 0.3625 2.7000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.2500 0.8598 42.164 < 2e-16 ***
## operator2 -0.8500 1.2159 -0.699 0.491216
## operator3 -0.9500 1.2159 -0.781 0.442245
## resin2 -1.1000 1.2159 -0.905 0.374615
## resin3 -5.5500 1.2159 -4.565 0.000126 ***
## resin4 -6.5500 1.2159 -5.387 1.56e-05 ***
## resin5 -4.4000 1.2159 -3.619 0.001372 **
## resin6 -6.0500 1.2159 -4.976 4.42e-05 ***
## resin7 -3.3500 1.2159 -2.755 0.011014 *
## resin8 0.5500 1.2159 0.452 0.655078
## operator2:resin2 1.0500 1.7195 0.611 0.547175
## operator3:resin2 -0.8500 1.7195 -0.494 0.625567
## operator2:resin3 -0.2000 1.7195 -0.116 0.908372
## operator3:resin3 -0.5500 1.7195 -0.320 0.751842
## operator2:resin4 1.2000 1.7195 0.698 0.491960
## operator3:resin4 -0.1000 1.7195 -0.058 0.954105
## operator2:resin5 0.4000 1.7195 0.233 0.818024
## operator3:resin5 -1.6000 1.7195 -0.931 0.361376
## operator2:resin6 1.3000 1.7195 0.756 0.456985
## operator3:resin6 0.5000 1.7195 0.291 0.773715
## operator2:resin7 0.4500 1.7195 0.262 0.795782
## operator3:resin7 0.8500 1.7195 0.494 0.625567
## operator2:resin8 0.5000 1.7195 0.291 0.773715
## operator3:resin8 -2.7000 1.7195 -1.570 0.129454
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.216 on 24 degrees of freedom
## Multiple R-squared: 0.8999, Adjusted R-squared: 0.804
## F-statistic: 9.382 on 23 and 24 DF, p-value: 3.447e-07
# What are all these parameters estimating?
group_means_int <- pvc_t %>%
group_by(operator,resin) %>%
summarize(psize = mean(psize))
group_means_int
## # A tibble: 24 x 3
## # Groups: operator [?]
## operator resin psize
## <fct> <fct> <dbl>
## 1 1 1 36.2
## 2 1 2 35.2
## 3 1 3 30.7
## 4 1 4 29.7
## 5 1 5 31.8
## 6 1 6 30.2
## 7 1 7 32.9
## 8 1 8 36.8
## 9 2 1 35.4
## 10 2 2 35.4
## # ... with 14 more rows
# That's the mean of each group. Now, the mean should equal regression intercept plus main effect plus interaction
# For example...
# Mean of operator 1, resin 1 is 36.25
coef(lm2)['(Intercept)'] # Operator 1 and Resin 1 are the reference levels
## (Intercept)
## 36.25
# Mean of operator 2, resin 2 is 35.35
coef(lm2)['(Intercept)'] + coef(lm2)['operator2'] + coef(lm2)['resin2'] + coef(lm2)['operator2:resin2']
## (Intercept)
## 35.35
# ...and so on
# Likelihood ratio test
# We can test any hypothesis we want by specifying full model and reduced model.
# Let's first test the interaction hypothesis
# This corresponds to the gamma_ij = 0 for all ij in the linear model we wrote down
# In the equivalent linear model fit here, this corresponds to jointly testing all the operator:resin interaction
# coefficients equalling 0
lm3 <- lm(psize ~ operator + resin,data=pvc_t)
summary(lm3)
##
## Call:
## lm(formula = psize ~ operator + resin, data = pvc_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9500 -0.6125 -0.0167 0.4063 3.6833
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.2396 0.5226 69.345 < 2e-16 ***
## operator2 -0.2625 0.4048 -0.648 0.52059
## operator3 -1.5063 0.4048 -3.721 0.00064 ***
## resin2 -1.0333 0.6610 -1.563 0.12630
## resin3 -5.8000 0.6610 -8.774 1.13e-10 ***
## resin4 -6.1833 0.6610 -9.354 2.11e-11 ***
## resin5 -4.8000 0.6610 -7.261 1.09e-08 ***
## resin6 -5.4500 0.6610 -8.245 5.46e-10 ***
## resin7 -2.9167 0.6610 -4.412 8.16e-05 ***
## resin8 -0.1833 0.6610 -0.277 0.78302
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.145 on 38 degrees of freedom
## Multiple R-squared: 0.8595, Adjusted R-squared: 0.8262
## F-statistic: 25.82 on 9 and 38 DF, p-value: 1.474e-13
anova(lm3,lm2)
## Analysis of Variance Table
##
## Model 1: psize ~ operator + resin
## Model 2: psize ~ operator + resin + operator:resin
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 38 49.815
## 2 24 35.480 14 14.335 0.6926 0.7599
# Sum of squares, F and p all the same
#
# Let's test whether resin is useful at all- jointly test main effect of resin = 0 and interaction = 0
lm4 <- lm(psize ~ operator,data=pvc_t)
summary(lm4)
##
## Call:
## lm(formula = psize ~ operator, data = pvc_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3375 -2.2813 -0.6094 2.3719 4.5625
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.9438 0.6809 48.386 <2e-16 ***
## operator2 -0.2625 0.9629 -0.273 0.786
## operator3 -1.5063 0.9629 -1.564 0.125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.723 on 45 degrees of freedom
## Multiple R-squared: 0.05845, Adjusted R-squared: 0.0166
## F-statistic: 1.397 on 2 and 45 DF, p-value: 0.2579
anova(lm4,lm2)
## Analysis of Variance Table
##
## Model 1: psize ~ operator
## Model 2: psize ~ operator + resin + operator:resin
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 333.76
## 2 24 35.48 21 298.28 9.608 3.437e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Full Example ###
#
# The warpbreaks data (Faraway LMR, chapter 15, exercise 1) contains data on breaks in yarn woven
# by 9 looms, for each combination of levels of 2 factors: wool type (2 levels) and tension level (3 levels)
#
# To clear environment from previous analysis:
rm(list = ls())
# Step 1: load the data
data(warpbreaks)
warpbreaks_tbl <- dplyr::as_data_frame(warpbreaks)
glimpse(warpbreaks_tbl)
## Observations: 54
## Variables: 3
## $ breaks <dbl> 26, 30, 54, 25, 70, 52, 51, 26, 67, 18, 21, 29, 17, 12...
## $ wool <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, ...
## $ tension <fct> L, L, L, L, L, L, L, L, L, M, M, M, M, M, M, M, M, M, ...
# Check the observation counts in each level of the factors
warpbreaks_tbl %>%
group_by(wool,tension) %>%
summarize(cnt = n())
## # A tibble: 6 x 3
## # Groups: wool [?]
## wool tension cnt
## <fct> <fct> <int>
## 1 A L 9
## 2 A M 9
## 3 A H 9
## 4 B L 9
## 5 B M 9
## 6 B H 9
# Plot the data. We're looking for
# 1) Main effects, and equality of variance- pairwise boxplots
# 2) Interaction effects- interaction plots
# 3) Normality of the raw data
# 1)
warpbreaks_boxplot_1 <- warpbreaks_tbl %>%
ggplot(aes(x = wool,y = breaks)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of breaks by wool type",
x = "Wool Type",
y = "Breaks")
warpbreaks_boxplot_2 <- warpbreaks_tbl %>%
ggplot(aes(x = tension,y = breaks)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of breaks by level of tension",
x = "Tension Level",
y = "Breaks")
cowplot::plot_grid(warpbreaks_boxplot_1,warpbreaks_boxplot_2,nrow=1)
# Comments?
# 2)
warpbreaks_interactionplot_1 <- warpbreaks_tbl %>%
group_by(wool,tension) %>%
summarize(group_mean = mean(breaks)) %>%
ggplot(aes(x = wool,y = group_mean,group = tension)) +
theme_classic() +
facet_grid(~tension) +
geom_point() +
geom_line() +
labs(title = "Interaction Plot, wool x tension",
x = "Wool",
y = "Mean # of breaks")
warpbreaks_interactionplot_2 <- warpbreaks_tbl %>%
group_by(wool,tension) %>%
summarize(group_mean = mean(breaks)) %>%
ggplot(aes(x = tension,y = group_mean,group = wool)) +
theme_classic() +
facet_grid(~wool) +
geom_point() +
geom_line() +
labs(title = "Interaction Plot, tension x wool",
x = "Tension",
y = "Mean # of breaks")
cowplot::plot_grid(warpbreaks_interactionplot_1,warpbreaks_interactionplot_2)
# Do you think there is a significant interaction?
# Does the relationship between the mean # of breaks for wool A and wool B differ across different levels of tension?
# Does the relationship between the mean # of breaks between low, medium and high tensions differ for each wool type?
# 3)
warpbreaks_tbl %>%
mutate_at("breaks",funs( (. - mean(.)) / sd(.))) %>%
arrange(breaks) %>%
mutate(q = qnorm(1:n() / (n() + 1))) %>%
ggplot(aes(x = q,y = breaks)) +
theme_classic() +
geom_point() +
geom_abline(slope = 1,intercept = 0) +
labs(title = "Normal QQ-plot",
x = "Theoretical Quantiles",
y = "Sample Quantiles")
# That looks problematic. What to do?
# We need to find a suitable transformation that makes the response look closer to normally distributed
# Look closer
warpbreaks_tbl %>%
ggplot(aes(x = breaks)) +
theme_classic() +
geom_histogram(aes(y = ..density..),colour="black",fill="#E89A3D",bins = 15) +
geom_density(colour = "#A200E4") +
labs(title = "Histogram and density for observed breaks data",
x = "Breaks",
y = "Density")
# The problem appears to be a long right tail.
# The response is positive- use a box-cox transformation
library(MASS)
breaks_boxcox <- boxcox(breaks ~ 1,data=warpbreaks_tbl)
breaks_boxcox$x[which(breaks_boxcox$y == max(breaks_boxcox$y))]
## [1] -0.2222222
# Indicates that lambda = -0.2222... is the best value for a power transformation, but that pretty much any value
# between around -.75 and .5 are acceptable.
# Lambda = 0 corresponds to a log transformation, so let's do that, since the result remains interpretable
warpbreaks_tbl_transform <- warpbreaks_tbl %>%
mutate(log_breaks = log(breaks))
# If you get stupid ggplot errors like "invalid graphics state", whatever
# that means, try typing dev.off() to reset the internal graphics
# display device
warpbreaks_tbl_transform %>%
ggplot(aes(x = log_breaks)) +
theme_classic() +
geom_histogram(aes(y = ..density..),colour="black",fill="#E89A3D",bins = 15) +
geom_density(colour = "#A200E4") +
labs(title = "Histogram and density for log-transformed observed breaks data",
x = "Breaks",
y = "Density")
# I mean, it looks better...
warpbreaks_tbl_transform %>%
mutate_at("log_breaks",funs( (. - mean(.)) / sd(.))) %>%
arrange(log_breaks) %>%
mutate(q = qnorm(1:n() / (n() + 1))) %>%
ggplot(aes(x = q,y = log_breaks)) +
theme_classic() +
geom_point() +
geom_abline(slope = 1,intercept = 0) +
labs(title = "Normal QQ-plot",
x = "Theoretical Quantiles",
y = "Sample Quantiles")
# That looks WAY better!
# But now, we have to re-do the original plots, don't we?
# 1)
warpbreaks_transformed_boxplot_1 <- warpbreaks_tbl_transform %>%
ggplot(aes(x = wool,y = log_breaks)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of log(breaks) by wool type",
x = "Wool Type",
y = "log(Breaks)")
warpbreaks_transformed_boxplot_2 <- warpbreaks_tbl_transform %>%
ggplot(aes(x = tension,y = log_breaks)) +
theme_classic() +
geom_boxplot() +
labs(title = "Boxplot of log(breaks) by level of tension",
x = "Tension Level",
y = "log(Breaks)")
cowplot::plot_grid(warpbreaks_transformed_boxplot_1,warpbreaks_transformed_boxplot_2,nrow=1)
# Comments?
# The outliers disappeared, and now equality of variance looks more reasonable
# 2)
warpbreaks_transform_interactionplot_1 <- warpbreaks_tbl_transform %>%
group_by(wool,tension) %>%
summarize(group_mean = mean(log_breaks)) %>%
ggplot(aes(x = wool,y = group_mean,group = tension)) +
theme_classic() +
facet_grid(~tension) +
geom_point() +
geom_line() +
labs(title = "Interaction Plot, wool x tension",
x = "Wool",
y = "Mean log(# of breaks)")
warpbreaks_transform_interactionplot_2 <- warpbreaks_tbl_transform %>%
group_by(wool,tension) %>%
summarize(group_mean = mean(log_breaks)) %>%
ggplot(aes(x = tension,y = group_mean,group = wool)) +
theme_classic() +
facet_grid(~wool) +
geom_point() +
geom_line() +
labs(title = "Interaction Plot, tension x wool",
x = "Tension",
y = "Mean log(# of breaks)")
cowplot::plot_grid(warpbreaks_transform_interactionplot_1,warpbreaks_transform_interactionplot_2)
# Looks similar, but note the scale of the y-axis
#
# We can now perform an ANOVA in order to test hypotheses of interest.
# There are two possible main effects, and one interaction, that we may wish to test
# We can first test the interaction, then test each of the main effects in question
#
# First let's look at the ANOVA table
warpbreaks_aov1 <- aov(log_breaks ~ wool*tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_aov1)
## Df Sum Sq Mean Sq F value Pr(>F)
## wool 1 0.313 0.3125 2.234 0.14151
## tension 2 2.176 1.0881 7.779 0.00118 **
## wool:tension 2 0.913 0.4566 3.264 0.04686 *
## Residuals 48 6.714 0.1399
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# We have p-values on the cusp of the arbitrary .05 significance level, so we need to be more careful than usual
warpbreaks_lm1 <- lm(log_breaks ~ wool*tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm1)
##
## Call:
## lm(formula = log_breaks ~ wool * tension, data = warpbreaks_tbl_transform)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81504 -0.27885 0.04042 0.27319 0.64358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7179 0.1247 29.824 < 2e-16 ***
## woolB -0.4356 0.1763 -2.471 0.01709 *
## tensionM -0.6012 0.1763 -3.410 0.00133 **
## tensionH -0.6003 0.1763 -3.405 0.00134 **
## woolB:tensionM 0.6281 0.2493 2.519 0.01514 *
## woolB:tensionH 0.2221 0.2493 0.891 0.37749
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.374 on 48 degrees of freedom
## Multiple R-squared: 0.3363, Adjusted R-squared: 0.2672
## F-statistic: 4.864 on 5 and 48 DF, p-value: 0.001116
warpbreaks_lm2 <- lm(log_breaks ~ wool + tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm2)
##
## Call:
## lm(formula = log_breaks ~ wool + tension, data = warpbreaks_tbl_transform)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80421 -0.29975 -0.01627 0.28367 0.67424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5762 0.1063 33.644 < 2e-16 ***
## woolB -0.1522 0.1063 -1.431 0.158540
## tensionM -0.2871 0.1302 -2.205 0.032048 *
## tensionH -0.4893 0.1302 -3.758 0.000448 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3906 on 50 degrees of freedom
## Multiple R-squared: 0.246, Adjusted R-squared: 0.2008
## F-statistic: 5.438 on 3 and 50 DF, p-value: 0.002576
warpbreaks_lm3 <- lm(log_breaks ~ wool,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm3)
##
## Call:
## lm(formula = log_breaks ~ wool, data = warpbreaks_tbl_transform)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.01485 -0.31728 -0.02329 0.25904 0.93106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.31744 0.08356 39.701 <2e-16 ***
## woolB -0.15215 0.11817 -1.288 0.204
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4342 on 52 degrees of freedom
## Multiple R-squared: 0.0309, Adjusted R-squared: 0.01226
## F-statistic: 1.658 on 1 and 52 DF, p-value: 0.2036
warpbreaks_lm4 <- lm(log_breaks ~ tension,data=warpbreaks_tbl_transform)
summary(warpbreaks_lm4)
##
## Call:
## lm(formula = log_breaks ~ tension, data = warpbreaks_tbl_transform)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86110 -0.27811 -0.04066 0.31199 0.75031
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5002 0.0930 37.637 < 2e-16 ***
## tensionM -0.2871 0.1315 -2.183 0.033654 *
## tensionH -0.4893 0.1315 -3.720 0.000497 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3946 on 51 degrees of freedom
## Multiple R-squared: 0.2151, Adjusted R-squared: 0.1843
## F-statistic: 6.989 on 2 and 51 DF, p-value: 0.002077
# Test the interaction term
anova(warpbreaks_lm2,warpbreaks_lm1)
## Analysis of Variance Table
##
## Model 1: log_breaks ~ wool + tension
## Model 2: log_breaks ~ wool * tension
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 50 7.6270
## 2 48 6.7138 2 0.91315 3.2642 0.04686 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# It's on the edge. Let's test whether a model with only each single main effect fits the data as well
# as a model with both main effects and their interaction
anova(warpbreaks_lm1,warpbreaks_lm3)
## Analysis of Variance Table
##
## Model 1: log_breaks ~ wool * tension
## Model 2: log_breaks ~ wool
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 6.7138
## 2 52 9.8032 -4 -3.0893 5.5217 0.0009709 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(warpbreaks_lm1,warpbreaks_lm4)
## Analysis of Variance Table
##
## Model 1: log_breaks ~ wool * tension
## Model 2: log_breaks ~ tension
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 6.7138
## 2 51 7.9395 -3 -1.2257 2.921 0.04339 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# What do we conclude?