# Load the tidyverse packages
suppressMessages({
  suppressWarnings({
    library(tidyverse)
  })
})

Likelihood

Recall in lecture how we defined the likelihood function: for a random vector \(X = (X_{1},\ldots,X_{n})\) whose joint distribution depends on a parameter \(\theta = (\theta_{1},\ldots,\theta_{p})\), the likelihood function is the joint density/mass function of \(X\), treated as a function of \(\theta\) for fixed \(X\): \[ L(\theta | X) = f(x ; \theta) \] The most common case is where \(X\) represents an IID random sample of a single random variable; the joint density, and hence the likelihood, is a product of the marginal densities of the \(X_{i}\), which are all the same: \[ L(\theta|X) = \prod_{i=1}^{n}f(x_{i};\theta) \] For computational and theoretical reasons, we said we would work primarily with the log-likelihood, \[ \ell(\theta) = \log{L(\theta)} \overset{IID}{=} \sum_{i=1}^{n}\ell_{i}(\theta) \] What does it mean, “for fixed \(X\)”? Every time we take a random sample \(x = (x_{1},\ldots,x_{n})\), we get a different log-likelihood function. This is the point: the likelihood describes the relative frequency with which each value of \(\theta\) would have generated the observed data, in repeated sampling from a population with true parameter equal to \(\theta\). Values of \(\theta\) that give higher likelihoods would generate the observed data more frequently, and hence are said to be more likely to have generated the observed data.

This suggests a framework for estimation. If we have a family of probability distributions in mind (we generally get this from subject-matter theory or by assessing the data visually; see the examples below), then we should find the value(s) of the parameter(s) of this family that maximize the likelihood function; these are the values most likely to have generated the sample we observed. This is the method of maximum likelihood.
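As a tiny illustration of “the likelihood as a function of \(\theta\) for fixed data” (a made-up example, not one of the datasets below): simulate a small IID Normal sample and compare the log-likelihood, i.e. the sum of log densities, at a few candidate values of the mean.

# Toy example: log-likelihood of an IID N(mu, 1) sample at several candidate values of mu
set.seed(261)
toy_sample <- rnorm(10,mean = 5,sd = 1)
toy_loglik <- function(mu) sum(dnorm(toy_sample,mean = mu,sd = 1,log = TRUE))
# Candidates near the sample mean of toy_sample give the highest log-likelihood
sapply(c(3,4,5,6),toy_loglik)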

Examples

Normal Distribution - Beeswax Data

Consider the dataset from Rice (2006), page 379, on the melting point, in degrees Celsius, of beeswax. As described by Rice, the aim of the study was to detect synthetic additions to pure beeswax, and the underlying science suggests that the melting point is one way to detect such additions. It was of scientific interest to fit a probability model to these data. How should we proceed?

We can read in the data as follows:

path <- "https://raw.githubusercontent.com/awstringer1/leaf2018/gh-pages/datasets/"
beeswax <- readr::read_csv(stringr::str_c(path,"beeswax.txt"))
## Parsed with column specification:
## cols(
##   MeltingPoint = col_double(),
##   Hydrocarbon = col_double()
## )
glimpse(beeswax)
## Observations: 59
## Variables: 2
## $ MeltingPoint <dbl> 63.78, 63.45, 63.58, 63.08, 63.40, 64.42, 63.27, ...
## $ Hydrocarbon  <dbl> 14.27, 14.80, 12.28, 17.09, 15.10, 12.92, 15.56, ...

We see the dataset has two variables; we are interested in MeltingPoint. Let’s make a histogram to investigate the shape of the distribution of this variable:

beeswax_hist <- beeswax %>%
  ggplot(aes(x = MeltingPoint)) +
  theme_classic() +
  geom_histogram(aes(y = ..density..),bins = 25,colour = "black",fill = "red") +
  labs(title = "Histogram of Beeswax Melting Points",
       subtitle = "Beeswax data",
       x = "Melting Point (degrees celcius",
       y = "Density")

beeswax_hist

It is hard to tell visually whether a Gaussian curve would be a good fit, since the histogram is not very smooth owing to the small sample size.

Recall the Gaussian distribution has density given by \[ f(x;\mu,\sigma^{2}) = \frac{1}{\sqrt{2\pi\sigma^{2}}}\times\exp\left( -\frac{1}{2\sigma^{2}}\left( x - \mu\right)^{2}\right) \] which for an IID random sample of size \(n\) gives log-likelihood \[ \ell(\mu,\sigma^{2}) = -\frac{n}{2}\log\left( 2\pi\sigma^{2}\right) -\frac{1}{2\sigma^{2}}\sum_{i=1}^{n}\left( x_{i} - \mu\right)^{2} \] Let’s plot this for the beeswax data as a function of \(\mu\) for fixed \(\sigma^{2}\) and vice-versa.

# Log-likelihood for mu, with sigma^2 fixed at the sample variance of the data.
# Vectorized over mu so we can evaluate it on a grid for plotting.
ll_mu <- function(mu) {
  x <- beeswax$MeltingPoint
  s2 <- var(x)
  n <- length(x)
  out <- numeric(length(mu))
  for (i in 1:length(mu)) {
    out[i] <- -(n/2) * log(2*pi*s2) - (1/(2*s2)) * sum((x - mu[i])^2)
  }
  out
}
# Log-likelihood for sigma^2, with mu fixed at the sample mean of the data.
ll_sigma <- function(sigmasq) {
  x <- beeswax$MeltingPoint
  xbar <- mean(x)
  n <- length(x)
  out <- numeric(length(sigmasq))
  for (i in 1:length(sigmasq)) {
    out[i] <- -(n/2) * log(2*pi*sigmasq[i]) - (1/(2*sigmasq[i])) * sum((x - xbar)^2)
  }
  out
}


beeswax_muplot <-  data_frame(mu = seq(62.2,65,by=0.01),
                              ll = ll_mu(mu)) %>%
  ggplot(aes(x = mu,y = ll,group = 1)) +
  theme_classic() + 
  geom_line() +
  geom_vline(xintercept = mean(beeswax$MeltingPoint),colour="purple") +
  labs(title = "Log-Likelihood for mu, Beeswax Melting Point Data",
       subtitle = "Sigma fixed at MLE. Purple line indictaes MLE for mu",
       x = "Mu",
       y = "Log-Likelihood") +
  scale_x_continuous(breaks = seq(62,65,by=0.5))


beeswax_sigmaplot <-  data_frame(sigmasq = seq(0.01,.5,by=0.01),
                              ll = ll_sigma(sigmasq)) %>%
  ggplot(aes(x = sigmasq,y = ll,group = 1)) +
  theme_classic() + 
  geom_line() +
  geom_vline(xintercept = var(beeswax$MeltingPoint),colour="purple") +
  labs(title = "Log-Likelihood for sigma-squared, Beeswax Melting Point Data",
       subtitle = "Mu fixed at MLE. Purple line indicates MLE for sigma-squared",
       x = "Sigma-Squared",
       y = "Log-Likelihood")

cowplot::plot_grid(beeswax_muplot,beeswax_sigmaplot,nrow=1)

We see some interesting things:

  • The log-likelihood for \(\mu\) is symmetric about its maximum, just as the Gaussian density of \(x_{i}\) is symmetric about its mean. This makes sense, because both depend on \(x_{i}\) and \(\mu\) only through the \(\sum_{i=1}^{n}(x_{i} - \mu)^{2}\) term, which treats deviations on either side of \(\mu\) identically
  • The MLE for \(\mu\) is clearly at the centre of this plot, and equals 63.59, which is exactly the centre of the histogram of the data. Values of \(\mu\) the same distance from the maximum in either direction are equally (though less) likely to have generated the observed data
  • The situation for \(\sigma^{2}\) is completely different. The MLE is the sample variance, as seen in lecture, indicated by the purple line (the plot uses R's var(), which divides by \(n-1\) rather than \(n\); for \(n = 59\) the difference is negligible). Values very near the MLE are almost exactly as likely to have generated the observed data as the MLE itself, so we lack precision in estimating this parameter. We also see that values less than the MLE quickly become very unlikely, while values much larger than the MLE remain fairly likely. Think about what these values represent in the context of the observed data. The sample mean is 63.59 and the melting points range from 62.85 to 64.42. If the population variance were very small, say \(0.02\), then seeing a value of \(62.85\) would be extremely improbable, since it would lie more than \(5\) standard deviations below the mean. However, if the population variance were very large, say \(0.5\), then all of the observed values are actually quite close to the mean! Hence larger values of the population variance are much more likely to have generated the data we observed than small values (a quick numerical check follows the list).
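The check, using only the numbers quoted above:

# How many standard deviations below the mean is the smallest melting point,
# under two hypothetical values of the population variance?
(mean(beeswax$MeltingPoint) - min(beeswax$MeltingPoint)) / sqrt(0.02) # about 5 SDs if sigma^2 = 0.02
(mean(beeswax$MeltingPoint) - min(beeswax$MeltingPoint)) / sqrt(0.5)  # about 1 SD if sigma^2 = 0.5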

Let’s investigate further what different values of the parameters mean in the context of the data we have observed. Fixing \(\sigma^{2}\) at its MLE \(s_{n}^{2}\), we overlay a Normal density curve on the histogram of observed data, for \(\mu\) at its MLE, \(\bar{x}\):

beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint),sd = sd(beeswax$MeltingPoint))),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple")

The fit seems to capture the shape of the data, although the histogram is kind of chunky.

Now, what happens when we vary \(\mu\)? Varying \(\mu\) corresponds to fitting curves with the same shape but a different location, i.e. shifted along the \(x\)-axis. At the risk of sounding repetitive: the likelihood function for \(\mu\) describes how likely each curve was to have generated the data we observed. While the above curve is the most likely, curves that are shifted slightly could still have a high probability of generating the sample we saw; curves that are shifted very far away from this curve would have a very low probability of generating the data we saw.

To see what this means, plot the curves generated by a few values of \(\mu\), and think about how probable/improbable each curve is to generate the red histogram if we sampled from it:

cowplot::plot_grid(
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint)-.2,sd = sd(beeswax$MeltingPoint))),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Still somewhat likely!"),
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint)+.2,sd = sd(beeswax$MeltingPoint))),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Still somewhat likely!"),
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint)-.5,sd = sd(beeswax$MeltingPoint))),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Not very likely!"),
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint)+.5,sd = sd(beeswax$MeltingPoint))),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Not very likely!"),
  nrow = 2,
  ncol = 2
)

What about \(\sigma^{2}\)? The first histogram we plotted had both \(\mu\) and \(\sigma^{2}\) fixed at their MLEs, so it is also the reference point for investigating the effect of varying \(\sigma^{2}\). Varying \(\sigma^{2}\) changes the shape of the curve (specifically, how flat or peaked it is) while keeping the location the same. We know from our earlier discussion that low values of \(\sigma^{2}\) should give curves that would generate the observed data with very low probability, while higher values give curves that still could reasonably have generated the observed data. Let’s look at the fit for various values of \(\sigma^{2}\):

cowplot::plot_grid(
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint),sd = sd(beeswax$MeltingPoint)*.95)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Slightly lower sigma-squared - still somewhat likely!"),
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint),sd = sd(beeswax$MeltingPoint)*.2)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Much lower sigma-squared - very unlikely!"),
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint),sd = sd(beeswax$MeltingPoint)*1.2)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Slightly higher sigma-squared - still very likely!"),
  beeswax_hist +
  geom_line(data = data_frame(x = seq(min(beeswax$MeltingPoint),max(beeswax$MeltingPoint),by = 0.01),
                              y = dnorm(x,mean = mean(beeswax$MeltingPoint),sd = sd(beeswax$MeltingPoint)*4)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "purple") +
    labs(subtitle = "Much higher sigma-squared - still pretty likely!"),
  nrow = 2,
  ncol = 2
)

Gamma Distribution - Rainfall Data

Recall the data on rainfall in inches from \(227\) Illinois storms between 1960 and 1964, discussed in the Method of Moments example. We got a good fit from a Gamma distribution, estimating the parameters \(\alpha\) and \(\lambda\) via the Method of Moments. We also fit the simpler Exponential model, obtained from the Gamma model by setting \(\alpha = 1\); its fit was poorer than the Gamma fit.

Let’s apply the method of Maximum Likelihood to estimate the parameters from these data. We will stick with the Gamma model, since for the Exponential distribution the Method of Moments estimator and the Maximum Likelihood Estimator coincide (both depend on the data only through \(\bar{X}\); a short derivation follows). We hope that our maximum likelihood estimates will be very close to the MoM estimates, i.e. that both will give good fits. There is some theory that supports this notion.
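To see why the two estimators coincide for the Exponential model, here is a quick derivation under the rate parameterization \(f(x;\lambda) = \lambda e^{-\lambda x}\): the log-likelihood of an IID sample is \[ \ell(\lambda) = n\log{\lambda} - \lambda\sum_{i=1}^{n}x_{i}, \qquad \frac{\partial\ell}{\partial\lambda} = \frac{n}{\lambda} - n\bar{x} = 0 \implies \hat{\lambda}_{MLE} = \frac{1}{\bar{x}}, \] while the Method of Moments sets \(E(X) = 1/\lambda\) equal to \(\bar{x}\), giving the same estimator \(\hat{\lambda}_{MoM} = 1/\bar{x}\).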

First, we plot the data and compute the MoM estimates as before:

path <- "https://raw.githubusercontent.com/awstringer1/leaf2018/gh-pages/datasets/"
rainfall <- readr::read_csv(stringr::str_c(path,"illinois60.txt"),col_names = "rainfall") %>%
  bind_rows(readr::read_csv(stringr::str_c(path,"illinois61.txt"),col_names = "rainfall")) %>%
  bind_rows(readr::read_csv(stringr::str_c(path,"illinois62.txt"),col_names = "rainfall")) %>%
  bind_rows(readr::read_csv(stringr::str_c(path,"illinois63.txt"),col_names = "rainfall")) %>%
  bind_rows(readr::read_csv(stringr::str_c(path,"illinois64.txt"),col_names = "rainfall"))
## Parsed with column specification:
## cols(
##   rainfall = col_double()
## )
## Parsed with column specification:
## cols(
##   rainfall = col_double()
## )
## Parsed with column specification:
## cols(
##   rainfall = col_double()
## )
## Parsed with column specification:
## cols(
##   rainfall = col_double()
## )
## Parsed with column specification:
## cols(
##   rainfall = col_double()
## )
glimpse(rainfall)
## Observations: 227
## Variables: 1
## $ rainfall <dbl> 0.020, 0.001, 0.001, 0.120, 0.080, 0.420, 1.720, 0.05...
rainplot <- rainfall %>%
  ggplot(aes(x = rainfall)) +
  theme_classic() + 
  geom_histogram(aes(y = ..density..),colour = "black",fill = "#ff9933",bins=50) +
  labs(title = "Emprical Distribution of Rainfall",
       subtitle = "Illinois rainfall dataset, 227 storms from 1960 - 1964",
       x = "Amount of Rainfall (inches)",
       y = "Density")

alpha_mom <- mean(rainfall$rainfall)^2 / var(rainfall$rainfall)
alpha_mom
## [1] 0.3762506
lambda_mom <- mean(rainfall$rainfall) / var(rainfall$rainfall)
lambda_mom
## [1] 1.676755
rainplot +
  geom_line(data = data_frame(x = seq(0,2.5,by=0.01),
                              y = dgamma(x,shape = alpha_mom,rate = lambda_mom)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "blue") +
  labs(title = "Gamma Model for Rainfall Data - Method of Moments")

Let’s plot the likelihood for \(\alpha\) and \(\lambda\) for these data and the Gamma model, and draw lines where the MoM estimators lie.

# Gamma log-likelihood, using the rate parameterization of the density below
# (dgamma's `rate` argument); the defaults are the MoM estimates
gamma_loglik <- function(alpha = mean(x)^2 / var(x),
                      lambda = mean(x) / var(x),
                      x = rainfall$rainfall) {
  sum(dgamma(x,shape = alpha,rate = lambda,log = TRUE))
}

# Evaluate the log-likelihood over a grid of alpha values, with lambda held fixed
gamma_loglik_v_alpha <- function(alpha,
                      lambda = mean(x) / var(x),
                      x = rainfall$rainfall) {
  out <- numeric(length(alpha))
  for (i in 1:length(alpha)) {
    out[i] <- gamma_loglik(alpha = alpha[i],lambda = lambda,x = x)
  }
  out
}
# Evaluate the log-likelihood over a grid of lambda values, with alpha held fixed
gamma_loglik_v_lambda <- function(lambda,
                      alpha = mean(x)^2 / var(x),
                      x = rainfall$rainfall) {
  out <- numeric(length(lambda))
  for (i in 1:length(lambda)) {
    out[i] <- gamma_loglik(alpha = alpha,lambda = lambda[i],x = x)
  }
  out
}

alpha_plot <- data_frame(a = seq(0.01,3,by=0.01),
                         ll = gamma_loglik_v_alpha(alpha = a)) %>%
  ggplot(aes(x = a,y = ll,group = 1)) +
  theme_classic() + 
  geom_line() +
  geom_vline(xintercept = alpha_mom,colour = "purple") +
  labs(title = "Log-Likelihood for Alpha",
       subtitle = "Rainfall Data, Gamma Model - Lambda fixed at MoM value. Purple line is MoM Estimator",
       x = "Alpha",
       y = "Log-Likelihood") +
  scale_x_continuous(breaks = seq(0,3,by=0.2)) +
  scale_y_continuous(breaks = seq(200,-2000,by = -200),labels = scales::comma_format())

lambda_plot <- data_frame(b = seq(0.01,3,by=0.01),
                         ll = gamma_loglik_v_lambda(lambda = b)) %>%
  ggplot(aes(x = b,y = ll,group = 1)) +
  theme_classic() + 
  geom_line() +
  geom_vline(xintercept = lambda_mom,colour = "purple") +
  labs(title = "Log-Likelihood for Lambda",
       subtitle = "Rainfall Data, Gamma Model - Alpha fixed at MoM value. Purple line is MoM Estimator",
       x = "Lambda",
       y = "Log-Likelihood") +
  scale_x_continuous(breaks = seq(0,3,by=0.2)) +
  scale_y_continuous(breaks = seq(200,-2000,by = -200),labels = scales::comma_format())

cowplot::plot_grid(alpha_plot,lambda_plot,nrow=2)

Viewed one at a time, the MoM estimators appear to be suboptimal from the perspective of the likelihood. What is happening? The problem is that each estimate depends on the other: we are fixing each parameter at its MoM value and then looking at the resulting slice of the bivariate likelihood function. This isn’t necessarily the right slice to be looking at; different \(\alpha\) values would produce different \(\lambda\) curves, with potentially different optima. One way to see this is to look at the joint log-likelihood surface directly, as in the sketch below.
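A minimal sketch (the grid limits are chosen by eye, and the object name loglik_grid is mine): evaluate gamma_loglik over a grid of \((\alpha,\lambda)\) pairs and draw the contours of the joint log-likelihood, with the MoM estimate marked in purple.

# Sketch: contours of the joint log-likelihood over a grid of (alpha, lambda) values
loglik_grid <- expand.grid(alpha = seq(0.1,1,by = 0.01),
                           lambda = seq(0.5,3.5,by = 0.05)) %>%
  as_tibble() %>%
  mutate(ll = purrr::map2_dbl(alpha,lambda,~gamma_loglik(alpha = .x,lambda = .y)))

loglik_grid %>%
  ggplot(aes(x = alpha,y = lambda,z = ll)) +
  theme_classic() +
  geom_contour(bins = 25,colour = "black") +
  annotate("point",x = alpha_mom,y = lambda_mom,colour = "purple",size = 2) +
  labs(title = "Joint Log-Likelihood Contours, Rainfall Data, Gamma Model",
       x = "Alpha",
       y = "Lambda")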

Let’s try to maximize the likelihood jointly. We will assume (because it is true; we will not prove or verify it) that the log-likelihood is concave and possesses a unique maximum. The density of \(X \sim Gamma(\alpha,\lambda)\) is \[ f(x;\alpha,\lambda) = \frac{\lambda^{\alpha}}{\Gamma(\alpha)} x^{\alpha-1}\exp\left( -\lambda x\right) \] The log-likelihood is then \[ \ell(\alpha,\lambda) = n\alpha\log{\lambda} - n\log{\Gamma(\alpha)} + (\alpha - 1)\sum_{i=1}^{n}\log{x_{i}} - \lambda n \bar{x} \] The two score statistics here make up the gradient of the log-likelihood \[ \begin{aligned} S(\alpha) &= \frac{\partial\ell}{\partial\alpha} = n\log{\lambda} - n\psi(\alpha) + \sum_{i=1}^{n}\log{x_{i}} \\ S(\lambda) &= \frac{\partial\ell}{\partial\lambda} = \frac{n\alpha}{\lambda} - n\bar{x} \\ \nabla \ell &\equiv \left(S(\alpha),S(\lambda)\right) \end{aligned} \] where \(\psi(\alpha) \equiv \partial \log{\Gamma(\alpha)} / \partial\alpha\) is the “digamma” function. Setting \(\nabla \ell = 0\) gives a system of equations that are satisfied at the maximum: \[ \begin{aligned} \hat{\lambda} &= \frac{\hat{\alpha}}{\bar{x}} \\ \psi(\hat{\alpha}) &= \log{\hat{\lambda}} + \frac{1}{n}\sum_{i=1}^{n}\log{x_{i}} \end{aligned} \] which has no closed-form solution.

We turn to iterative methods. Without going into the mathematical details of optimization, there are two simple methods we can use here:

  • Code up the gradient and find its root using a root-finding method like Newton’s Method (a short sketch of this route appears below)
  • Code up the log-likelihood itself and find its optimum using any box-constrained optimization routine (note there is a lot of overlap between these two possibilities!)

R has good univariate root-finding methods in its base library, and there are special packages that can find multivariate roots. R has even better built-in optimization routines; the nlminb function performs box-constrained optimization on essentially arbitrary functions. With such a well-behaved function, this is likely to be the quickest solution here.
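Here is a minimal sketch of the root-finding route, assuming the stationarity equations derived above; the helper name score_alpha_profile and the search interval are my own choices, and base R’s uniroot() stands in for Newton’s Method. Substituting \(\hat{\lambda} = \hat{\alpha}/\bar{x}\) into \(S(\alpha) = 0\) leaves a single equation in \(\alpha\):

# Sketch: solve log(alpha) - psi(alpha) = log(xbar) - mean(log(x)) for alpha,
# then recover lambda = alpha / xbar. digamma() is R's psi function.
xbar <- mean(rainfall$rainfall)
mean_log_x <- mean(log(rainfall$rainfall))
score_alpha_profile <- function(alpha) log(alpha) - digamma(alpha) - (log(xbar) - mean_log_x)
alpha_root <- uniroot(score_alpha_profile,interval = c(0.01,10))$root
lambda_root <- alpha_root / xbar
# The shape estimate alpha_root should land near 0.44, agreeing with the nlminb fit below
c(alpha_root,lambda_root)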

Let’s give nlminb a try:

# nlminb requires the input function to be of a certain form
# Note also that nlminb performs MINIMIZATION. So if you want to maximize, your function
# should return minus the value it usually returns
input_function <- function(params) -gamma_loglik(alpha = params[1],lambda = params[2])
# Perform the optimization over the positive quadrant of R^2, i.e. (alpha,lambda) > 0
# Use the method of moments estimates as starting values - this is very good practice
opt <- nlminb(start = c(alpha_mom,lambda_mom),objective = input_function,lower = c(0.01,0.01))

print(opt)
## $par
## [1] 0.4407915 0.5090662
## 
## $objective
## [1] -185.3477
## 
## $convergence
## [1] 0
## 
## $iterations
## [1] 17
## 
## $evaluations
## function gradient 
##       27       45 
## 
## $message
## [1] "relative convergence (4)"
alpha_mle <- opt$par[1]
lambda_mle <- opt$par[2]

The MLEs are different from the MoM estimators. How different, in terms of log-likelihood?

gamma_loglik(alpha = alpha_mle,lambda = lambda_mle) - 
gamma_loglik(alpha = alpha_mom,lambda = lambda_mom)
## [1] 35.30569

The MLEs provide a much more likely set of parameters.

Let’s plot the resulting curves:

cowplot::plot_grid(
  rainplot +
  geom_line(data = data_frame(x = seq(0,2.5,by=0.01),
                              y = dgamma(x,shape = alpha_mom,rate = lambda_mom)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "blue") +
  labs(title = "Gamma Model for Rainfall Data - Method of Moments"),
  rainplot +
  geom_line(data = data_frame(x = seq(0,2.5,by=0.01),
                              y = dgamma(x,shape = alpha_mle,rate = lambda_mle)),
            mapping = aes(x = x,y = y,group = 1),
            colour = "blue") +
  labs(title = "Gamma Model for Rainfall Data - Maximum Likelihood"),
  nrow = 1
)