--- title: "Nonlinear model approximation" output: rmarkdown::html_vignette: rmarkdown::pdf_document: vignette: > %\VignetteIndexEntry{Nonlinear model approximation} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} header-includes: - \newcommand{\bm}[1]{\boldsymbol{#1}} - \newcommand{\wt}[1]{\widetilde{#1}} - \newcommand{\ol}[1]{\overline{#1}} - \newcommand{\wh}[1]{\widehat{#1}} - \DeclareMathOperator*{\argmax}{arg\,max} - \newcommand{\proper}[1]{\mathsf{#1}} - \newcommand{\pExp}{\proper{Exp}} - \newcommand{\pN}{\proper{N}} - \newcommand{\pPo}{\proper{Po}} - \newcommand{\pGa}{\proper{Ga}} - \newcommand{\pE}{\proper{E}} - \newcommand{\pP}{\proper{P}} - \newcommand{\pVar}{\proper{Var}} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", dev = "png", dev.args = list(type = "cairo-png"), fig.width = 7, fig.height = 5 ) ``` ```{r setup, include = FALSE} library(inlabru) library(ggplot2) ``` See the `method` vignette for details of the general `inlabru` method. Here, we'll take a look at a small toy example to see the difference between a few posterior approximation methods. Note: This vignette is not completed yet. ## A small toy problem Hierarchical model: $$ \begin{aligned} \lambda &\sim \pExp(\gamma) \\ (y_i|\lambda) &\sim \pPo(\lambda), \text{ independent across $i=1,\dots,n$} \end{aligned} $$ With $\ol{y}=\frac{1}{n}\sum_{i=1}^n y_i$, the posterior density is $$ \begin{aligned} p(\lambda | \{y_i\}) &\propto p(\lambda, y_1,\dots,y_n) \\ &\propto \exp(-\gamma\lambda) \exp(-n\lambda) \lambda^{n\ol{y}} \\ &= \exp\{-(\gamma+n)\lambda\} \lambda^{n\ol{y}}, \end{aligned} $$ which is the density of a $\pGa(\alpha = 1+n\ol{y}, \beta = \gamma+n)$ distribution. ## Latent Gaussian predictor version Introducing a latent Gaussian variable $u\sim\pN(0,1)$, the model can be reformulated as $$ \begin{aligned} \lambda(u) &=-\ln\{1-\Phi(u)\}/\gamma \\ (y_i|u) &\sim \pPo(\lambda(u)) \end{aligned} $$ We will need some derivatives of $\lambda$ with respect to $u$: $$ \begin{aligned} \frac{\partial\lambda(u)}{\partial u} &= \frac{1}{\gamma}\frac{\phi(u)}{1-\Phi(u)} = \lambda'(u) \\ \frac{\partial^2\lambda(u)}{\partial u^2} &= - \frac{1}{\gamma}\frac{\phi(u)}{1-\Phi(u)} \left( u + \frac{\phi(u)}{1-\Phi(u)} \right) = -\lambda'(u)\left\{u-\gamma\lambda'(u)\right\} \\ \frac{\partial\ln\lambda(u)}{\partial u} &= \frac{1}{\lambda(u)} \frac{\partial\lambda(u)}{\partial u} =\frac{1}{-\ln\{1-\Phi(u)\}} \frac{\phi(u)}{1-\Phi(u)} = \frac{\lambda'(u)}{\lambda(u)} \\ \frac{\partial^2\ln\lambda(u)}{\partial u^2} &= \frac{1}{\lambda(u)} \frac{\partial^2\lambda(u)}{\partial u^2} -\frac{1}{\lambda(u)^2} \left(\frac{\partial\lambda(u)}{\partial u}\right)^2 = \frac{-\lambda'(u)\{u - \gamma\lambda'(u)\}}{\lambda(u)} - \frac{\lambda'(u)^2}{\lambda(u)^2}\\ &= -\frac{\lambda'(u)}{\lambda(u)}\left\{ u - \gamma\lambda'(u) +\frac{\lambda'(u)}{\lambda(u)} \right\} \end{aligned} $$ ```{r} lambda <- function(u, gamma) { -pnorm(u, lower.tail = FALSE, log.p = TRUE) / gamma } lambda_inv <- function(lambda, gamma) { qnorm(-lambda * gamma, lower.tail = FALSE, log.p = TRUE) } D1lambda <- function(u, gamma) { exp(dnorm(u, log = TRUE) - pnorm(u, lower.tail = FALSE, log.p = TRUE)) / gamma } D2lambda <- function(u, gamma) { D1L <- D1lambda(u, gamma) -D1L * (u - gamma * D1L) } D1log_lambda <- function(u, gamma) { D1lambda(u, gamma) / lambda(u, gamma) } D2log_lambda <- function(u, gamma) { D1logL <- D1log_lambda(u, gamma) -D1logL * (u - gamma * D1lambda(u, gamma = gamma) + D1logL) } ``` ## Latent Gaussian 
## Latent Gaussian posterior approximations

A basic approximation of the posterior distribution for $\lambda$ given $y$ can be defined as a deterministic transformation of a Gaussian distribution, obtained from a 2nd order Taylor approximation of $\ln p(u|\{y_i\})$ at the posterior mode $u_0$ of $p(u|\{y_i\})$. The needed derivatives are
$$
\begin{aligned}
\frac{\partial\ln p(u|\{y_i\})}{\partial u} &=
\frac{\partial\ln\phi(u)}{\partial u} - n\lambda'(u) + n\ol{y}\frac{\lambda'(u)}{\lambda(u)}
= -u + n\frac{\lambda'(u)}{\lambda(u)}\left\{ \ol{y} - \lambda(u) \right\} \\
\frac{\partial^2\ln p(u|\{y_i\})}{\partial u^2} &=
-1 - n \frac{\lambda'(u)}{\lambda(u)}\left\{
u - \gamma\lambda'(u) + \frac{\lambda'(u)}{\lambda(u)}
\right\} \left\{ \ol{y} - \lambda(u) \right\}
- n \frac{\lambda'(u)^2}{\lambda(u)}
\end{aligned}
$$
At the mode $u_0$, the first order derivative is zero, so that $n\frac{\lambda'(u_0)}{\lambda(u_0)}\left\{\ol{y}-\lambda(u_0)\right\} = u_0$, and
$$
\begin{aligned}
\left.\frac{\partial^2\ln p(u|\{y_i\})}{\partial u^2}\right|_{u=u_0} &=
-1 - \left\{
u_0 - \gamma\lambda'(u_0) + \frac{\lambda'(u_0)}{\lambda(u_0)}
\right\} u_0
- n \frac{\lambda'(u_0)^2}{\lambda(u_0)} .
\end{aligned}
$$
The quadratic approximation of the log-posterior density at the mode $u_0$ is then
$$
\ln \breve{p}(u|\{y_i\}) = \text{const} - \frac{(u-u_0)^2}{2} \left[
- \left.\frac{\partial^2\ln p(u|\{y_i\})}{\partial u^2}\right|_{u=u_0}
\right]
$$
In `inlabru`, the approximation first linearises $\ln \lambda(u)$ at $u_0$ before applying the Taylor approximation of $\ln p(u|\{y_i\})$. The linearised log-predictor is
$$
\ln \ol{\lambda}(u) = \ln \lambda(u_0) + \frac{\lambda'(u_0)}{\lambda(u_0)}(u - u_0)
$$
so that
$$
\ol{\lambda}'(u) = \frac{\lambda'(u_0)}{\lambda(u_0)} \ol{\lambda}(u) .
$$
Since $\ln\ol{\lambda}(u)$ is linear in $u$, the $n\ol{y}\ln\ol{\lambda}(u)$ term contributes no curvature, and $\ol{\lambda}''(u_0)=\lambda'(u_0)^2/\lambda(u_0)$, so the second order derivative of the linearised log-posterior density is
$$
\begin{aligned}
\left.\frac{\partial^2\ln \ol{p}(u|\{y_i\})}{\partial u^2}\right|_{u=u_0} &=
-1 - n \frac{\lambda'(u_0)^2}{\lambda(u_0)} .
\end{aligned}
$$
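For reference, the log-posterior based on the linearised predictor can also be written down directly. A minimal sketch, assuming the derivative functions defined earlier; the name `log_p_lin` is introduced here for illustration only:

```{r}
# Illustrative sketch: log-density of the posterior based on the linearised
# predictor log(lambda_bar(u)), with the same normalising terms as the
# exact log_p() defined in the next chunk.
log_p_lin <- function(u, u0, y, gamma) {
  n <- length(y)
  log_lambda_bar <- log(lambda(u0, gamma)) +
    D1log_lambda(u0, gamma) * (u - u0)
  dnorm(u, log = TRUE) - n * exp(log_lambda_bar) +
    n * mean(y) * log_lambda_bar - sum(lgamma(y + 1))
}
```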
```{r}
log_p <- function(u, y, gamma) {
  L <- lambda(u, gamma)
  n <- length(y)
  dnorm(u, log = TRUE) - n * L + n * mean(y) * log(L) - sum(lgamma(y + 1))
}
D1log_p <- function(u, y, gamma) {
  n <- length(y)
  -u + n * D1log_lambda(u, gamma) * (mean(y) - lambda(u, gamma))
}
D2log_p <- function(u, y, gamma) {
  n <- length(y)
  -1 + n * D2log_lambda(u, gamma) * (mean(y) - lambda(u, gamma)) -
    n * D1log_lambda(u, gamma) * D1lambda(u, gamma)
}
```

```{r}
g <- 1
# Alternative test cases:
# y <- c(0, 1, 2)
# y <- c(0, 0, 0, 0, 0)
y <- rpois(5, 5)
# Find the posterior mode of p(u | y) by solving D1log_p(u) = 0, bracketing
# the root with the conjugate posterior mean (scaled down and up by a factor
# of 10) mapped to the u scale.
mu_quad <- uniroot(
  D1log_p,
  lambda_inv((1 + sum(y)) / (g + length(y)) * c(1 / 10, 10), gamma = g),
  y = y,
  gamma = g
)$root
# Standard deviations from the quadratic and linearised curvatures at the mode.
sd_quad <- (-D2log_p(mu_quad, y = y, gamma = g))^-0.5
sd_lin <- (1 + length(y) * D1log_lambda(mu_quad, gamma = g)^2 *
  lambda(mu_quad, gamma = g))^-0.5
lambda0 <- lambda(mu_quad, gamma = g)
```

### Posterior densities

The exact posterior is drawn twice, once via a change of variables from $p(u|\{y_i\})$ and once directly from the Gamma density; the two "Theory" curves should coincide. First on the $\lambda$ scale:

```{r}
ggplot() +
  xlim(
    lambda(mu_quad - 4 * sd_quad, gamma = g),
    lambda(mu_quad + 4 * sd_quad, gamma = g)
  ) +
  xlab("lambda") +
  ylab("Density") +
  geom_function(
    fun = function(x) {
      exp(log_p(lambda_inv(x, gamma = g), y = y, gamma = g)) /
        D1lambda(lambda_inv(x, gamma = g), gamma = g) /
        (
          exp(log_p(lambda_inv(lambda0, gamma = g), y = y, gamma = g)) /
            D1lambda(lambda_inv(lambda0, gamma = g), gamma = g)
        ) *
        dgamma(lambda0, shape = 1 + sum(y), rate = g + length(y))
    },
    mapping = aes(col = "Theory"),
    n = 1000
  ) +
  geom_function(
    fun = dgamma,
    args = list(shape = 1 + sum(y), rate = g + length(y)),
    mapping = aes(col = "Theory"),
    n = 1000
  ) +
  geom_function(
    fun = function(x) {
      dnorm(lambda_inv(x, gamma = g), mean = mu_quad, sd = sd_quad) /
        D1lambda(lambda_inv(x, gamma = g), gamma = g)
    },
    mapping = aes(col = "Quadratic"),
    n = 1000
  ) +
  geom_function(
    fun = function(x) {
      dnorm(lambda_inv(x, gamma = g), mean = mu_quad, sd = sd_lin) /
        D1lambda(lambda_inv(x, gamma = g), gamma = g)
    },
    mapping = aes(col = "Linearised"),
    n = 1000
  ) +
  geom_vline(mapping = aes(
    xintercept = (1 + sum(y)) / (g + length(y)),
    lty = "Bayes mean"
  )) +
  geom_vline(mapping = aes(xintercept = lambda0, lty = "Bayes mode")) +
  geom_vline(mapping = aes(xintercept = mean(y), lty = "Plain mean"))
```

Then on the $u$ scale:

```{r}
ggplot() +
  xlim(
    lambda_inv(lambda0, gamma = g) - 4 * sd_quad,
    lambda_inv(lambda0, gamma = g) + 4 * sd_quad
  ) +
  xlab("u") +
  ylab("Density") +
  geom_function(
    fun = function(x) {
      exp(log_p(x, y = y, gamma = g) -
        log_p(lambda_inv(lambda0, gamma = g), y = y, gamma = g)) *
        (dgamma(lambda0, shape = 1 + sum(y), rate = g + length(y)) *
          D1lambda(lambda_inv(lambda0, gamma = g), gamma = g))
    },
    mapping = aes(col = "Theory"),
    n = 1000
  ) +
  geom_function(
    fun = function(x) {
      dgamma(lambda(x, gamma = g), shape = 1 + sum(y), rate = g + length(y)) *
        D1lambda(x, gamma = g)
    },
    mapping = aes(col = "Theory"),
    n = 1000
  ) +
  geom_function(
    fun = dnorm,
    args = list(mean = mu_quad, sd = sd_quad),
    mapping = aes(col = "Quadratic"),
    n = 1000
  ) +
  geom_function(
    fun = dnorm,
    args = list(mean = mu_quad, sd = sd_lin),
    mapping = aes(col = "Linearised"),
    n = 1000
  ) +
  geom_vline(
    mapping = aes(
      xintercept = lambda_inv((1 + sum(y)) / (g + length(y)), gamma = g),
      lty = "Bayes mean"
    )
  ) +
  geom_vline(
    mapping = aes(
      xintercept = lambda_inv(lambda0, gamma = g),
      lty = "Bayes mode"
    )
  ) +
  geom_vline(
    mapping = aes(
      xintercept = lambda_inv(mean(y), gamma = g),
      lty = "Plain mean"
    )
  )
```
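Beyond the density plots, the approximations can be compared through summary statistics, such as the posterior mean of $\lambda$. A minimal sketch using Monte Carlo samples of $u$; the objects `u_quad` and `u_lin` are introduced here for illustration:

```{r}
# Illustrative comparison of posterior means for lambda: exact conjugate
# result versus Monte Carlo means of lambda(u) under each Gaussian
# approximation of p(u | y).
u_quad <- rnorm(1e5, mean = mu_quad, sd = sd_quad)
u_lin <- rnorm(1e5, mean = mu_quad, sd = sd_lin)
c(
  theory = (1 + sum(y)) / (g + length(y)),
  quadratic = mean(lambda(u_quad, gamma = g)),
  linearised = mean(lambda(u_lin, gamma = g))
)
```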
aes(col = "Theory"), n = 1000 ) + geom_function( fun = function(x) { pnorm(lambda_inv(x, gamma = g), mean = mu_quad, sd = sd_quad) }, mapping = aes(col = "Quadratic"), n = 1000 ) + geom_function( fun = function(x) { pnorm(lambda_inv(x, gamma = g), mean = mu_quad, sd = sd_lin) }, mapping = aes(col = "Linearised"), n = 1000 ) + geom_vline(mapping = aes( xintercept = (1 + sum(y)) / (g + length(y)), lty = "Bayes mean" )) + geom_vline(mapping = aes(xintercept = lambda0, lty = "Bayes mode")) + geom_vline(mapping = aes(xintercept = mean(y), lty = "Plain mean")) ``` ```{r} ggplot() + xlim( lambda_inv(lambda0, gamma = g) - 4 * sd_quad, lambda_inv(lambda0, gamma = g) + 4 * sd_quad ) + xlab("u") + ylab("CDF") + geom_function( fun = function(x) { pgamma(lambda(x, gamma = g), shape = 1 + sum(y), rate = g + length(y)) }, mapping = aes(col = "Theory"), n = 1000 ) + geom_function( fun = pnorm, args = list(mean = mu_quad, sd = sd_quad), mapping = aes(col = "Quadratic"), n = 1000 ) + geom_function( fun = pnorm, args = list(mean = mu_quad, sd = sd_lin), mapping = aes(col = "Linearised"), n = 1000 ) + geom_vline(mapping = aes( xintercept = lambda_inv((1 + sum(y)) / (g + length(y)), gamma = g ), lty = "Bayes mean" )) + geom_vline(mapping = aes( xintercept = lambda_inv(lambda0, gamma = g), lty = "Bayes mode" )) + geom_vline(mapping = aes( xintercept = lambda_inv(mean(y), gamma = g), lty = "Plain mean" )) ```