knitr::opts_chunk$set(dev = "ragg_png")
options(digits=5) # set number of digits equal to 5
# tidyverse
library(tidyverse)
# others
library(kableExtra)
library(glue)
library(latex2exp)
library(scales)
library(R6)
# jags and MCMC
library(rjags)
library(ggmcmc)
library(ggpubr)
A well-established and widely used method for detecting a disease in blood fails to detect the presence of the disease in 15% of the patients who actually have it. A young UniPD startup has developed an innovative screening method. During the qualification phase, a random sample of n = 75 patients known to have the disease is screened with the new method, which fails to detect the disease in y = 6 of them.
I call \(p_0=15\%\) the fraction of failures in detecting disease by the standard method.
The number of times \(y\) the new method fails is distributed according to a binomial distribution with probability \(p\) \[ P(y|p, n)=Binom(y|p, n)=\binom{n}{y}p^y(1-p)^{n-y} \] where \(p\) is the probability of failing to detect the disease.
Since the number of failures follows the Binomial distribution, an unbiased frequentist estimator of the failure probability of the new method is \[ \hat{p}_F = \frac{y}{n} = \frac{6}{75} = 0.08 \]
The prior is a Beta distribution with mean \(m=0.15\) and standard deviation \(\sigma=0.14\). For a Beta distribution \[ m=\frac{\alpha}{\alpha+\beta}\quad , \quad \sigma^2=\frac{\alpha\beta}{(\alpha+\beta)^2(\alpha+\beta+1)} \] so it is possible to compute \(\alpha\) and \(\beta\) given \(m\) and \(\sigma\) \[ \begin{align} \alpha&=-\frac{m(\sigma^2+m^2-m)}{\sigma^2} = 0.83 \\ \beta&=\frac{(\sigma^2+m^2-m)(m-1)}{\sigma^2} = 4.68 \end{align} \] Since the likelihood is Binomial and the Beta distribution is the conjugate prior of the Binomial, the posterior is also a Beta distribution, with parameters \[ \begin{align} \alpha'&=\alpha+y = \alpha + 6 = 6.83 \\ \beta'&=\beta+n-y = \beta +75-6 = \beta+69 = 73.68 \end{align} \] The mean value and the variance of the posterior are simply \[ \begin{align} m&=\frac{\alpha'}{\alpha'+\beta'} = 0.0848\\ \sigma^2&=\frac{\alpha'\beta'}{(\alpha'+\beta')^2(\alpha'+\beta'+1)} = 0.000952 \end{align} \]
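As a quick consistency check (a minimal sketch using the rounded values above, not part of the original analysis), plugging \(\alpha=0.83\) and \(\beta=4.68\) back into the Beta mean and standard deviation formulas recovers \(m\) and \(\sigma\):
a <- 0.83; b <- 4.68
a/(a+b)                       # ~ 0.15 = m
sqrt(a*b/((a+b)^2*(a+b+1)))   # ~ 0.14 = sigma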
xsize <- 400
x <- seq(0, 1, length=xsize)
m <- 0.15
sigma <- 0.14
alpha.prior <- -m*(sigma^2+m^2-m)/sigma^2
beta.prior <- (sigma^2+m^2-m)*(m-1)/sigma^2
alpha.posterior <- alpha.prior + 6
beta.posterior <- beta.prior + 69
m.posterior <- alpha.posterior/(alpha.posterior+beta.posterior)
var.posterior <- (alpha.posterior*beta.posterior)/((alpha.posterior+beta.posterior)^2*(alpha.posterior+beta.posterior+1))
glue(
'Posterior parameters:
- alpha = {round(alpha.posterior, 2)}
- beta = {round(beta.posterior, 2)}\n
Mean value = {round(m.posterior, 4)}
Variance = {round(var.posterior, 6)}'
)
## Posterior parameters:
## - alpha = 6.83
## - beta = 73.68
##
## Mean value = 0.0848
## Variance = 0.000952
ggplot()+
geom_line(aes(x, dbeta(x, alpha.posterior, beta.posterior)), color='navyblue', size=0.6)+
geom_vline(xintercept=m.posterior, color='firebrick')+
geom_vline(xintercept=m.posterior+c(sqrt(var.posterior), -sqrt(var.posterior)), color='black', linetype='dashed')+
annotate('text', x=m.posterior+0.09, y=13.75, label=paste('mean=', round(m.posterior, 3), sep=''), color='firebrick')+
annotate('text', x=m.posterior+sqrt(var.posterior)+0.05, y=11.25, label=TeX('$m+\\sigma$'))+
annotate('text', x=m.posterior-sqrt(var.posterior)-0.05, y=11.25, label=TeX('$m-\\sigma$'))+
labs(
x='p',
y='P(p|y, n)',
title=paste('Posterior probability, Beta(', round(alpha.posterior, 2), ', ', round(beta.posterior, 2), ')', sep='')
)+
xlim(0, 1)+
ylim(0, 15)+
theme_bw()
The null-hypothesis is \[ H_0: p\geq 15\% \]
while the alternative hypothesis (the new method is better) is \[ H_1: p<15\% \] The level of significance is chosen to be \(\alpha=5\%\).
Using the Bayesian approach, I can compute the posterior probability of \(H_0\) by integrating the posterior of \(p\) over the corresponding region: \[ P(H_0: p\geq15\%) = \int_{0.15}^{1}f(p|y, n)dp = 1-F(p=0.15|y, n) \] where \(F\) is the cumulative distribution function of the posterior.
post.H0 <- 1-pbeta(0.15, alpha.posterior, beta.posterior)
glue('Probability P(p>=0.15|y=6, n=75) = {round(post.H0, 4)} = {round(post.H0*100, 2)}%')
## Probability P(p>=0.15|y=6, n=75) = 0.0313 = 3.13%
The result is smaller than \(\alpha=5\%\), therefore I reject the null hypothesis at the 5% level of significance and conclude that the new test is statistically better than the traditional one.
x.area <- tail(x, round(xsize*(1-0.15), 0)) # grid points with p >= 0.15, used to shade the tail area
ggplot()+
geom_line(aes(x, dbeta(x, alpha.posterior, beta.posterior)), color='navyblue', size=0.6)+
geom_area(aes(x.area, dbeta(x.area, alpha.posterior, beta.posterior)), fill='steelblue2', color='navyblue', alpha=0.5, size=0.6)+
geom_vline(xintercept=0.15, color='black', linetype='dashed')+
annotate('text', x=0.15+0.11, y=1.25, label=paste('Area=', round(post.H0, 4), sep=''))+
labs(
x='p',
y='P(p|y, n)',
title=paste('Posterior probability, Beta(', round(alpha.posterior, 2), ', ', round(beta.posterior, 2), ')', sep='')
)+
xlim(0, 1)+
ylim(0, 15)+
theme_bw()
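As a quick sanity check on this tail probability (a minimal sketch, not part of the original analysis), the same value can be estimated by sampling from the analytical posterior:
set.seed(1) # reproducibility of the sketch
p.samples <- rbeta(1e6, alpha.posterior, beta.posterior)
mean(p.samples >= 0.15) # expected to be close to 0.0313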
The classical frequentist test follows the Neyman and Pearson approach: the rejection region is made of the outcomes \(y\) whose probability under the null distribution, here \(Binom(y|p=0.15, n=75)\), falls below the significance level \(\alpha\).
p0 <- 0.15
y_ <- seq(0, 30)
ggplot()+
geom_col(aes(x=y_, y=dbinom(y_, 75, p0)), color='white', fill='lightblue', alpha=1)+
geom_col(aes(x=6, y=dbinom(6, 75, p0)), color='white', fill='darkorange')+
geom_hline(yintercept=0.05, color='firebrick', linetype='dashed', size=0.6)+
annotate('text', x=1, y=0.055, label='alpha=0.05')+
labs(
x='y',
y=paste('P(y|p=', p0, ', n=75)', sep=''),
title='Hypothesis Test',
subtitle='Detecting disease in blood'
)+
scale_x_continuous(breaks=seq(0,30, by=2), limits=c(0, 30))+
theme_bw()
glue('Value in y=6: {round(dbinom(6, 75, p0), 4)} < 0.05')
## Value in y=6: 0.0309 < 0.05
\(y = 6\) lies in the rejection region, so again I reject the null hypothesis \(H_0\).
Following Fisher’s approach instead, the decision on whether to reject the null hypothesis is based on the p-value, which in this case (one-sided hypothesis test) is given by \[ \text{p-value} = \sum_{y=0}^{6} P(y|p_0, n) = \sum_{y=0}^{6} Binom(y|p_0, n) \]
glue('p-value: {round(pbinom(6, 75, p0), 4)}')
## p-value: 0.0544
The p-value is larger than the significance level \(\alpha=0.05\), so the frequentist approach in Fisher’s formulation does not reject the null hypothesis \(H_0\), unlike the two previous approaches. The decision is borderline, however, since the p-value is only slightly above \(\alpha\).
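The same one-sided exact p-value can be cross-checked with base R’s binom.test (a minimal sketch, not part of the original analysis):
# exact one-sided binomial test; its p-value equals pbinom(6, 75, 0.15) ~ 0.0544
binom.test(x=6, n=75, p=0.15, alternative='less')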
Ladislaus Josephovich Bortkiewicz was a Russian economist and statistician. He noted that the Poisson distribution can be very useful in applied statistics when describing low-frequency events in a large population. In a famous example he showed that the number of deaths by horse kick in the Prussian army follows a Poisson distribution.
y (number of deaths) | 0 | 1 | 2 | 3 | 4 | \(\geq\) 5 |
---|---|---|---|---|---|---|
\(n_1\) observations | 109 | 65 | 22 | 3 | 1 | 0 |
\(n_2\) observations | 144 | 91 | 32 | 11 | 2 | 0 |
Assuming a uniform prior for the Poisson parameter, the posterior is a \(Gamma(\alpha, \beta)\) distribution with shape \(\alpha=\sum y_i + 1\) and rate \(\beta=n\), where \(n\) is the number of observations. I want to keep the two sets of observations, \(n_1\) and \(n_2\), separate, so I will compute the posterior distribution of \(\lambda\) for each set of measurements independently.
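This follows directly from Bayes’ theorem: with a flat prior on \(\lambda>0\), the posterior is proportional to the product of the Poisson likelihoods, \[ P(\lambda|\{y_i\}) \propto \prod_{i=1}^{n}\frac{\lambda^{y_i}e^{-\lambda}}{y_i!} \propto \lambda^{\sum_i y_i}\,e^{-n\lambda} \] which is exactly the kernel of a \(Gamma(\alpha=\sum_i y_i+1,\ \beta=n)\) distribution.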
To solve the problem, I need the following quantities:
\[ \begin{align} n_{1, \text{tot}} &= \sum_{i=0}^5 (n_1)_i \\ n_{2, \text{tot}} &= \sum_{i=0}^5 (n_2)_i \\ \sum (y_1)_i &= \sum_{i=0}^5 i\cdot(n_1)_i \\ \sum (y_2)_i &= \sum_{i=0}^5 i\cdot(n_2)_i \end{align} \]
# GENERAL FUNCTIONS
gamma.mean <- function(shape, rate){
return(shape/rate)
}
gamma.variance <- function(shape, rate){
return(shape/rate^2)
}
gamma.median <- function(shape, rate){
return(qgamma(0.5, shape=shape, rate=rate))
}
gg_post <- function(post.funct, limits, mean, median, Title='Posterior', Subtitle=''){
lambda <- seq(0.25, 1.25, length=400)
lambda95 <- seq(limits[1], limits[2], length=400)
ggplot()+
geom_line(aes(x=lambda, y=post.funct(lambda)), color='navyblue', size=0.6)+
geom_area(aes(x=lambda95, y=post.funct(lambda95)), fill='steelblue2', color='navyblue', size=0.6, alpha=0.5)+
labs(
title=Title,
x=TeX('$\\lambda$'),
y=TeX('$P(\\lambda|\\{y_i\\})$')
)+
{if (Subtitle!='') labs(subtitle=Subtitle)}+
scale_y_continuous(breaks=seq(0, 9, by=3), limits=c(0, 9))+
geom_vline(xintercept=limits, linetype='dashed', size=0.6)+
geom_vline(xintercept=mean, linetype='dashed', size=0.6, color='firebrick')+
annotate('text', x=limits[1]-0.08, y=6.75,
label=paste('x1=', format(limits[1], digits=2, nsmall=2), sep=''))+
annotate('text', x=limits[2]+0.08, y=6.75,
label=paste('x2=', format(limits[2], digits=2, nsmall=2), sep=''))+
annotate('text', x=mean+0.09, y = 8.25,
label=paste('mean=', format(round(mean, 2), nsmall = 2), sep=''), color='firebrick')+
theme_bw()
}
# DATAFRAME
df_horsekick <- data.frame(
y = c(0, 1, 2, 3, 4, 5),
n.1 = c(109, 65, 22, 3, 1, 0),
n.2 = c(144, 91, 32, 11, 2, 0)
)
Observations 1.
n1.tot <- sum(df_horsekick$n.1)
s.y.1 <- sum(df_horsekick$n.1*df_horsekick$y)
alpha.hk.unif.1 <- s.y.1+1
beta.hk.1 <- n1.tot
mean.hk.unif.1 <- gamma.mean(alpha.hk.unif.1, beta.hk.1)
variance.hk.unif.1 <- gamma.variance(alpha.hk.unif.1, beta.hk.1)
median.hk.unif.1 <- gamma.median(alpha.hk.unif.1, beta.hk.1)
cred.int.unif.1 <- c(qgamma(0.025, shape=alpha.hk.unif.1, rate=beta.hk.1), qgamma(0.975, alpha.hk.unif.1, beta.hk.1))
gg_post(
function(x){dgamma(x, shape=alpha.hk.unif.1, rate=beta.hk.1)},
cred.int.unif.1,
mean.hk.unif.1,
median.hk.unif.1,
Title=paste('Posterior distribution: Gamma(', alpha.hk.unif.1, ', ', beta.hk.1, ')', sep=''),
Subtitle='Positive uniform prior, observations 1'
)
Observations 2.
n2.tot <- sum(df_horsekick$n.2)
s.y.2 <- sum(df_horsekick$n.2*df_horsekick$y)
alpha.hk.unif.2 <- s.y.2+1
beta.hk.2 <- n2.tot
mean.hk.unif.2 <- gamma.mean(alpha.hk.unif.2, beta.hk.2)
variance.hk.unif.2 <- gamma.variance(alpha.hk.unif.2, beta.hk.2)
median.hk.unif.2 <- gamma.median(alpha.hk.unif.2, beta.hk.2)
cred.int.unif.2 <- c(qgamma(0.025, shape=alpha.hk.unif.2, rate=beta.hk.2), qgamma(0.975, alpha.hk.unif.2, beta.hk.2))
gg_post(
function(x){dgamma(x, shape=alpha.hk.unif.2, rate=beta.hk.2)},
cred.int.unif.2,
mean.hk.unif.2,
median.hk.unif.2,
Title=paste('Posterior distribution: Gamma(', alpha.hk.unif.2, ', ', beta.hk.2, ')', sep=''),
Subtitle='Positive uniform prior, observations 2'
)
Assuming Jeffreys’ prior \[ g(\lambda) \propto 1/\sqrt{\lambda}\quad , \quad \text{with } \lambda>0 \] the posterior is again a \(Gamma(\alpha, \beta)\) distribution; the prior contributes an extra factor \(\lambda^{-1/2}\) to the posterior kernel, so the only difference from the previous case is that now \(\alpha=\sum y_i+1/2\) (the rate is still \(\beta=n\)). Again, I will treat the two sets of measurements separately.
Observations 1.
alpha.hk.jeffrey.1 <- s.y.1+1/2
mean.hk.jeffrey.1 <- gamma.mean(alpha.hk.jeffrey.1, beta.hk.1)
variance.hk.jeffrey.1 <- gamma.variance(alpha.hk.jeffrey.1, beta.hk.1)
median.hk.jeffrey.1 <- gamma.median(alpha.hk.jeffrey.1, beta.hk.1)
cred.int.jeffrey.1 <- c(qgamma(0.025, shape=alpha.hk.jeffrey.1, rate=beta.hk.1), qgamma(0.975, alpha.hk.jeffrey.1, beta.hk.1))
gg_post(
function(x){dgamma(x, shape=alpha.hk.jeffrey.1, rate=beta.hk.1)},
cred.int.jeffrey.1,
mean.hk.jeffrey.1,
median.hk.jeffrey.1,
Title=paste('Posterior distribution: Gamma(', alpha.hk.jeffrey.1, ', ', beta.hk.1, ')', sep=''),
Subtitle="Jeffrey's prior, observations 1"
)
Observations 2.
alpha.hk.jeffrey.2 <- s.y.2+1/2
mean.hk.jeffrey.2 <- gamma.mean(alpha.hk.jeffrey.2, beta.hk.2)
variance.hk.jeffrey.2 <- gamma.variance(alpha.hk.jeffrey.2, beta.hk.2)
median.hk.jeffrey.2 <- gamma.median(alpha.hk.jeffrey.2, beta.hk.2)
cred.int.jeffrey.2 <- c(qgamma(0.025, shape=alpha.hk.jeffrey.2, rate=beta.hk.2), qgamma(0.975, alpha.hk.jeffrey.2, beta.hk.2))
gg_post(
function(x){dgamma(x, shape=alpha.hk.jeffrey.2, rate=beta.hk.2)},
cred.int.jeffrey.2,
mean.hk.jeffrey.2,
median.hk.jeffrey.2,
Title=paste('Posterior distribution: Gamma(', alpha.hk.jeffrey.2, ', ', beta.hk.2, ')', sep=''),
Subtitle="Jeffrey's prior, observations 2"
)
Finally, I summarize the results.
df_ <- data.frame(
observation = rep(1:2, each=2),
prior = rep(c('Unif', 'Jeffrey'), 2),
mean = c(mean.hk.unif.1, mean.hk.jeffrey.1, mean.hk.unif.2, mean.hk.jeffrey.2),
median = c(median.hk.unif.1, median.hk.jeffrey.1, median.hk.unif.2, median.hk.jeffrey.2),
variance = c(variance.hk.unif.1, variance.hk.jeffrey.1, variance.hk.unif.2, variance.hk.jeffrey.2),
Cred.Int.Low = c(cred.int.unif.1[1], cred.int.jeffrey.1[1], cred.int.unif.2[1], cred.int.jeffrey.2[1]),
Cred.Int.High = c(cred.int.unif.1[2], cred.int.jeffrey.1[2], cred.int.unif.2[2], cred.int.jeffrey.2[2])
)
df_ %>%
kable(digits=6) %>%
kable_styling(full_width=FALSE)
observation | prior | mean | median | variance | Cred.Int.Low | Cred.Int.High |
---|---|---|---|---|---|---|
1 | Unif | 0.61500 | 0.61333 | 0.003075 | 0.51113 | 0.72834 |
1 | Jeffrey | 0.61250 | 0.61083 | 0.003062 | 0.50885 | 0.72562 |
2 | Unif | 0.70357 | 0.70238 | 0.002513 | 0.60875 | 0.80516 |
2 | Jeffrey | 0.70179 | 0.70060 | 0.002506 | 0.60709 | 0.80325 |
In a study on water quality of streams, a high level of bacteria X was defined as a level greater than 100 per 100 ml of stream water. n = 116 samples were taken from streams having a high environmental impact on pandas. Out of these, y = 11 had a high bacteria X level.
Since each water sample either has or does not have a high bacteria X level, \(y\) follows a Binomial distribution. The frequentist estimator of \(p\) is \[ \hat{p}_F = \frac{y}{n} = \frac{11}{116} \simeq 0.09483 \]
Assuming a \(Beta(\alpha=1, \beta=10)\) prior, and since the Beta distribution is the conjugate prior of the Binomial, the posterior \(P(p|y)\) is also a Beta distribution, with parameters \[ \begin{align} \alpha'&=\alpha+y = 1+11 = 12 \\ \beta'&=\beta+n-y = 10+116-11 = 115 \end{align} \]
Let’s plot the posterior.
alpha.water <- 12
beta.water <- 115
xsize <- 400
x <- seq(0, 1, length=xsize)
ggplot()+
geom_line(aes(x, dbeta(x, alpha.water, beta.water)), color='navyblue', size=0.6)+
labs(
x='p',
y='P(p|y, n)',
title=paste('Posterior probability, Beta(', alpha.water, ', ', beta.water, ')', sep='')
)+
xlim(0, 1)+
ylim(0, 16)+
theme_bw()
In the Bayesian approach, the estimator for \(p\) is the posterior mean, which in this case is \[ \hat{p}_B = \frac{\alpha'}{\alpha'+\beta'}=\frac{12}{12+115} \simeq 0.09449 \]
# GENERIC FUNCTIONS
beta.mean <- function(alpha_, beta_){
return(alpha_/(alpha_+beta_))
}
beta.variance <- function(alpha_, beta_){
return(
(alpha_*beta_)/((alpha_+beta_)^2*(alpha_+beta_+1))
)
}
beta.cred.int <- function(alpha_, beta_){
return(
c(
qbeta(0.025, alpha_, beta_),
qbeta(0.975, alpha_, beta_)
)
)
}
mean.water <- beta.mean(alpha.water, beta.water)
variance.water <- beta.variance(alpha.water, beta.water)
cred.int.water <- beta.cred.int(alpha.water, beta.water)
df_ <- data.frame(
mean = mean.water,
variance = variance.water,
Cred.Int.Low = cred.int.water[1],
Cred.Int.High = cred.int.water[2]
)
df_ %>%
kable(digits=6, caption='Posterior statistics') %>%
kable_styling(full_width=FALSE)
mean | variance | Cred.Int.Low | Cred.Int.High |
---|---|---|---|
0.094488 | 0.000668 | 0.050185 | 0.15084 |
\[ H_0: p=0.1 \text{ versus } H_1: p\neq 0.1 \]
This requires a two-sided hypothesis test, where the null distribution is the sampling distribution of \(y\), given that \(H_0\) is true: \(Binom(y|n=116, p=0.1)\).
In the Neyman and Pearson approach I simply have to check whether \(y=11\) lies inside or outside the rejection region, defined as the set of outcomes whose probability under the null distribution is below the significance level \(\alpha\).
n <- 116
y <- 11
p0 <- 0.1
alpha.value <- 0.05
# function to plot the histogram for the hypothesis test
gg_HT <- function(n_, p0_, y_){
yplot <- seq(0, 30)
ggplot()+
geom_col(aes(x=yplot, y=dbinom(yplot, n_, p0_)), color='white', fill='lightblue', alpha=1)+
geom_col(aes(x=y_, y=dbinom(y_, n_, p0_)), color='white', fill='darkorange')+
geom_hline(yintercept=alpha.value, color='firebrick', linetype='dashed', size=0.6)+
annotate('text', x=2.5, y=alpha.value+0.005, label=paste('alpha=', alpha.value, sep=''))+
labs(
x='y',
y=paste('P(y|p=', p0_, ', n=', n_, ')', sep=''),
title='Hypothesis Test',
subtitle='Water quality of streams'
)+
scale_x_continuous(breaks=seq(0,30, by=2), limits=c(0, 30))+
theme_bw()
}
gg_HT(n, p0, y)
glue('Value in y={y}: {round(dbinom(y, n, p0), 4)} > {alpha.value}')
## Value in y=11: 0.1233 > 0.05
I observe that \(y = 11\) lies inside the acceptance region, so I do not reject the null hypothesis. I would have obtained the same result by evaluating the p-value:
glue('p-value: {round(pbinom(y, n, p0)+1-pbinom(n-y-1, n, p0), 4)} > {alpha.value}')
## p-value: 0.5043 > 0.05
The hypothesis test can also be performed using the Bayesian approach. I assume a \(Beta(1, 10)\) prior, so the posterior distribution is the same as in point (b). The \((1-\alpha)\cdot 100\%=95\%\) credibility interval for \(p\) was already computed in point (c), so to accept or reject the null hypothesis I simply have to check whether \(p_0=0.1\) lies inside or outside this interval.
glue('95% credibility interval for p: [{round(cred.int.water[1], 4)}, {round(cred.int.water[2], 4)}]')
## 95% credibility interval for p: [0.0502, 0.1508]
\(p_0\) lies inside the interval, so also with the Bayesian approach I do not reject the null hypothesis \(H_0\).
A new set of \(n = 165\) water samples from the same streams is analyzed, and \(y = 9\) of them show a high bacteria X level. For the same reason described in point (a), the frequentist estimator for \(p\) is \[ \hat{p}_F = \frac{y}{n} = \frac{9}{165} \simeq 0.054545 \]
To find the Bayesian estimator, I must first compute the posterior distribution associated with each of the considered priors. Assuming a \(Beta(1,10)\) prior, the posterior \(P(p|y)\) is a Beta distribution with parameters \[ \begin{align} \alpha'&=\alpha+y = 1+9 = 10 \\ \beta'&=\beta+n-y = 10+165-9 = 166 \end{align} \]
Instead, assuming as prior the posterior probability of the older measurement, which was a \(Beta(12, 115)\) function, the posterior is again a Beta distribution, in this case with parameters \[ \begin{align} \alpha'&=\alpha_{old}+y = 12+9 = 21 \\ \beta'&=\beta_{old}+n-y = 115+165-9 = 271 \end{align} \]
Beta(1, 10) prior \[ \hat{p}_B = \frac{\alpha'}{\alpha'+\beta'}=\frac{10}{10+166} \simeq 0.056818 \] Beta(12, 115) prior, i.e. posterior of the older measurement \[ \hat{p}_B = \frac{\alpha'}{\alpha'+\beta'}=\frac{21}{21+271} \simeq 0.071918 \]
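To make the conjugate update explicit, a small helper can compute the posterior parameters from any Beta prior and Binomial data (a minimal sketch, not part of the original code; the values in the chunk below remain hardcoded as in the original):
# conjugate Beta-Binomial update: returns the posterior (alpha', beta')
beta.update <- function(alpha_, beta_, y_, n_){
  c(alpha=alpha_+y_, beta=beta_+n_-y_)
}
beta.update(1, 10, 9, 165)   # Beta(1, 10) prior   -> (10, 166)
beta.update(12, 115, 9, 165) # Beta(12, 115) prior -> (21, 271)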
alpha.w.1 <- 10
beta.w.1 <- 166
alpha.w.2 <- 21
beta.w.2 <- 271
mean.w.1 <- beta.mean(alpha.w.1, beta.w.1)
variance.w.1 <- beta.variance(alpha.w.1, beta.w.1)
cred.int.w.1 <- beta.cred.int(alpha.w.1, beta.w.1)
mean.w.2 <- beta.mean(alpha.w.2, beta.w.2)
variance.w.2 <- beta.variance(alpha.w.2, beta.w.2)
cred.int.w.2 <- beta.cred.int(alpha.w.2, beta.w.2)
df_ <- data.frame(row.names=c('Beta.1.10', 'Beta.12.115'))
df_['mean'] <- c(mean.w.1, mean.w.2)
df_['variance'] <- c(variance.w.1, variance.w.2)
df_['Cred.Int.Low'] <- c(cred.int.w.1[1], cred.int.w.2[1])
df_['Cred.Int.High'] <- c(cred.int.w.1[2], cred.int.w.2[2])
df_ %>%
kable(digits=6) %>%
kable_styling(full_width=FALSE)
prior | mean | variance | Cred.Int.Low | Cred.Int.High |
---|---|---|---|---|
Beta.1.10 | 0.056818 | 0.000303 | 0.027739 | 0.09538 |
Beta.12.115 | 0.071918 | 0.000228 | 0.045224 | 0.10415 |
Plot of the posteriors:
xsize <- 400
x <- seq(0, 1, length=xsize)
x.zoom <- seq(0, 0.15, length=xsize)
gg_w_12 <- ggplot()+
geom_line(aes(x, dbeta(x, alpha.w.1, beta.w.1), color='Beta(1, 10)'), size=0.6)+
geom_line(aes(x, dbeta(x, alpha.w.2, beta.w.2), color='Beta(12, 115)'), size=0.6, linetype='twodash')+
labs(
x='p',
y='P(p|y, n)',
title='Posterior distributions'
)+
xlim(0, 1)+
ylim(0, 30)+
scale_color_manual('Assumed prior', values=c('Beta(1, 10)'='navyblue', 'Beta(12, 115)'='orangered'))+
theme_bw()
gg_w_12_zoom <- ggplot()+
geom_line(aes(x.zoom, dbeta(x.zoom, alpha.w.1, beta.w.1)), color='navyblue', size=0.6)+
geom_line(aes(x.zoom, dbeta(x.zoom, alpha.w.2, beta.w.2)), color='orangered', size=0.6, linetype='twodash')+
labs(
x='p',
y='P(p|y, n)'
)+
ylim(0, 30)+
theme_bw()
gg_w_12 + annotation_custom(ggplotGrob(gg_w_12_zoom), xmin = 0.5, xmax = 1, ymin = 10, ymax = 30)
\[ H_0: p=0.1 \text{ versus } H_1: p\neq 0.1 \]
As in point (d), the null distribution is the sampling distribution of \(y\), given that \(H_0\) is true: \(Binom(y|n=165, p=0.1)\).
# useful quantities
n <- 165
y <- 9
p0 <- 0.1
alpha.value <- 0.05
Frequentist approach
gg_HT(n, p0, y)
glue('Value in y={y}: {round(dbinom(y, n, p0), 4)} < {alpha.value}')
## Value in y=9: 0.0146 < 0.05
\(y=9\) lies outside the acceptance region, so according to the Neyman and Pearson approach I reject the null hypothesis \(H_0\). The same result can be obtained by evaluating the p-value:
glue('p-value: {round(pbinom(y, n, p0)+1-pbinom(n-y-1, n, p0), 4)} < {alpha.value}')
## p-value: 0.0275 < 0.05
Bayesian approach
With the Bayesian approach it is necessary to distinguish between the two priors defined in point (f). For each of them, the null hypothesis is accepted or rejected depending on whether \(p_0\) falls inside the 95% credibility interval for \(p\) or not.
glue('
95% credibility interval for p, assuming a Beta(1, 10) prior:
[{round(cred.int.w.1[1], 4)}, {round(cred.int.w.1[2], 4)}]\n
95% credibility interval for p, assuming a Beta(12, 115) prior:
[{round(cred.int.w.2[1], 4)}, {round(cred.int.w.2[2], 4)}]'
)
## 95% credibility interval for p, assuming a Beta(1, 10) prior:
## [0.0277, 0.0954]
##
## 95% credibility interval for p, assuming a Beta(12, 115) prior:
## [0.0452, 0.1042]
The result differs in the two cases: in the first, \(p_0=0.1\) lies outside the interval, while in the second it falls inside. So, with the Bayesian approach I reject the null hypothesis when using the \(Beta(1, 10)\) prior, while I accept it when using the posterior of the older measurement as prior. This is reasonable, since \(p_0\) is very close to the upper bound of both 95% credibility intervals and, in the second case, the prior (and hence the posterior) is slightly shifted towards larger values of \(p\).
ggplot()+
geom_line(aes(x, dbeta(x, 1, 10), color='Beta(1, 10)'), size=0.6)+
geom_line(aes(x, dbeta(x, 12, 115), color='Beta(12, 115)'), size=0.6, linetype='twodash')+
labs(
x='p',
y='P(p)',
title='Prior distributions'
)+
xlim(0, 1)+
scale_y_continuous(breaks=seq(0, 16, by=4), limits=c(0, 16))+
scale_color_manual('', values=c('Beta(1, 10)'='navyblue', 'Beta(12, 115)'='orangered'))+
theme_bw()+
theme(legend.title=element_blank())
Starting from the coda output, here I define an R6 object which uses the ggmcmc package to perform the chain analysis and display the MCMC results. Some methods are just wrappers around the built-in ggmcmc functions.
chainR6 <- R6Class(
'chainR6',
public = list(
param = NULL, # string
chain.ggs = NULL, # coda.samples object
cred.int = NULL, # credibility interval
initialize = function(chain_, param){
self$param <- param
self$chain.ggs <- ggs(chain_)
},
# plot the chains
traceplot = function(title=''){
ggs_traceplot(self$chain.ggs)+
geom_line(color='steelblue')+
labs(
x='Iteration',
y='Value'
)+
{if (title!='') labs(title=title)}+
theme_bw()
},
# plot the density of the parameter
density = function(title=''){
self$chain.ggs %>%
filter(Parameter==self$param) %>%
ggs_density()+
geom_density(color='steelblue', fill='lightblue', alpha=0.6)+
labs(
x='Value',
y='Density'
)+
{if (title!='') labs(title=title)}+
theme_bw()
},
# plot running mean
running = function(title=''){
ggs_running(self$chain.ggs)+
{if (title!='') labs(title=title)}+
theme_bw()+
theme(legend.position="none")
},
# plot autocorrelation
autocorrelation = function(title=''){
ggs_autocorrelation(self$chain.ggs)+
geom_col(fill='steelblue')+
{if (title!='') labs(title=title)}+
theme_bw()
},
# plot the histogram of p, with the credibility interval
inference = function(y.cred.int=1, ci.round=3, subtitle='', xlim, ylim){
self$cred.int <- (
ci(self$chain.ggs, thick_ci = c(0.05, 0.95), thin_ci = c(0.025, 0.975))[1,c('low','high')]
)
self$chain.ggs %>%
filter(Parameter==self$param) %>%
ggplot(aes(x=value))+
geom_histogram(aes(y=..density..), bins=40, color='white', fill='forestgreen')+
geom_segment(aes(x = self$cred.int$low, y = 0, xend = self$cred.int$high, yend = 0),
size=2.5, color='darkred')+
annotate('label', x=(self$cred.int$low+self$cred.int$high)/2, y=y.cred.int,
label='95% Cred. Int.', size=6, fill='white', alpha=0.8)+
annotate('label', x=self$cred.int$low, y=y.cred.int,
label=paste(round(self$cred.int$low, ci.round)), size=6, fill='white', alpha=0.8)+
annotate('label', x=self$cred.int$high, y=y.cred.int,
label=paste(round(self$cred.int$high, ci.round)), size=6, fill='white', alpha=0.8)+
labs(
x=self$param,
y=paste('f(', self$param, ')', sep=''),
title=paste('Inference on ', self$param, sep='')
)+
{if (!missing(xlim)) xlim(xlim[1], xlim[2])}+
{if (!missing(ylim)) ylim(ylim[1], ylim[2])}+
{if (subtitle!='') labs(subtitle=subtitle)}+
theme_bw()
},
# plot the predictions of y
prediction = function(binomial=FALSE, n.next=NA, x.plot.max=NA, subtitle=''){
if (binomial){
Title <- paste('Number of successes in ', n.next, ' future trials', sep='')
xmax <- n.next
} else{
Title <- 'Predicted counts'
xmax <- x.plot.max
}
self$chain.ggs %>%
filter(Parameter=='y') %>%
ggplot(aes(x=value))+
geom_bar(aes(y = ..prop..), color='black', fill='firebrick')+
labs(
x='y',
y='f(y)',
title=Title
)+
{if (subtitle!='') labs(subtitle=subtitle)}+
scale_x_continuous(breaks=seq(0, xmax, by=1))+
theme_bw()
},
# plot correlation between p and y
correlation = function(x.min, x.max, y.min, y.max, subtitle=''){
self$chain.ggs %>%
summarise(
y=value[Parameter=='y'],
p=value[Parameter==self$param]
)%>%
ggplot()+
geom_point(aes(x=p, y=y), color='navyblue', shape=3)+
labs(
title=paste('Correlation between ', self$param, ' and the predicted variable', sep='')
)+
{if (subtitle!='') labs(subtitle=subtitle)}+
xlim(x.min, x.max)+
scale_y_continuous(breaks=seq(y.min, y.max, by=2), limits=c(y.min, y.max))+
theme_bw()
}
)
)
In exercises 4, 5 and 6 I run MCMC with JAGS. In all of them the chains behave well: the running means of \(p\) (or \(\lambda\)) and \(y\) converge rapidly and the autocorrelation decays quickly with the lag. Regarding the inference on \(p\) (or \(\lambda\)), the posterior distributions found with JAGS are compatible with the corresponding ones found analytically in the previous exercises (very similar shape, mean/mode and credibility interval).
Analyze the data of Exercise 1 using an MCMC with JAGS (solve only point c of Ex 1). Notice that in Exercise 1 we are dealing with a Bernoulli process, but we only know the number of successes out of \(n\) trials.
modelpath_e4 <- "./bugs_models/model_ex4.bug"
model_string <- "model {
# data likelihood (prob of y given p and n)
X ~ dbin(p, n);
# beta prior for p
p ~ dbeta(alpha, beta);
# predicted data , given p and the number of samples
y ~ dbin(p, n_next);
}
"
writeLines(model_string , con=modelpath_e4)
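Since the individual patient outcomes are not available, the model above works directly on the aggregated count. For completeness, this is a sketch of the equivalent individual-level Bernoulli formulation (hypothetical, assuming a 0/1 vector x of length n; it is not run here):
# equivalent individual-level formulation (hypothetical; not used below)
model_string_bern <- "model {
  # one Bernoulli outcome per patient (x[i] = 1 if the method fails)
  for (i in 1:n) {
    x[i] ~ dbern(p);
  }
  # same Beta prior for p
  p ~ dbeta(alpha, beta);
  # posterior predictive for n_next future patients
  y ~ dbin(p, n_next);
}
"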
data <- NULL
data$X <- 6 # observed number of failures of the new method
data$n <- 75 # number of patients screened
data$n_next <- 10 # number of future trials for the posterior predictive
data$alpha <- alpha.prior
data$beta <- beta.prior
jm <- jags.model(file=modelpath_e4, data=data, n.chains=1, n.adapt=500)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1
## Unobserved stochastic nodes: 2
## Total graph size: 7
##
## Initializing model
update(jm, 1000)
chain <- coda.samples(jm, c("p", "y"), n.iter=10000)
print(summary(chain))
##
## Iterations = 1501:11500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## p 0.0849 0.0305 0.000305 0.000427
## y 0.8392 0.9244 0.009244 0.009594
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## p 0.0354 0.0628 0.0812 0.103 0.155
## y 0.0000 0.0000 1.0000 1.000 3.000
# Correlation matrix
cat("Correlation matrix: \n")
## Correlation matrix:
as.data.frame(as.mcmc(chain)) %>%
cor() %>%
print()
## p y
## p 1.00000 0.33263
## y 0.33263 1.00000
chain_e4 <- chainR6$new(chain, param='p')
chain_e4$traceplot()
chain_e4$density()
chain_e4$running()
chain_e4$autocorrelation()
chain_e4$inference(y.cred.int=1.25, ylim=c(0, 15))
chain_e4$prediction(binomial=TRUE, n.next=data$n_next)
chain_e4$correlation(x.min=0, x.max=0.25, y.min=0, y.max=data$n_next)
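As an additional numeric check on the chain quality (a minimal sketch, not part of the original analysis), the coda package provides the effective sample size and the autocorrelation at selected lags for the sampled chain:
coda::effectiveSize(chain) # effective sample size for p and y
coda::autocorr.diag(chain) # autocorrelation at a few default lags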
Analyze the data of Exercise 2 using an MCMC with JAGS.
modelpath_e5 <- "./bugs_models/model_ex5.bug"
model_string <- "model {
# data likelihood
for (i in 1:length(X)) {
X[i] ~ dpois(lambda);
}
# uniform prior for lambda
lambda ~ dunif(min.prior, max.prior)
# predicted data , given lambda
y ~ dpois(lambda);
}
"
writeLines(model_string, con=modelpath_e5)
In this case, given the table in exercise 2, I need to generate the data by repeating each value of \(y\) the corresponding number of times (\(n_1\) or \(n_2\)). Furthermore, I have to keep the two sets of observations distinct.
df_horsekick %>%
kable() %>%
kable_styling(full_width=FALSE)
y | n.1 | n.2 |
---|---|---|
0 | 109 | 144 |
1 | 65 | 91 |
2 | 22 | 32 |
3 | 3 | 11 |
4 | 1 | 2 |
5 | 0 | 0 |
obs.1 <- rep(df_horsekick$y, df_horsekick$n.1)
obs.2 <- rep(df_horsekick$y, df_horsekick$n.2)
cat('Observations 1:\n', obs.1, '\n\nObservations 2:\n', obs.2)
## Observations 1:
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 4
##
## Observations 2:
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 4 4
Observations 1.
data <- NULL
data$X <- obs.1
data$min.prior <- 0
data$max.prior <- 10
jm <- jags.model(file=modelpath_e5, data=data, n.chains=1, n.adapt=500)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 200
## Unobserved stochastic nodes: 2
## Total graph size: 204
##
## Initializing model
update(jm, 1000)
chain <- coda.samples(jm, c("lambda", "y"), n.iter=10000)
print(summary(chain))
##
## Iterations = 1501:11500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## lambda 0.615 0.0554 0.000554 0.00068
## y 0.618 0.7916 0.007916 0.00842
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## lambda 0.511 0.577 0.614 0.651 0.728
## y 0.000 0.000 0.000 1.000 3.000
# Correlation matrix
cat("Correlation matrix: \n")
## Correlation matrix:
as.data.frame(as.mcmc(chain)) %>%
cor() %>%
print()
## lambda y
## lambda 1.00000 0.07345
## y 0.07345 1.00000
chain_e5_1 <- chainR6$new(chain, param='lambda')
Observations 2.
data <- NULL
data$X <- obs.2
data$min.prior <- 0
data$max.prior <- 10
jm <- jags.model(file=modelpath_e5, data=data, n.chains=1, n.adapt=500)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 280
## Unobserved stochastic nodes: 2
## Total graph size: 284
##
## Initializing model
update(jm, 1000)
chain <- coda.samples(jm, c("lambda", "y"), n.iter=10000)
print(summary(chain))
##
## Iterations = 1501:11500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## lambda 0.702 0.0504 0.000504 0.000646
## y 0.694 0.8304 0.008304 0.008304
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## lambda 0.607 0.668 0.702 0.735 0.804
## y 0.000 0.000 1.000 1.000 3.000
# Correlation matrix
cat("Correlation matrix: \n")
## Correlation matrix:
as.data.frame(as.mcmc(chain)) %>%
cor() %>%
print()
## lambda y
## lambda 1.000000 0.044954
## y 0.044954 1.000000
chain_e5_2 <- chainR6$new(chain, param='lambda')
tit.1 <- 'Observations 1'
tit.2 <- 'Observations 2'
ggarrange(chain_e5_1$traceplot(title=tit.1), chain_e5_2$traceplot(title=tit.2), ncol=2, nrow=1)
ggarrange(chain_e5_1$density(title=tit.1), chain_e5_2$density(title=tit.2), ncol=1, nrow=2)
ggarrange(chain_e5_1$running(title=tit.1), chain_e5_2$running(title=tit.2), ncol=2, nrow=1)
ggarrange(chain_e5_1$autocorrelation(title=tit.1), chain_e5_2$autocorrelation(title=tit.2), ncol=1, nrow=2)
ggarrange(
chain_e5_1$inference(y.cred.int=1.5, subtitle=tit.1, xlim=c(0.45, 0.9), ylim=c(0, 8)),
chain_e5_2$inference(y.cred.int=1.5, subtitle=tit.2, xlim=c(0.45, 0.9), ylim=c(0, 8)),
ncol=1, nrow=2
)
ggarrange(
chain_e5_1$prediction(x.plot.max=5, subtitle=tit.1),
chain_e5_2$prediction(x.plot.max=5, subtitle=tit.2),
ncol=1, nrow=2
)
ggarrange(
chain_e5_1$correlation(x.min=0.4, x.max=1, y.min=0, y.max=6, subtitle=tit.1),
chain_e5_2$correlation(x.min=0.4, x.max=1, y.min=0, y.max=6, subtitle=tit.2),
ncol=1, nrow=2
)
Analyze the data of Exercise 3 using an MCMC with JAGS (solve points b and c).
To solve this exercise I can use the same JAGS model defined for exercise 4.
data <- NULL
data$X <- 11 # observed number of samples with a high bacteria X level
data$n <- 116 # total number of water samples
data$n_next <- 10 # number of future samples for the posterior predictive
data$alpha <- 1
data$beta <- 10
jm <- jags.model(file=modelpath_e4, data=data, n.chains=1, n.adapt=500)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1
## Unobserved stochastic nodes: 2
## Total graph size: 7
##
## Initializing model
update(jm, 1000)
chain <- coda.samples(jm, c("p", "y"), n.iter=10000)
print(summary(chain))
##
## Iterations = 1501:11500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 10000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## p 0.0948 0.0256 0.000256 0.000325
## y 0.9467 0.9568 0.009568 0.009932
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## p 0.0508 0.0768 0.0927 0.111 0.15
## y 0.0000 0.0000 1.0000 1.000 3.00
# Correlation matrix
cat("Correlation matrix: \n")
## Correlation matrix:
as.data.frame(as.mcmc(chain)) %>%
cor() %>%
print()
## p y
## p 1.00000 0.25555
## y 0.25555 1.00000
chain_e6 <- chainR6$new(chain, param='p')
chain_e6$traceplot()
chain_e6$density()
chain_e6$running()
chain_e6$autocorrelation()
chain_e6$inference(y.cred.int=1.25)
chain_e6$prediction(binomial=TRUE, n.next=data$n_next)
chain_e6$correlation(x.min=0, x.max=0.25, y.min=0, y.max=data$n_next)
In the Bayesian approach, the estimator for \(p\) is the posterior mean.
p_estimate <- chain_e6$chain.ggs %>%
filter(
Parameter=='p'
) %>%
summarise(
Mean = mean(value),
Variance = var(value)
)
cat('Bayesian estimator for p:', p_estimate$Mean)
## Bayesian estimator for p: 0.094829
Notice that the Bayesian estimator for \(p\) found by running MCMC is almost identical to the analytical result obtained in point (c) of exercise 3. The following table shows that the same holds for the other properties of the distribution, such as the variance and the credibility interval.
p_estimate['Cred.Int.Low'] <- chain_e6$cred.int[1]
p_estimate['Cred.Int.High'] <- chain_e6$cred.int[2]
p_estimate %>%
kable(caption="Results of MCMC; posterior distribution of p") %>%
kable_styling(full_width=FALSE)
Mean | Variance | Cred.Int.Low | Cred.Int.High |
---|---|---|---|
0.09483 | 0.00065 | 0.05076 | 0.14959 |
It is also possible to overlay the analytical posterior on the histogram obtained from MCMC, for a visual comparison of the two distributions.
tmp <- chain_e6$chain.ggs %>%
filter(Parameter=='p') %>%
summarise(
min = min(value),
max = max(value),
len = length(value)
)
x <- seq(tmp$min, tmp$max, length=tmp$len)
chain_e6$inference(y.cred.int=1.25) +
geom_line(aes(x, dbeta(x, alpha.water, beta.water), color='Analytical Posterior'), size=0.8)+
scale_color_manual('', values=c('Analytical Posterior'='navyblue'))+
theme(legend.position='bottom')