knitr::opts_chunk$set(dev = "ragg_png")
options(digits=5) # set number of digits equal to 5
# tidyverse
library(tidyverse)
# others
library(tictoc)
library(gridExtra)
library(kableExtra)
library(glue)
library(highcharter)
library(latex2exp)
library(pracma)
library(pander)
library(htmltools)
options(highcharter.theme = hc_theme_elementary())
Consider the following triangular distribution \[ f(x)= \begin{cases} \frac{2(x-a)}{(b-a)(c-a)} & a\leq x < c \\ \frac{2(b-x)}{(b-a)(b-c)} & c\leq x < b \\ 0 & \text{otherwise} \end{cases} \] where \(c\in[a, b]\).
d_triang <- function(x, params){
a = params[1]
b = params[2]
c = params[3]
if (!(c>=a & c<=b)) stop('c is not between a and b')
return(
ifelse(
a<=x & x<c,
2*(x-a)/((b-a)*(c-a)),
ifelse(
c<=x & x<=b,
2*(b-x)/((b-a)*(b-c)),
0
)
)
)
}
a = -2
b = 2
c = 0
x_plot = seq(-3, 3, by=0.1)
# generic plot of a function
gg_triang <- function(x_values, funct, args, title='My function', xlab = 'x', ylab='f(x)'){
ggplot() +
geom_line(aes(x=x_values, y=funct(x_values, args)), size=0.6) +
labs(
title=title,
x=xlab,
y=ylab
) +
scale_x_continuous(breaks=seq(x_values[1], tail(x_values, 1), by=0.5)) +
theme_bw()
}
gg_triang(x_plot, d_triang, c(a, b, c), title='PDF of the Triangular distribution, with a=-2, b=2, c=0', xlab='X', ylab='f(X)')
# Cumulative distribution
p_triang <- function(x, params){
integrals = numeric(length(x))
for (i in seq_along(x)){
integrals[i] <- integrate(d_triang, lower=a, upper = x[i], params)$value # set lower to a=-2 instead than -Inf for speed reasons
}
return(integrals)
}
gg_triang(x_plot, p_triang, c(a, b, c), title='CDF of the Triangular distribution, with a=-2, b=2, c=0', xlab='X', ylab='F(X)')
# Inverse functions;
# bounds set such that it searches for a solution in [-2, 2], the only interval in which the cdf is invertible
inverse <- function(y, params){
uniroot(function(x){p_triang(x, params) - y}, lower = -2, upper = 2)$root
}
# Quantile function
q_triang <- function(y, params, lower.value=-2, upper.value=2){
output <- numeric(length(y))
for (i in seq_along(y)){
if (y[i]==1) output[i]<-upper.value # boundary value
else if (y[i]==0) output[i]<-lower.value # boundary value
else output[i] <- inverse(y[i], params)
}
return(output)
}
# Generate random numbers
r_triang_1 <- function(n, params, seed=1){
set.seed(seed)
q_triang(runif(n), params)
}
In order to use the acceptance/rejection method, we must find a finite interval \([a', b']\) in which the pdf is defined and a value \(M\in\mathbb{R}\) such that \(\forall x \in [a',b'] \; f(x)<M\).
My choices for the interval and \(M\) are:
d_triang(0, c(a, b, c))
## [1] 0.5
r_triang_2 <- function(n, f, params, seed=1, a_=a, b_=b, M=0.5){
set.seed(seed)
y = numeric(n)
i=1
while (i <= n){
u_1 <- runif(1, a_, b_)
u_2 <- runif(1, 0, 1)
x_1 <- a + (b-a)*u_1
if (u_2*M < f(u_1, params)){
y[i] <- u_1
i=i+1
}
}
return(y)
}
tic("Generate 10^4 random numbers using the inverse transform method")
generated_points <- r_triang_1(10000, c(a, b, c))
toc()
## elapsed time is 17.070000 seconds
gg_gen <- function(method){
ggplot() +
geom_histogram(aes(x=generated_points, y=..density..), binwidth = 0.1, center = 0.1, fill='lightblue', colour="black") +
stat_function(fun=function(x)d_triang(x, c(a, b, c)), color='firebrick', size=0.7) +
labs(title=paste('Sampling results using the', method, 'method'),
x='X',
y='Count (normalized)') +
scale_x_continuous(breaks=seq(-3, 3, by=0.5), limits=c(-3,3)) +
theme_bw()
}
gg_gen('inverse transform')
tic("Generate 10^4 random numbers using the acceptance/rejection method")
generated_points <- r_triang_2(10000, d_triang, c(a, b, c))
toc()
## elapsed time is 0.380000 seconds
gg_gen('acceptance/rejection')
Notice that the acceptance/rejection method is much faster than the inverse transform method. This is due to the fact that the CDF and its inverse are not computed analytically, but numerically, and this slows down the second method a lot.
Markov’s inequality represents an upper bound to probability distributions. Having defined a function \(G(k) = 1-F(k) \equiv P(X \geq k)\), plot \(G(k)\) and the Markov’s upper bound for
I want to start by defining some general functions, such as \(G(k)\), the Markov’s upper bound and the plotting function.
G <- function(cdf_values){
return(1-cdf_values)
}
markov_bound <- function(k, exp_value){
if (any(k<=0)) stop('k must be > 0')
return(exp_value/k)
}
gg_Markov <- function(k_values, cdf_values, exp_value, title="Markov's inequality", xlab = 'k', logscale=FALSE){
ggplot() +
geom_line(aes(x=k_values, y=markov_bound(k, exp_value), colour="Markov's bound"), size=0.6) + # Markov's upper bound
geom_line(aes(x=k_values, y=G(cdf_values), color='G(k)'), size=0.6) + # G(k)
labs(
title=title,
x=xlab
) +
{if (logscale==TRUE) {
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)))}
}+
scale_x_continuous(breaks=seq(1, 10, by=1)) +
scale_color_manual(values = c("Markov's bound" = 'firebrick', 'G(k)' = 'steelblue')) +
theme_bw() +
theme(axis.title.y=element_blank(), legend.title= element_blank())
}
Since the Markov’s upper bound diverges as \(k\) approaches zero, I plot the distribution starting from \(k=1\).
k <- seq(1, 10, by=0.05)
gg_Markov(k, cdf_values=pexp(k, 1), exp_value=1, title=TeX("Markov's inequality, exponential distribution ($\\lambda=1$)"))
k <- seq(2, 6, by=0.05)
gg_Markov(k, cdf_values=punif(k, 3, 5), exp_value=4, title=TeX("Markov's inequality, uniform distribution ($\\mu=3, \\sigma=5$)"))
The function \(Bin(n = 1, p = 1/2)\) describes the probability of \(x\) successes in 1 trial, where the probability of a success is \(1/2\). So there are only 2 possible value of \(x\) that give a non-zero probability: \(x=0\) and \(x=1\). Regarding the cumulative distribution, we have that \(F(x=0)=f(x=0)=0.5\) and \(F(x)=1 \; \forall x\geq 1\). So the only meaningful points are \(x=0\) and \(x=1\).
Moreover, as the Markov’s upper bound diverges in 0, it only makes sense to compute it in 1.
k <- 0:1
y <- G(pbinom(0:1, 1, 1/2))
ggplot() +
geom_point(aes(x=k, y=y, color='G(k)'), size=2.5) +
geom_segment(aes(x=k, xend=k, y=0, yend=y, color='G(k)')) +
geom_point(aes(x=1, y=markov_bound(1, 1/2), colour="Markov's bound"), size=2.5) +
labs(
title=TeX("Markov's inequality, binomial distribution ($n=1, p=1/2$)"),
x='k'
) +
scale_x_continuous(breaks=seq(-1, 2, by=1), limits=c(-0.5, 1.5)) +
scale_color_manual(values = c('G(k)' = 'steelblue', "Markov's bound" = 'firebrick')) +
theme_bw() +
theme(axis.title.y=element_blank(), legend.title= element_blank())
Also in this case I plot the distribution starting from \(k=1\), so not considering \(k=0\), that corresponds to an infinite value of the Markov’s bound. Here I use the logarithmic scale on the y axis, as \(G(k)\) immediately becomes very small.
k <- seq(1, 10, by=0.05)
gg_Markov(k, cdf_values=ppois(k, 1/2), exp_value=4, title=TeX("Markov's inequality, Poisson distribution ($\\lambda=1/2$)"), logscale=TRUE)
Use R to show, with a plot, that Chebyshev’s inequality is an upper bound to the following distributions:
First I need to define some general functions that I will use to show that Chebyshev’s inequality is an upper bound of the given distributions.
prob_ksigma
: returns, for a given probability
distribution, the probability that X deviates of at least \(k\cdot\sigma\) from its expected value, so
\[
\begin{align*}
P\left[|X-\mu|\geq k \sigma\right]&=P[X \in (-\infty,
\mu-k\sigma] \cup [\mu+k\sigma, \infty)]=P[X\leq \mu -k
\sigma]+P[X\geq\mu+k \sigma] = \\ & =
F[\mu-k\sigma]+(1-F[u+k\sigma])+P[X=\mu+k\sigma]
\end{align*}
\] While in the continuous case the last probability is zero, as
it is the probability of one single value, for a discrete distribution,
when \(\mu+k\sigma\) is integer, it
must be considered .chebyshev_bound
: Chebyshev’s upper bound to the above
probabilities \[
P\left[|X-\mu|\geq k \sigma\right]\leq\frac{1}{k^2}
\]gg_Chebyshev
: plot function.prob_ksigma <- function(k, cdf, mu, sigma){
return(cdf(mu-k*sigma)+1-cdf(mu+k*sigma))
}
chebyshev_bound <- function(k){
return(1/k^2)
}
gg_Chebyshev <- function(k_values, cdf, mu, sigma, title="Chebyshev's inequality", xlab = 'k', logscale=FALSE){
ggplot() +
geom_line(aes(x=k_values, y=chebyshev_bound(k_values), colour='M'), size=0.6) + # Chebyshev's upper bound
geom_line(aes(x=k_values, y=prob_ksigma(k_values, cdf, mu, sigma), color='P'), size=0.6) + # P(k...)
labs(
title=title,
x=xlab
) +
{if (logscale==TRUE) {
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x)))}
}+
scale_x_continuous(breaks=seq(1, 10, by=1)) +
scale_color_manual(values = c("M" = 'firebrick', 'P' = 'steelblue'), labels=unname(TeX(c("Chebishev's bound", "$P(|X-\\mu|\\geq k\\sigma)$")))) +
theme_bw() +
theme(axis.title.y=element_blank(), legend.title= element_blank())
}
Test of the function prob_ksigma
:
for (i in 1:3){
cat('Probability that X is within', i, 'sigma in the normal distribution:', 1-prob_ksigma(i, function(x) pnorm(x, 0, 1), 0, 1), '\n')
}
## Probability that X is within 1 sigma in the normal distribution: 0.68269
## Probability that X is within 2 sigma in the normal distribution: 0.9545
## Probability that X is within 3 sigma in the normal distribution: 0.9973
k <- seq(1, 10, by=0.1)
gg_Chebyshev(k, function(x) pnorm(x, 3, 5), mu=3, sigma=5, title=TeX("Chebyshev's inequality, normal distribution ($\\mu=3, \\sigma=5$)"))
lambda <- 1
gg_Chebyshev(k, function(x) pexp(x, lambda), mu=1/lambda, sigma=1/lambda, title=TeX("Chebyshev's inequality, exponential distribution ($\\lambda=1)"))
a <- 1-sqrt(2)
b <- 1+sqrt(2)
sigma_un <- (b-a)/sqrt(12)
# k such that k*sigma = b-a/2
k_zero <- (b-a)/(2*sigma_un)
gg_Chebyshev(seq(1, 10, by=0.05), function(x) punif(x, a, b), mu=(a+b)/2, sigma=sigma_un, title=TeX("Chebyshev's inequality, uniform distribution ($a=1-\\sqrt{2}, b=1+\\sqrt{2}$)")) +
geom_point(aes(k_zero, 0), color='black', size=2) +
annotate('text', x = k_zero+0.7, y = 0.04, label = '(1.73, 0)') +
theme(plot.title = element_text(size=12))
cat('k such that k*sigma is equal to half the length of the interval:', k_zero)
## k such that k*sigma is equal to half the length of the interval: 1.7321
For larger values of \(k\), the probability of being outside the interval \((\mu-k\sigma, \mu+k\sigma)\) is obviously zero, as the distribution is always zero in the region outside \((a, b)\).
As \(\lambda=1/3\), we have that
\(\mu=1/3\) and \(\sigma=1/\sqrt{3}\). For this reason and
for the numerical uncertainty due to computer approximations, \(\mu+k\sigma\) is unlikely to be integer.
Moreover, the probability function of the Poisson distribution is
defined only for integer value. All this leads me to neglect \(P[X=\mu+k\sigma]\) in the formula defined
above. This makes even more sense if we think that to have \(1/3+k/\sqrt{3}\) integer \(k\) would have to be an integer multiple of
\(\sqrt{3}\), so an irrational number
and a number that surely won’t be passed to
prob_ksigma
.
lambda <- 1/3
gg_Chebyshev(k, function(x) ppois(x, lambda), mu=lambda, sigma=sqrt(lambda), title=TeX("Chebyshev's inequality, Poisson distribution ($\\lambda=1/3)"), logscale=TRUE)
The six boxes toy model is described in [1].
Write a program in R that:
j
–> vector containing the numbers associated to
each boxp_H
–> vector containing the probability of each
hypothesis (box)p_W_H
–> vector of the probabilities of observing a
white, given the box \(j\)p_W_B
–> vector of the probabilities of observing a
black, given the box \(j\)j <- 0:5
# initialize the probabilities
p_H <- c(rep(1/6, 6))
# conditional probability of observing a white or black
p_W_H <- j/5
p_B_H <- (5-j)/5
# dataframe of all the probabilities, as a function of the extraction step
prob_H_steps <- data.frame(matrix(ncol=length(j)+1, nrow=0))
colnames(prob_H_steps) <- c('H_0', 'H_1', 'H_2', 'H_3', 'H_4', 'H_5', 'color')
i <- 1
cat('\014')
# plot function
hc_plot <- function(i_H){
hc <- prob_H_steps %>%
mutate_if(is.numeric, round, digits=4)
hc <- {if(i_H == 0){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_0, group=color), color=col)
}else if(i_H == 1){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_1, group=color), color=col)
}else if(i_H == 2){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_2, group=color), color=col)
}else if(i_H == 3){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_3, group=color), color=col)
}else if(i_H == 4){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_4, group=color), color=col)
}else if(i_H == 5){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_5, group=color), color=col)}} %>%
hc_title(text=paste('H', i_H, sep=''), align='center') %>%
hc_xAxis(title = list(text = 'Extraction step')) %>%
hc_yAxis(title = list(text=paste('P(H', i_H, ')', sep='')), min=0, max=1)
}
check_unique_color <- TRUE
repeat{
i_color <- menu(c('White', 'Black', 'End the game'), title='What color has been extracted?')
if (i_color == 3) break
# update p_H depending on the extracted color
if (i_color == 1){
p_H <- (p_W_H*p_H)/dot(p_W_H, p_H)
col <- 'W'
}else if(i_color == 2){
p_H <- (p_B_H*p_H)/dot(p_B_H, p_H)
col <- 'B'
}
# print on the standard output the probability of selecting each box
cat('\nProbability of selecting each box:\n\n')
print(p_H)
cat('\n')
# update the dataframe with the probabilities
prob_H_steps[i,] <- append(p_H, list(col))
i <- i+1
# check if the extracted stones are of one color only and update plot color list accordingly
if (check_unique_color){
if (length(unique(prob_H_steps$color))==1){
if (unique(prob_H_steps$color)=='W'){
col <- c('steelblue')
}else if (unique(prob_H_steps$color)=='B'){
col <- c('firebrick')
}
}else{
col <- c('firebrick', 'steelblue')
check_unique_col <- FALSE
}
}
# plot in browser
save_html(hw_grid(hc_plot(0), hc_plot(1), hc_plot(2), hc_plot(3), hc_plot(4), hc_plot(5), ncol=3, rowheight=350), file="6_boxes_toy_model.html")
openFileInOS("6_boxes_toy_model.html")
}
head(prob_H_steps[dim(prob_H_steps)[1]:1, 1:(dim(prob_H_steps)[2]-1)], 5) %>%
kable(row.names=TRUE, digits=4, caption='Probability of selecting each box in the last 5 steps') %>%
kable_styling()
hw_grid(hc_plot(0), hc_plot(1), hc_plot(2), hc_plot(3), hc_plot(4), hc_plot(5), ncol=3, rowheight=350)
The resulting plot of my simulation can be downloaded here (download the entire folder, including the subfolder “lib”): 6 boxes toy model.
Consider again the six boxes toy model of the previous exercise and write a simulation program that:
n_steps <- 20
box <- sample(0:5, 1)
j <- 0:5
# initialize the probabilities
p_H <- c(rep(1/6, 6))
# conditional probability of observing a white or black
p_W_H <- j/5
p_B_H <- (5-j)/5
# conditional probability of observing a white or black
p_W <- box/5
p_B <- (5-box)/5
# dataframe of all the probabilities, as a function of the extraction step
prob_H_steps <- data.frame(matrix(ncol=length(j)+1, nrow=0))
colnames(prob_H_steps) <- c('H_0', 'H_1', 'H_2', 'H_3', 'H_4', 'H_5', 'color')
i <- 1
while(i<=n_steps){
i_color <- sample(c(1, 2), 1, prob=c(p_W, p_B))
# update p_H depending on the extracted color
if (i_color == 1){
p_H <- (p_W_H*p_H)/dot(p_W_H, p_H)
col <- 'W'
}else if(i_color == 2){
p_H <- (p_B_H*p_H)/dot(p_B_H, p_H)
col <- 'B'
}
# update the dataframe with the probabilities
prob_H_steps[i,] <- append(p_H, list(col))
i <- i+1
}
# print on the standard output the probability of selecting each box
cat('Probability of selecting each box:\n\n')
## Probability of selecting each box:
print(p_H)
## [1] 0.0000e+00 2.2769e-10 3.3574e-05 2.2053e-02 9.7791e-01 0.0000e+00
cat('\n')
# plot function (equal to the one of the exercise 4)
hc_plot <- function(i_H){
hc <- prob_H_steps %>%
mutate_if(is.numeric, round, digits=4)
hc <- {if(i_H == 0){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_0, group=color), color=col)
}else if(i_H == 1){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_1, group=color), color=col)
}else if(i_H == 2){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_2, group=color), color=col)
}else if(i_H == 3){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_3, group=color), color=col)
}else if(i_H == 4){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_4, group=color), color=col)
}else if(i_H == 5){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_5, group=color), color=col)}} %>%
hc_title(text=paste('H', i_H, sep=''), align='center') %>%
hc_xAxis(title = list(text = 'Extraction step')) %>%
hc_yAxis(title = list(text=paste('P(H', i_H, ')', sep='')), min=0, max=1)
}
# check if the extracted stones are of one color only and update plot color list accordingly
if (length(unique(prob_H_steps$color))==1){
if (unique(prob_H_steps$color)=='W'){
col <- c('steelblue')
}else if (unique(prob_H_steps$color)=='B'){
col <- c('firebrick')
}
}else{
col <- c('firebrick', 'steelblue')
}
# plot
hw_grid(hc_plot(0), hc_plot(1), hc_plot(2), hc_plot(3), hc_plot(4), hc_plot(5), ncol=3, rowheight=350)