knitr::opts_chunk$set(dev = "ragg_png")
options(digits=5) # set number of digits equal to 5

Packages and functions

# tidyverse
library(tidyverse)

# others
library(tictoc)
library(gridExtra)
library(kableExtra)
library(glue)
library(highcharter)
library(latex2exp)
library(pracma)
library(pander)
library(htmltools)

options(highcharter.theme = hc_theme_elementary())

Exercise 1

Consider the following triangular distribution \[ f(x)= \begin{cases} \frac{2(x-a)}{(b-a)(c-a)} & a\leq x < c \\ \frac{2(b-x)}{(b-a)(b-c)} & c\leq x < b \\ 0 & \text{otherwise} \end{cases} \] where \(c\in[a, b]\).

a) Plot the function, given the interval (a, b)

d_triang <- function(x, params){
  a = params[1]
  b = params[2]
  c = params[3]
  
  if (!(c>=a & c<=b)) stop('c is not between a and b')
  
  return(
    ifelse(
      a<=x & x<c, 
      2*(x-a)/((b-a)*(c-a)), 
      ifelse(
        c<=x & x<=b, 
        2*(b-x)/((b-a)*(b-c)), 
        0
      )
    )
  )
}

a = -2
b = 2
c = 0

x_plot = seq(-3, 3, by=0.1)

# generic plot of a function
gg_triang <- function(x_values, funct, args, title='My function', xlab = 'x', ylab='f(x)'){
  ggplot() +
    geom_line(aes(x=x_values, y=funct(x_values, args)), size=0.6) +
    labs(
      title=title, 
      x=xlab, 
      y=ylab
    ) +
    scale_x_continuous(breaks=seq(x_values[1], tail(x_values, 1), by=0.5)) +
    theme_bw()
}

gg_triang(x_plot, d_triang, c(a, b, c), title='PDF of the Triangular distribution, with a=-2, b=2, c=0', xlab='X', ylab='f(X)')

b) Write an algorithm to generate random numbers from the triangular distribution

Inverse transform method
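The inverse transform method exploits the fact that, if \(U\sim\mathrm{Unif}(0,1)\) and \(F\) is a continuous CDF, then \(X=F^{-1}(U)\) is distributed according to \(F\). Since I do not invert the triangular CDF analytically here, \(F\) is computed by numerical integration and then inverted with uniroot.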

# Cumulative distribution
p_triang <- function(x, params){
  integrals = numeric(length(x))
  for (i in seq_along(x)){
    integrals[i] <- integrate(d_triang, lower=params[1], upper=x[i], params=params)$value  # lower set to a (=-2) instead of -Inf for speed reasons
  }
  return(integrals)
}

gg_triang(x_plot, p_triang, c(a, b, c), title='CDF of the Triangular distribution, with a=-2, b=2, c=0', xlab='X', ylab='F(X)')

# Inverse functions;
# bounds set such that it searches for a solution in [-2, 2], the only interval in which the cdf is invertible
inverse <- function(y, params){
    uniroot(function(x){p_triang(x, params) - y}, lower = -2, upper = 2)$root
}

# Quantile function
q_triang <- function(y, params, lower.value=-2, upper.value=2){
  output <- numeric(length(y))
  for (i in seq_along(y)){
    if (y[i]==1) output[i]<-upper.value       # boundary value
    else if (y[i]==0) output[i]<-lower.value  # boundary value
    else output[i] <- inverse(y[i], params)
  }
  return(output)
}

# Generate random numbers 
r_triang_1 <- function(n, params, seed=1){
  set.seed(seed)
  q_triang(runif(n), params)
}

Acceptance/rejection method

In order to use the acceptance/rejection method, we must find a finite interval \([a', b']\) outside which the pdf vanishes, and a value \(M\in\mathbb{R}\) such that \(f(x)\leq M \; \forall x \in [a',b']\).

My choices for the interval and \(M\) are:

  • \([a', b'] = [a, b] = [-2, 2]\)
  • \(M=\max_{x\in[a, b]} f(x) = f(x=0) = 0.5\)
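With these choices, a proposed point is accepted with probability \(\frac{1}{(b-a)\,M} = \frac{1}{4 \cdot 0.5} = 0.5\), so on average two proposals are needed for each generated number.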
d_triang(0, c(a, b, c))
## [1] 0.5
r_triang_2 <- function(n, f, params, seed=1, a_=a, b_=b, M=0.5){
  
  set.seed(seed)
  y = numeric(n)

  i=1
  while (i <= n){
    u_1 <- runif(1, a_, b_)   # candidate point, already uniform on [a', b']
    u_2 <- runif(1, 0, 1)
    
    if (u_2*M < f(u_1, params)){
      y[i] <- u_1
      i=i+1
    }
  }
  return(y)
}

c) Generate \(10^4\) random numbers from the distribution, show them in a histogram and superimpose the analytical curve

Inverse transform method

tic("Generate 10^4 random numbers using the inverse transform method")
generated_points <- r_triang_1(10000, c(a, b, c))
toc()
## elapsed time is 17.070000 seconds
gg_gen <- function(method){
  ggplot() + 
    geom_histogram(aes(x=generated_points, y=..density..),  binwidth = 0.1, center = 0.1, fill='lightblue', colour="black") +
    stat_function(fun=function(x)d_triang(x, c(a, b, c)), color='firebrick', size=0.7) +
    labs(title=paste('Sampling results using the', method, 'method'), 
        x='X',
        y='Count (normalized)') +
    scale_x_continuous(breaks=seq(-3, 3, by=0.5), limits=c(-3,3)) +
    theme_bw()
}

gg_gen('inverse transform')

Acceptance/rejection method

tic("Generate 10^4 random numbers using the acceptance/rejection method")
generated_points <- r_triang_2(10000, d_triang, c(a, b, c))
toc()
## elapsed time is 0.380000 seconds
gg_gen('acceptance/rejection')

Notice that the acceptance/rejection method is much faster than the inverse transform method. This is because the CDF and its inverse are not computed analytically but numerically, which slows the inverse transform method down considerably.
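As a side note, the triangular CDF can also be inverted in closed form, which would remove the numerical root finding entirely. A minimal sketch (the names q_triang_exact and r_triang_exact are mine and are not used elsewhere in this report):

q_triang_exact <- function(u, params){
  a <- params[1]
  b <- params[2]
  c <- params[3]
  F_c <- (c-a)/(b-a)   # value of the CDF at the mode c
  ifelse(
    u <= F_c, 
    a + sqrt(u*(b-a)*(c-a)),       # inverse of F(x) = (x-a)^2/((b-a)(c-a))
    b - sqrt((1-u)*(b-a)*(b-c))    # inverse of F(x) = 1 - (b-x)^2/((b-a)(b-c))
  )
}

r_triang_exact <- function(n, params, seed=1){
  set.seed(seed)
  q_triang_exact(runif(n), params)
}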

Exercise 2 - Markov’s inequality

Markov’s inequality provides an upper bound on the tail of a probability distribution. Having defined the function \(G(k) = 1-F(k) \equiv P(X \geq k)\), plot \(G(k)\) and Markov’s upper bound for the following distributions.
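Markov’s inequality states that, for a non-negative random variable \(X\) and any \(k>0\), \[ P(X \geq k) \leq \frac{E[X]}{k}, \] which is the bound implemented in markov_bound below.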

I start by defining some general functions: \(G(k)\), Markov’s upper bound and the plotting function.

G <- function(cdf_values){
  return(1-cdf_values)
}

markov_bound <- function(k, exp_value){
  if (any(k<=0)) stop('k must be > 0')
  return(exp_value/k)
}
  
gg_Markov <- function(k_values, cdf_values, exp_value, title="Markov's inequality", xlab = 'k', logscale=FALSE){
  ggplot() +
    geom_line(aes(x=k_values, y=markov_bound(k_values, exp_value), colour="Markov's bound"), size=0.6) +   # Markov's upper bound
    geom_line(aes(x=k_values, y=G(cdf_values), color='G(k)'), size=0.6) +                           # G(k)
    labs(
      title=title, 
      x=xlab 
    ) +
    {if (logscale==TRUE) {
        scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
                      labels = scales::trans_format("log10", scales::math_format(10^.x)))}
    }+
    scale_x_continuous(breaks=seq(1, 10, by=1)) +
    scale_color_manual(values = c("Markov's bound" = 'firebrick', 'G(k)' = 'steelblue')) +
    theme_bw() +
    theme(axis.title.y=element_blank(), legend.title= element_blank())
}

a) Exponential distribution function

Since Markov’s upper bound diverges as \(k\) approaches zero, I start the plot from \(k=1\).

k <- seq(1, 10, by=0.05)
gg_Markov(k, cdf_values=pexp(k, 1), exp_value=1, title=TeX("Markov's inequality, exponential distribution ($\\lambda=1$)"))

b) Uniform distribution function

k <- seq(2, 6, by=0.05)
gg_Markov(k, cdf_values=punif(k, 3, 5), exp_value=4, title=TeX("Markov's inequality, uniform distribution ($a=3, b=5$)"))

c) Binomial distribution function

The function \(Bin(n = 1, p = 1/2)\) describes the probability of \(x\) successes in 1 trial, where the probability of a success is \(1/2\). So there are only 2 possible values of \(x\) that give a non-zero probability: \(x=0\) and \(x=1\). Regarding the cumulative distribution, we have \(F(x=0)=f(x=0)=0.5\) and \(F(x)=1 \; \forall x\geq 1\). So the only meaningful points are \(x=0\) and \(x=1\).

Moreover, since Markov’s upper bound diverges at 0, it only makes sense to compute it at \(k=1\).
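As a worked check at \(k=1\): Markov’s bound is \(E[X]/1 = 1/2\) and \(P(X \geq 1) = 1/2\), so the bound is actually attained here. Note, however, that for a discrete variable \(1-F(1) = P(X>1) = 0\), which is the value of \(G(k)\) shown in the plot below.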

k <- 0:1
y <- G(pbinom(0:1, 1, 1/2))
ggplot() +
  geom_point(aes(x=k, y=y, color='G(k)'), size=2.5) + 
  geom_segment(aes(x=k, xend=k, y=0, yend=y, color='G(k)')) +
  geom_point(aes(x=1, y=markov_bound(1, 1/2), colour="Markov's bound"), size=2.5) + 
  labs(
      title=TeX("Markov's inequality, binomial distribution ($n=1, p=1/2$)"), 
      x='k' 
    ) +
  scale_x_continuous(breaks=seq(-1, 2, by=1), limits=c(-0.5, 1.5)) +
  scale_color_manual(values = c('G(k)' = 'steelblue', "Markov's bound" = 'firebrick')) +
  theme_bw() +
  theme(axis.title.y=element_blank(), legend.title= element_blank())

d) Poisson distribution function

Also in this case I start the plot from \(k=1\), excluding \(k=0\), which corresponds to an infinite value of Markov’s bound. Here I use a logarithmic scale on the y axis, as \(G(k)\) quickly becomes very small.

k <- seq(1, 10, by=0.05)
gg_Markov(k, cdf_values=ppois(k, 1/2), exp_value=1/2, title=TeX("Markov's inequality, Poisson distribution ($\\lambda=1/2$)"), logscale=TRUE)

Exercise 3 - Chebyshev’s inequality

Use R to show, with a plot, that Chebyshev’s inequality is an upper bound to the following distributions:

First I define some general functions that I will use to show that Chebyshev’s inequality is an upper bound for the given distributions.
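For reference, Chebyshev’s inequality states that, for any distribution with finite mean \(\mu\) and standard deviation \(\sigma\), \[ P(|X-\mu| \geq k\sigma) \leq \frac{1}{k^2} \quad \text{for every } k>0. \]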

prob_ksigma <- function(k, cdf, mu, sigma){
  return(cdf(mu-k*sigma)+1-cdf(mu+k*sigma))
}

chebyshev_bound <- function(k){
  return(1/k^2)
}

gg_Chebyshev <- function(k_values, cdf, mu, sigma, title="Chebyshev's inequality", xlab = 'k', logscale=FALSE){
  ggplot() +
    geom_line(aes(x=k_values, y=chebyshev_bound(k_values), colour='M'), size=0.6) +               # Chebyshev's upper bound
    geom_line(aes(x=k_values, y=prob_ksigma(k_values, cdf, mu, sigma), color='P'), size=0.6) +    # P(k...)
    labs(
      title=title, 
      x=xlab 
    ) +
    {if (logscale==TRUE) {
        scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
                      labels = scales::trans_format("log10", scales::math_format(10^.x)))}
    }+
    scale_x_continuous(breaks=seq(1, 10, by=1)) +
    scale_color_manual(values = c("M" = 'firebrick', 'P' = 'steelblue'), labels=unname(TeX(c("Chebishev's bound", "$P(|X-\\mu|\\geq k\\sigma)$")))) + 
    theme_bw() +
    theme(axis.title.y=element_blank(), legend.title= element_blank())
  
}

a) Normal distribution

Test of the function prob_ksigma:

for (i in 1:3){
  cat('Probability that X is within', i, 'sigma in the normal distribution:', 1-prob_ksigma(i, function(x) pnorm(x, 0, 1), 0, 1), '\n')
}
## Probability that X is within 1 sigma in the normal distribution: 0.68269 
## Probability that X is within 2 sigma in the normal distribution: 0.9545 
## Probability that X is within 3 sigma in the normal distribution: 0.9973
k <- seq(1, 10, by=0.1)

gg_Chebyshev(k, function(x) pnorm(x, 3, 5), mu=3, sigma=5, title=TeX("Chebyshev's inequality, normal distribution ($\\mu=3, \\sigma=5$)"))

b) Exponential distribution

lambda <- 1
gg_Chebyshev(k, function(x) pexp(x, lambda), mu=1/lambda, sigma=1/lambda, title=TeX("Chebyshev's inequality, exponential distribution ($\\lambda=1$)"))

c) Uniform distribution

a <- 1-sqrt(2)
b <- 1+sqrt(2)
sigma_un <- (b-a)/sqrt(12)

# k such that k*sigma = (b-a)/2
k_zero <- (b-a)/(2*sigma_un)

gg_Chebyshev(seq(1, 10, by=0.05), function(x) punif(x, a, b), mu=(a+b)/2, sigma=sigma_un, title=TeX("Chebyshev's inequality, uniform distribution ($a=1-\\sqrt{2}, b=1+\\sqrt{2}$)")) + 
  geom_point(aes(k_zero, 0), color='black', size=2) +
  annotate('text', x = k_zero+0.7, y = 0.04, label = '(1.73, 0)') +
  theme(plot.title = element_text(size=12))

cat('k such that k*sigma is equal to half the length of the interval:', k_zero)
## k such that k*sigma is equal to half the length of the interval: 1.7321

For larger values of \(k\), the probability of being outside the interval \((\mu-k\sigma, \mu+k\sigma)\) is obviously zero, as the density is zero outside \((a, b)\).

d) Poisson distribution

As \(\lambda=1/3\), we have \(\mu=1/3\) and \(\sigma=1/\sqrt{3}\). For this reason, and because of the numerical uncertainty due to computer approximations, \(\mu+k\sigma\) is essentially never an integer; moreover, the probability function of the Poisson distribution is defined only for integer values. This leads me to neglect \(P[X=\mu+k\sigma]\) in the formula defined above. This makes even more sense if we notice that, for \(1/3+k/\sqrt{3}\) to be an integer, \(k\) would have to equal \((n-1/3)\sqrt{3}\) for some integer \(n\), i.e. an irrational number that will certainly never be passed to prob_ksigma.
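A quick numerical check of this (my own snippet, taking \(k=2\) as an example): ppois evaluated at a non-integer threshold equals ppois at its floor, so the term \(P[X=\mu+k\sigma]\) could only contribute if \(\mu+k\sigma\) were an integer.

t_ex <- 1/3 + 2/sqrt(3)                        # mu + k*sigma for k = 2
ppois(t_ex, 1/3) == ppois(floor(t_ex), 1/3)    # TRUE: ppois uses the floor of a non-integer quantile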

lambda <- 1/3
gg_Chebyshev(k, function(x) ppois(x, lambda), mu=lambda, sigma=sqrt(lambda), title=TeX("Chebyshev's inequality, Poisson distribution ($\\lambda=1/3$)"), logscale=TRUE)

Exercise 4 - Six Boxes Toy Model : inference

The six boxes toy model is described in [1].

Write a program in R that:

  1. allows the user to insert the color of a randomly extracted stone, after having randomly chosen one box
  2. prints on the standard output the probability of selecting each box
  3. plots the probability for each box as a function of the extraction step

Description of the boxes

  • \(H_0\): 5 black stones
  • \(H_1\): 4 black stones, 1 white stone
  • \(H_2\): 3 black stones, 2 white stones
  • \(H_3\): 2 black stones, 3 white stones
  • \(H_4\): 1 black stone, 4 white stones
  • \(H_5\): 5 white stones
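After each extraction, the probability of each box is updated with Bayes’ theorem; for a white stone, for instance, \[ P(H_j \mid W) = \frac{P(W \mid H_j)\,P(H_j)}{\sum_{i=0}^{5} P(W \mid H_i)\,P(H_i)}, \] which is the normalized product computed at every step of the loop below.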

Variables

  • j –> vector containing the numbers associated to each box
  • p_H –> vector containing the probability of each hypothesis (box)
  • p_W_H –> vector of the probabilities of observing a white, given the box \(j\)
  • p_B_H –> vector of the probabilities of observing a black, given the box \(j\)
j <- 0:5 

# initialize the probabilities
p_H <- c(rep(1/6, 6))

# conditional probability of observing a white or black
p_W_H <- j/5
p_B_H <- (5-j)/5


# dataframe of all the probabilities, as a function of the extraction step
prob_H_steps <- data.frame(matrix(ncol=length(j)+1, nrow=0))
colnames(prob_H_steps) <- c('H_0', 'H_1', 'H_2', 'H_3', 'H_4', 'H_5', 'color')

i <- 1
cat('\014')

# plot function
hc_plot <- function(i_H){
  hc <- prob_H_steps %>% 
    mutate_if(is.numeric, round, digits=4) 
    
  hc <- {if(i_H == 0){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_0, group=color), color=col)
    }else if(i_H == 1){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_1, group=color), color=col)
    }else if(i_H == 2){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_2, group=color), color=col)
    }else if(i_H == 3){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_3, group=color), color=col)
    }else if(i_H == 4){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_4, group=color), color=col)
    }else if(i_H == 5){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_5, group=color), color=col)}} %>%
    hc_title(text=paste('H', i_H, sep=''), align='center') %>% 
    hc_xAxis(title = list(text = 'Extraction step')) %>%
    hc_yAxis(title = list(text=paste('P(H', i_H, ')', sep='')), min=0, max=1) 
}

check_unique_color <- TRUE

repeat{
  i_color <- menu(c('White', 'Black', 'End the game'), title='What color has been extracted?')
  if (i_color == 3) break
  
  # update p_H depending on the extracted color
  if (i_color == 1){
    p_H <- (p_W_H*p_H)/dot(p_W_H, p_H)
    col <- 'W'
    
  }else if(i_color == 2){
    p_H <- (p_B_H*p_H)/dot(p_B_H, p_H)
    col <- 'B'
  }
  
  # print on the standard output the probability of selecting each box
  cat('\nProbability of selecting each box:\n\n')
  print(p_H)
  cat('\n')
  
  # update the dataframe with the probabilities
  prob_H_steps[i,] <- append(p_H, list(col))
  i <- i+1
  
  # check if the extracted stones are of one color only and update plot color list accordingly
  if (check_unique_color){ 
    if (length(unique(prob_H_steps$color))==1){
      if (unique(prob_H_steps$color)=='W'){
        col <- c('steelblue')
      }else if (unique(prob_H_steps$color)=='B'){
        col <- c('firebrick')
      }
    }else{
      col <- c('firebrick', 'steelblue')
      check_unique_color <- FALSE   # stop re-checking once both colors have appeared
    }
  }
  
  # plot in browser
  save_html(hw_grid(hc_plot(0), hc_plot(1), hc_plot(2), hc_plot(3), hc_plot(4), hc_plot(5), ncol=3, rowheight=350), file="6_boxes_toy_model.html")
  openFileInOS("6_boxes_toy_model.html")
}

head(prob_H_steps[dim(prob_H_steps)[1]:1, 1:(dim(prob_H_steps)[2]-1)], 5) %>% 
  kable(row.names=TRUE, digits=4, caption='Probability of selecting each box in the last 5 steps') %>%
  kable_styling()
hw_grid(hc_plot(0), hc_plot(1), hc_plot(2), hc_plot(3), hc_plot(4), hc_plot(5), ncol=3, rowheight=350)

The resulting plot of my simulation can be downloaded here (download the entire folder, including the subfolder “lib”): 6 boxes toy model.

Exercise 5 - Six Boxes Toy Model : simulation

Consider again the six boxes toy model of the previous exercise and write a simulation program that:

  1. selects a random box
  2. makes random sampling from the box
  3. prints on the standard output the probability of selecting each box
  4. plots the probability for each box as a function of the number of trials
n_steps <- 20
box <- sample(0:5, 1)

j <- 0:5 

# initialize the probabilities
p_H <- c(rep(1/6, 6))

# conditional probability of observing a white or black
p_W_H <- j/5
p_B_H <- (5-j)/5

# probability of extracting a white or black stone from the randomly selected box
p_W <- box/5
p_B <- (5-box)/5

# dataframe of all the probabilities, as a function of the extraction step
prob_H_steps <- data.frame(matrix(ncol=length(j)+1, nrow=0))
colnames(prob_H_steps) <- c('H_0', 'H_1', 'H_2', 'H_3', 'H_4', 'H_5', 'color')

i <- 1

while(i<=n_steps){
  i_color <- sample(c(1, 2), 1, prob=c(p_W, p_B))
  
  # update p_H depending on the extracted color
  if (i_color == 1){
    p_H <- (p_W_H*p_H)/dot(p_W_H, p_H)
    col <- 'W'
    
  }else if(i_color == 2){
    p_H <- (p_B_H*p_H)/dot(p_B_H, p_H)
    col <- 'B'
  }
  
  # update the dataframe with the probabilities
  prob_H_steps[i,] <- append(p_H, list(col))
  i <- i+1
}

# print on the standard output the probability of selecting each box
cat('Probability of selecting each box:\n\n')
## Probability of selecting each box:
print(p_H)
## [1] 0.0000e+00 2.2769e-10 3.3574e-05 2.2053e-02 9.7791e-01 0.0000e+00
cat('\n')
# plot function (same as in exercise 4)
hc_plot <- function(i_H){
  hc <- prob_H_steps %>% 
    mutate_if(is.numeric, round, digits=4) 
    
  hc <- {if(i_H == 0){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_0, group=color), color=col)
    }else if(i_H == 1){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_1, group=color), color=col)
    }else if(i_H == 2){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_2, group=color), color=col)
    }else if(i_H == 3){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_3, group=color), color=col)
    }else if(i_H == 4){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_4, group=color), color=col)
    }else if(i_H == 5){hc %>% hchart('point', hcaes(x=1:dim(prob_H_steps)[1], y=H_5, group=color), color=col)}} %>%
    hc_title(text=paste('H', i_H, sep=''), align='center') %>% 
    hc_xAxis(title = list(text = 'Extraction step')) %>%
    hc_yAxis(title = list(text=paste('P(H', i_H, ')', sep='')), min=0, max=1) 
}

# check if the extracted stones are of one color only and update plot color list accordingly
if (length(unique(prob_H_steps$color))==1){
  if (unique(prob_H_steps$color)=='W'){
    col <- c('steelblue')
  }else if (unique(prob_H_steps$color)=='B'){
    col <- c('firebrick')
  }
}else{
  col <- c('firebrick', 'steelblue')
}

# plot
hw_grid(hc_plot(0), hc_plot(1), hc_plot(2), hc_plot(3), hc_plot(4), hc_plot(5), ncol=3, rowheight=350)

Bibliography

[1]
G. D’Agostini, Probability, propensity and probabilities of propensities (and of probabilities), https://arxiv.org/pdf/1612.05292.pdf
[2]
G. D’Agostini, More lessons from the six box toy experiment, https://arxiv.org/pdf/1701.01143.pdf