############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-09-19
#
# Topic(s):
# - Statistical Rethinking (https://xcelab.net/rm/)
# - Section(s): 3.3 - 3.4
#
# last updated: 2024-09-24
############################################################################
### 3.3: Sampling to simulate prediction
## 3.3.1: Dummy data
# probability of observing W = 0, 1, or 2 water hits on two tosses of the
# globe when the true proportion of water is 0.7 (binomial distribution)
data.frame(W=0:2, prob=dbinom(0:2, size=2, prob=0.7))
# draw a single simulated value of W from this binomial distribution
rbinom(1, size=2, prob=0.7)
# draw ten simulated values of W
rbinom(10, size=2, prob=0.7)
# draw 100,000 values and tabulate how often each value of W occurred
dummy_w <- rbinom(1e5, size=2, prob=0.7)
tab <- table(dummy_w)
tab
# convert the counts into relative frequencies
prop.table(tab)
# repeat the simulation with nine tosses per draw
dummy_w <- rbinom(1e5, size=9, prob=0.7)
tab <- table(dummy_w)
tab
# Figure 3.5: frequency plot of the simulated counts
plot(tab, xlab="dummy water count", ylab="Frequency")
# with a finite number of simulated values it can happen (although with
# 100,000 draws the chance is vanishingly small) that some value of W never
# occurs; converting 'dummy_w' to a factor with all the levels it can take on
# (0 through 9) guarantees that every value shows up in the table and the
# plot, even when its frequency is 0
tab9 <- table(factor(dummy_w, levels=0:9))
plot(tab9, xlab="dummy water count", ylab="Frequency")
# load the rethinking package (the companion package to the book; it is not
# on CRAN -- see https://xcelab.net/rm/ for installation instructions)
library(rethinking)
# could also use the simplehist() function from the rethinking package to
# draw the same frequency plot as above
simplehist(dummy_w, xlab="dummy water count")
## 3.3.2: Model checking
# 3.3.2.1: Did the software work?
# skipped, since the software implementation is simple enough to be checked
# against analytic results; the book may come back to this idea in the
# context of a more complex example later on
# 3.3.2.2: Is the model adequate?
# recreate the grid approximation from chapter 2
p_grid <- seq(from=0, to=1, length.out=1000) # grid of candidate values for p
prob_p <- rep(1, 1000)                       # flat prior (all values of p equally likely)
prob_data <- dbinom(6, size=9, prob=p_grid)  # likelihood of 6 water in 9 tosses
posterior <- prob_data * prob_p              # unnormalized posterior
posterior <- posterior / sum(posterior)      # normalize so the values sum to 1
# plot the posterior distribution
plot(p_grid, posterior, type="l", lwd=4)
# the posterior peaks at the observed proportion W/N = 6/9
abline(v=6/9)
# for every value of p in the grid, compute the binomial distribution of the
# number of water observations in nine tosses; each column of 'mat'
# corresponds to one grid value of p, each row to a count 0:9
mat <- vapply(p_grid, function(p) dbinom(0:9, size=9, prob=p), numeric(10))
mat[,1:5]
# weight the distribution for each value of p by the posterior probability of
# that value of p; sweep() multiplies column j of 'mat' by posterior[j]
mat <- sweep(mat, 2, posterior, `*`)
mat[,1:5]
# averaging the weighted probabilities across rows (and rescaling so they sum
# to 1) yields the posterior predictive distribution for seeing 0:9 water
ppd <- rowMeans(mat)
ppd <- ppd / sum(ppd)
# Figure 3.6: plot of the posterior predictive distribution
plot(0:9, ppd, type="h", lwd=5, xlab="number of water samples",
     ylab="probability", xaxt="n", ylim=c(0,0.3))
axis(side=1, 0:9)
# using only the single most probable value of p according to the posterior
# (6/9) and ignoring the uncertainty about p understates the uncertainty we
# should have about new observations
lines(0:9 + 0.05, dbinom(0:9, size=9, prob=6/9), type="h", lwd=3, col="red")
# now suppose we only have 10,000 sampled values from the posterior
# distribution of p
samples <- sample(p_grid, prob=posterior, size=1e4, replace=TRUE)
# simulate 10,000 new data points, where each simulated observation uses the
# corresponding sampled value of p from the posterior
w <- rbinom(1e4, size=9, prob=samples)
head(w)
# tabulate the simulated values; turning w into a factor with levels 0:9
# guarantees that every possible count appears in the table, even when its
# frequency is 0
tab <- table(factor(w, levels=0:9))
tab
# rescale the counts to proportions
tab <- tab / sum(tab)
tab
# overlay these proportions on the figure
lines(0:9 - 0.05, tab, type="h", lwd=3, col="blue")
# add a legend
legend("topleft", inset=.01, lty=1, col=c("black","red","blue"), lwd=c(5,3,3),
       legend=c("posterior predictive distribution",
                "binomial(N=9, p=6/9)",
                "posterior predictive distribution simulations"))
# the simulated values from the posterior predictive distribution can be used
# to compute summaries, for example an 80% percentile interval
quantile(w, probs=c(.10, .90))
# simulate values from the PPD of the maximum run length: for each sampled
# value of p, generate a sequence of 9 individual tosses and record the
# longest run of identical outcomes
sim.maxrun <- vapply(samples, function(p) {
   tosses <- rbinom(9, size=1, prob=p)
   max(rle(tosses)$lengths)
}, numeric(1))
# turn the run lengths into a factor so that all possible values (1:9) appear
# in the frequency table, even those never observed
sim.maxrun <- factor(sim.maxrun, levels=1:9)
# frequency table of the simulated values, rescaled to proportions
tab <- table(sim.maxrun)
tab <- tab / sum(tab)
tab
# Figure 3.7 (left): plot the posterior predictive distribution
plot(1:9, c(tab), type="h", lwd=5, xlab="longest run length",
     ylab="probability", xaxt="n")
axis(side=1, 1:9)
# the sequence that was actually observed
wobs <- c(1,0,1,1,1,0,1,0,1)
# maximum run length in the observed sequence
maxrun <- max(rle(wobs)$lengths)
# redraw the corresponding line in the plot in blue
segments(maxrun, 0, maxrun, tab[as.character(maxrun)], lwd=8, col="#1e59ae")
# simulate values from the PPD of the number of switches (changes from water
# to land or vice versa) across the 9 tosses
sim.switches <- vapply(samples, function(p) {
   tosses <- rbinom(9, size=1, prob=p)
   length(rle(tosses)$lengths) - 1
}, numeric(1))
# turn the switch counts into a factor so that all possible values (0:8)
# appear in the frequency table, even those never observed
sim.switches <- factor(sim.switches, levels=0:8)
# frequency table of the simulated values, rescaled to proportions
tab <- table(sim.switches)
tab <- tab / sum(tab)
tab
# Figure 3.7 (right): plot the posterior predictive distribution
plot(0:8, c(tab), type="h", lwd=5, xlab="number of switches",
     ylab="probability", xaxt="n")
axis(side=1, 0:8)
# number of switches in the observed sequence
switches <- length(rle(wobs)$lengths) - 1
# redraw the corresponding line in the plot in blue
segments(switches, 0, switches, tab[as.character(switches)], lwd=8, col="#1e59ae")
############################################################################
# on p. 67-68, the book states that even if there is correlation between the
# observed values on consecutive throws of the globe, we should still obtain
# the correct proportion in the long run; this is not entirely obvious, so we
# conduct a little simulation to examine it: assume that the person throwing
# the globe does not spin it much on each throw, so when the globe lands and
# we examine the location at the very top, we will tend to see the same
# observation as on the previous throw (land and water are clustered, not
# randomly distributed across the globe); this induces positive correlation
# between consecutive throws

# helper: is location 'loc' (a vector c(pos1, pos2)) on water? land occupies
# only the upper-right quadrant, so a location is on water unless both
# coordinates are >= 0.5; returns 1 for water, 0 for land
on_water <- function(loc) as.integer(loc[2] < 0.5 || loc[1] < 0.5)

# draw a (flattened out) map of a fictitious earth where there is land in the
# upper right quadrant, so 3/4 of the planet is covered in water
plot(NA, xlim=c(0,1), ylim=c(0,1), xlab="pos1", ylab="pos2", xaxs="i", yaxs="i")
rect(0, 0, 1, 1, col="#1e59ae")
rect(0.5, 0.5, 1, 1, col="brown4")

# to simulate the process above, take 100,000 'steps' starting at a random
# location, where each step moves a small random amount horizontally and
# vertically; when we move beyond a border of the map, we wrap around to the
# opposite side (since we are technically moving around a globe); at each
# step, record whether we are on water
steps <- 100000
water <- rep(NA, steps)                     # 1 = water, 0 = land at each step
pos <- matrix(NA_real_, nrow=steps, ncol=2) # position at each step
draw <- rep(1, steps)                       # 0 = step wrapped around a border

# random starting position; record if we start on water
pos[1,] <- runif(2)
water[1] <- on_water(pos[1,])

for (i in 2:steps) {
   # move a random amount between -.05 and .05 in each direction
   pos[i,] <- pos[i-1,] + runif(2, -.05, .05)
   # flag steps that crossed a border, so the corresponding line segments can
   # be skipped when the path is drawn below
   if (any(pos[i,] < 0 | pos[i,] > 1))
      draw[i] <- 0
   # wrap around: since a step is at most 0.05, an overshoot is always less
   # than 1, so adding 1 (below 0) or subtracting 1 (above 1) maps the
   # position back onto the map
   pos[i,] <- pos[i,] + (pos[i,] < 0) - (pos[i,] > 1)
   # record if we are on water
   water[i] <- on_water(pos[i,])
}

# proportion of steps on water; despite the strong autocorrelation between
# consecutive observations, this is indeed approximately 0.75
mean(water)

# add the path to the map (blank out positions where we wrapped around a
# border, as otherwise segments would be drawn from one border straight
# across the map to the opposite side)
pos[draw == 0,] <- NA
segments(pos[1:(steps-1),1], pos[1:(steps-1),2], pos[2:steps,1], pos[2:steps,2])
# the following will show an 'animation' of the position moving around the
# map, but this will take quite some time to finish (it redraws the title on
# every one of the ~100,000 iterations)
plot(NA, xlim=c(0,1), ylim=c(0,1), xlab="pos1", ylab="pos2", xaxs="i", yaxs="i")
rect(0, 0, 1, 1, col="#1e59ae")
rect(0.5, 0.5, 1, 1, col="brown4")
for (i in 2:steps) {
segments(pos[i-1,1], pos[i-1,2], pos[i,1], pos[i,2])
# title() does not erase what was drawn before, so first overdraw the
# previous step counter with a solid white bar, then write the new counter
title(list(paste("▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇", i), col="white"))
title(paste("Step", i))
}
############################################################################