############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-05-16
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 9.1 - 9.3
#
# last updated: 2024-12-13

############################################################################

### 9.1: Propagating uncertainty in inference using posterior simulations

# download the dataset for the example (run once)
#download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/ElectionsEconomy/data/hibbs.dat", destfile="hibbs.dat")

# read in the data
dat <- read.table("hibbs.dat", header=TRUE)

# inspect the dataset
dat

# plot the data
plot(dat$growth, dat$vote, cex=1.2, pch=21, bg="gray",
     xlab="Average recent growth in personal income",
     ylab="Incumbent party's vote share", bty="l", panel.first=grid())

# load the rstanarm package
library(rstanarm)

# fit a linear regression model using stan_glm() (setting the seed of the
# random number generator to make the results fully reproducible)
set.seed(1237)
res <- stan_glm(vote ~ growth, data=dat, refresh=0)
res

# extract the simulated draws from the posterior distributions for the
# intercept, slope, and error standard deviation as a data frame
post <- as.data.frame(res)
head(post)

# compute the median and mad sd values for the three columns
tab <- cbind(median = sapply(post, median), mad = sapply(post, mad))
tab

## Uncertainty in the regression coefficients and implied uncertainty in the regression line

# plot histograms and superimposed kernel density estimates of the three
# posterior distributions (similar to Figure 9.1) and add arrows around the
# median plus/minus one and two mad sds

par(mfrow=c(3,1))

titles <- c("Intercept", "Slope", "Sigma")

for (i in 1:3) {
   hist(post[[i]], main=titles[i], freq=FALSE, breaks=80)
   abline(v=median(post[[i]]), lwd=6)
   arrows(tab[i,1] - 1*tab[i,2], par("usr")[4]*.40,
          tab[i,1] + 1*tab[i,2], par("usr")[4]*.40, code=3, length=0.15, lwd=2)
   arrows(tab[i,1] - 2*tab[i,2], par("usr")[4]*.20,
          tab[i,1] + 2*tab[i,2], par("usr")[4]*.20, code=3, length=0.15, lwd=2)
   lines(density(post[[i]]), lwd=3)
}

par(mfrow=c(1,1))

# Figure 9.2(a): scatterplot of the posterior draws for the intercept and slope
plot(post[[1]], post[[2]], pch=19, cex=0.2, xlab="Intercept", ylab="Slope", bty="l")

# we can also plot the draws for all three parameters against each other
pairs(post, pch=19, cex=0.2)

# Figure 9.2(b) but also highlight the regression line based on the median intercept and slope
plot(dat$growth, dat$vote, type="n",
     xlab="Average recent growth in personal income",
     ylab="Incumbent party's vote share", bty="l")
apply(post[1:100,], 1, function(x) abline(a=x[1], b=x[2], col="gray40"))
abline(a=tab[1,1], b=tab[2,1], lwd=6)
points(dat$growth, dat$vote, pch=21, bg="black", col="white", lwd=3)

# we can also draw all 4000 lines, using alpha blending (i.e., making the
# lines semi-transparent) to indicate which lines are more or less plausible
plot(dat$growth, dat$vote, type="n",
     xlab="Average recent growth in personal income",
     ylab="Incumbent party's vote share", bty="l")
apply(post, 1, function(x) abline(a=x[1], b=x[2], col=rgb(0,0,0,.05)))
abline(a=tab[1,1], b=tab[2,1], lwd=6)
points(dat$growth, dat$vote, pch=21, bg="black", col="white", lwd=3)
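
# as a small addition (a sketch, not part of the book's code; the object names
# 'xs.grid', 'fitmat', 'ci.lb', and 'ci.ub' are made up for this illustration),
# we can also summarize the implied uncertainty in the regression line by
# computing, for a grid of growth values, the 2.5% and 97.5% quantiles of the
# fitted values across all posterior draws and adding the resulting pointwise
# interval band to the scatterplot
xs.grid <- seq(-1, 5, length=100)
fitmat  <- sapply(xs.grid, function(x) post[[1]] + post[[2]] * x)
ci.lb   <- apply(fitmat, 2, quantile, probs=0.025)
ci.ub   <- apply(fitmat, 2, quantile, probs=0.975)
plot(dat$growth, dat$vote, cex=1.2, pch=21, bg="gray",
     xlab="Average recent growth in personal income",
     ylab="Incumbent party's vote share", bty="l")
polygon(c(xs.grid, rev(xs.grid)), c(ci.lb, rev(ci.ub)), col=rgb(0,0,0,.15), border=NA)
abline(a=tab[1,1], b=tab[2,1], lwd=3)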
## Using the matrix of posterior simulations to express uncertainty about a
## parameter estimate or function of parameter estimates

# for illustration purposes only, compute the ratio intercept / slope
adivb <- post[[1]] / post[[2]]

# this yields the posterior distribution for this derived parameter, which we
# can again summarize via the median and mad sd
c(median = median(adivb), mad = mad(adivb))

# plot the kernel density estimate of the posterior for this derived parameter
plot(density(adivb))

# there are a few very extreme values, so zoom in on where most of the mass of
# the posterior distribution is
plot(density(adivb), xlim=c(0,50))

############################################################################

### 9.2: Prediction and uncertainty: predict, posterior_linpred, and posterior_predict

## Point prediction using predict

# note that coef() gives the median of the sampled values for the intercept
# and slope and we could use these values to make point predictions
coef(res)
tab

# so we can make a point prediction manually as follows
coef(res)[[1]] + coef(res)[[2]] * 2.0

# but this is not exactly what predict() does
x_new <- data.frame(growth=2.0)
y_point_pred <- predict(res, newdata=x_new)
y_point_pred

# predict() actually computes the predicted values based on all regression
# lines and then takes the mean of those values
mean(post[[1]] + post[[2]] * 2.0)

# careful: there is some inconsistent behavior of predict() when specifying
# 'newdata' versus when not specifying this; consider the first data point
head(dat, 1)

# predict(res) gives the predicted values for the given data, so the following
# gives the predicted value when growth = 2.4
predict(res)[1]

# this also gives the predicted value
predict(res, newdata=data.frame(growth=2.4))

# note that they are slightly different! this is because predict(res) uses the
# median coefficient values to compute the predicted value
coef(res)[[1]] + coef(res)[[2]] * 2.4

# but as noted above, when specifying 'newdata', predict() computes all
# predicted values (based on the various intercept and slope values) and then
# takes the mean; this is quite confusing/inconsistent behavior (although the
# difference is negligible here)

## Linear predictor with uncertainty using posterior_linpred or posterior_epred

# posterior_linpred() does the same thing as predict(), but gives the
# individual predictions (based on the various intercept and slope values)
y_linpred <- posterior_linpred(res, newdata=x_new)
head(y_linpred)

# taking their mean again gives the same value as predict(res, newdata=x_new)
mean(y_linpred[,1])

# note that posterior_linpred() does not show this inconsistent behavior when
# specifying 'newdata' versus not; so using posterior_linpred() is preferable
head(posterior_linpred(res, newdata=data.frame(growth=2.4)))
head(posterior_linpred(res)[,1,drop=FALSE])

# now we can also visualize the posterior for the predicted value
plot(density(y_linpred[,1]))

# and compute other summary statistics
median(y_linpred[,1])
sd(y_linpred[,1])
mad(y_linpred[,1])

## Predictive distribution for a new observation using posterior_predict

# note that above, we are making a prediction for the mean of y when x = 2.0,
# but now we want to make a prediction for an individual election; we can do
# so with the posterior_predict() function
y_pred <- posterior_predict(res, newdata=x_new)
head(y_pred)

# so this yields the posterior distribution for the predicted value of y for
# an individual election, which we can again summarize in various ways
mean(y_pred[,1])
sd(y_pred[,1])
median(y_pred[,1])
mad(y_pred[,1])

# while the mean (or median) values are essentially the same for the predicted
# mean and the predicted value of an individual election, the uncertainty in
# predicting the result of an individual election is much larger
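
# as a rough check (a sketch, not part of the book's code), the standard
# deviation of the predictive distribution should be approximately equal to
# the square root of the sum of the variance of the linear predictor and the
# residual variance; we can also summarize the predictive distribution with a
# 95% interval based on its quantiles
sqrt(sd(y_linpred[,1])^2 + median(post[[3]])^2)
sd(y_pred[,1])
quantile(y_pred[,1], probs=c(0.025, 0.975))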
# note that running posterior_predict() multiple times yields different values
head(posterior_predict(res, newdata=x_new))
head(posterior_predict(res, newdata=x_new))

# visualize the posterior for the predicted value of an individual election
# and also add the posterior for the mean
plot(density(y_pred[,1]), col="dodgerblue", lwd=3, ylim=c(0,.40), main="", bty="l")
lines(density(y_linpred[,1]), col="firebrick", lwd=3)
legend("topright", inset=.02, lwd=3, col=c("firebrick", "dodgerblue"),
       legend=c("Posterior for E(y|x=2.0)", "Posterior for y|x=2.0"))

# do the prediction of y|x=2.0 manually
y_pred_manual <- post[[1]] + post[[2]] * 2.0 + rnorm(nrow(post), mean=0, sd=post[[3]])
head(y_pred_manual)

# also add the posterior based on our manual calculations
lines(density(y_pred_manual), col="forestgreen", lwd=3)

# we see that this is essentially identical to what posterior_predict() is
# doing, but because the additional uncertainty in predicting y is random,
# there are minor discrepancies

# we can derive further quantities from the posterior of the predicted value
# of y when x=2.0; for example, the posterior distribution for y being greater
# than 50% (which is of course a dichotomous variable)
table(y_pred[,1] > 50)

# and this distribution we can again summarize
mean(y_pred[,1] > 50)

# so the chance of winning the election is around 71% when x=2.0; this was in
# fact the economic growth value leading up to the 2016 presidential election
# when Hillary Clinton (the candidate of the incumbent party) ran against
# Donald Trump; according to the model, she had high chances of winning, but
# we all know how that one turned out ...
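
# as a small extension (a sketch, not part of the book's code; the object
# names 'growth.grid', 'pred.grid', and 'prob.win' are made up for this
# illustration), we can compute the implied probability of winning in the same
# way for a whole range of growth values
growth.grid <- data.frame(growth=seq(-2.0, 4.0, by=0.5))
pred.grid <- posterior_predict(res, newdata=growth.grid)
prob.win <- apply(pred.grid, 2, function(y) mean(y > 50))
data.frame(growth=growth.grid$growth, prob.win=round(prob.win, 2))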
## Prediction given a range of input values

# predictions for x = -2.0, -1.5, ..., 4.0
new_grid <- data.frame(growth=seq(-2.0, 4.0, 0.5))
y_point_pred_grid <- predict(res, newdata=new_grid)
y_linpred_grid <- posterior_linpred(res, newdata=new_grid)
y_pred_grid <- posterior_predict(res, newdata=new_grid)

# examine the predicted values
y_point_pred_grid
head(y_linpred_grid)
head(y_pred_grid)

## Propagating uncertainty

# simulate the uncertainty in economic growth values (note: we use much more
# uncertainty here compared to the value in the book)
x_new <- rnorm(nrow(post), mean=2.0, sd=0.9)

# and make use of these in making a prediction of y
y_pred <- post[[1]] + post[[2]] * x_new + rnorm(nrow(post), mean=0, sd=post[[3]])

# show that this adds additional uncertainty to the posterior distribution
lines(density(y_pred), col="orange", lwd=3)

# note: we really have to bump up the SD for x_new to actually see this
# increase in uncertainty (with an SD of 0.3 as in the book, the difference is
# not really noticeable)

## Simulating uncertainty for the linear predictor and new observations

# instead of using the earnings dataset, we stick to the same dataset above
# and illustrate the part that is new, namely equations 9.1 and 9.2, which are
# used when fitting a model with lm()

res2 <- lm(vote ~ growth, data=dat)
summary(res2)

# we can make the two different types of predictions using predict()
x_new <- data.frame(growth=2.0)
predict(res2, newdata=x_new, interval="confidence") # this uses (9.1)
predict(res2, newdata=x_new, interval="prediction") # this uses (9.2)

############################################################################

### 9.3: Prior information and Bayesian synthesis

## Expressing data and prior information on the same scale

# note: we already touched on the concepts of this section in section 1.5
# (which we discussed during the stream on 2023-03-23)

## Bayesian information aggregation

theta_hat_prior <- 0.524
se_prior <- 0.041
n <- 400
y <- 190
theta_hat_data <- y/n
se_data <- sqrt(theta_hat_data*(1-theta_hat_data)/n)
theta_hat_bayes <- (theta_hat_prior/se_prior^2 + theta_hat_data/se_data^2) / (1/se_prior^2 + 1/se_data^2)
se_bayes <- sqrt(1/(1/se_prior^2 + 1/se_data^2))
theta_hat_bayes
se_bayes

# Figure 9.3
xs <- seq(0.35, 0.7, length=1000)
plot(xs, dnorm(xs, mean=theta_hat_prior, sd=se_prior), type="l", lty="dotted",
     bty="n", yaxt="n", yaxs="i", xlab=expression(theta), ylab="", ylim=c(0,20))
lines(xs, dnorm(xs, mean=theta_hat_data, sd=se_data), lty="dashed")
lines(xs, dnorm(xs, mean=theta_hat_bayes, sd=se_bayes))
text(0.44, dnorm(0.44, mean=theta_hat_data, sd=se_data), pos=2, "Likelihood")
text(0.58, dnorm(0.58, mean=theta_hat_prior, sd=se_prior), pos=4, "Prior")
text(0.50, dnorm(0.50, mean=theta_hat_bayes, sd=se_bayes), pos=4, "Posterior")

# this is identical to what happens in a meta-analysis if we treat the prior
# as if it was a study; we can do this for example with the metafor package

# install.packages("metafor")
library(metafor)
res <- rma(c(theta_hat_prior,theta_hat_data), sei=c(se_prior,se_data), method="EE")
print(res, digits=3)

# we can do the same as above using stan_glm(); first we create our dataset as
# if we had done a poll in 400 people, where y = 1 indicates that a person
# will vote for the Democratic candidate and y = 0 for the Republican
# candidate
dat <- data.frame(y = sample(rep(c(1,0), times=c(190,210))))
table(dat$y)
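
# quick check (a sketch, not part of the book's code): the proportion of 1s in
# the simulated dataset and the corresponding standard error reproduce the
# theta_hat_data and se_data values computed above
mean(dat$y)
sqrt(mean(dat$y) * (1 - mean(dat$y)) / nrow(dat))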
# we can analyze the data using a linear regression model; this may appear a
# bit unusual for a dichotomous outcome, but such a 'linear probability model'
# (https://en.wikipedia.org/wiki/Linear_probability_model) often yields
# similar results as a logistic regression model and, by using this model, we
# directly model the proportion of votes for the Democratic candidate as is
# done in the example above
res <- lm(y ~ 1, data=dat)
summary(res)

# note that the intercept is equal to the proportion and the standard error is
# the same as we calculated above (except that here the SE is computed based
# on (n-1), while we divided by n above in computing se_data)

# now we fit the same model using stan_glm(), specifying the prior as we did above
res <- stan_glm(y ~ 1, data=dat,
                prior_intercept=normal(location=theta_hat_prior, scale=se_prior),
                refresh=0)
res

# extract the sampled values for the posterior of the intercept (and sigma)
post <- as.data.frame(res)

# plot the posterior distribution for the intercept (this looks very much like
# the posterior we have drawn above in Figure 9.3)
plot(density(post[,1]))

# the mean (or median) also corresponds very closely to our manual calculation
mean(post[,1])
median(post[,1])
theta_hat_bayes

# similarly, the SD (or MAD SD) is very close to the SE we computed above
sd(post[,1])
mad(post[,1])
se_bayes

# now fit the same model, but using the default priors
res <- stan_glm(y ~ 1, data=dat, refresh=0)
res
post <- as.data.frame(res)

# the default prior for the intercept is vague, so the mean (or median) of the
# posterior is now almost identical to the observed proportion (i.e., the
# prior has very little influence on the posterior distribution)
mean(post[,1])
median(post[,1])
theta_hat_data

# and the SD (or MAD SD) of the posterior distribution is almost identical to
# the SE of the observed proportion
sd(post[,1])
mad(post[,1])
se_data

############################################################################