############################################################################
#
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2024-04-11
#
# Topic(s):
# - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/)
# - Section(s): 6.5 - 7.4
#
# last updated: 2024-04-25
#
############################################################################

### 6.5: The paradox of regression to the mean

# install the rstanarm package (need to do this once)
#install.packages("rstanarm")

# load the rstanarm package
library(rstanarm)

# download the dataset (only need to do this once)
#download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/PearsonLee/data/Heights.txt", destfile="heights.txt")

# read in the data
dat <- read.table("heights.txt", header=TRUE)

# inspect the dataset
head(dat)

# Figure 6.3a: mother's height versus daughter's height
# (with jittering to avoid points that overlap)
plot(jitter(daughter_height, amount=0.5) ~ jitter(mother_height, amount=0.5),
     data=dat, pch=19, cex=0.2, xlab="Mother's height (inches)",
     ylab="Adult daughter's height (inches)", bty="l")
grid()

# fit a simple linear regression model predicting the daughter's height
# based on the mother's height
res <- stan_glm(daughter_height ~ mother_height, data=dat)
res

# extract the coefficients (rounded to two decimal places)
round(coef(res), digits=2)

# add the regression line to the plot
abline(res, lwd=5)

# also add a diagonal line with a slope of 1
abline(a=0, b=1, lwd=5, lty="dotted")

# compare the variance of the mothers' and daughters' heights
var(dat$mother_height)
var(dat$daughter_height)

# so the variation in the daughters' heights is just as large as (or even a
# bit larger than) the variation in the mothers' heights

############################################################################

# a little simulation to examine what happens to the heights of women over
# many generations

set.seed(1234)

# heights of the mothers in the first generation
heights <- rnorm(10000, mean=62.5, sd=2.3)

generations <- 100

means <- rep(NA, generations)
sds   <- rep(NA, generations)

for (i in 1:generations) {

   # save the mean and SD of the height values
   means[i] <- mean(heights)
   sds[i]   <- sd(heights)

   # simulate the heights of the daughters (who, in the next iteration of
   # the loop, will then be the mothers)
   heights <- 30 + 0.5 * heights + rnorm(10000, mean=0, sd=2.3)

}

# it can be shown that the means converge to intercept / (1 - slope)
30 / (1 - 0.5)

# it can be shown that the SDs converge to sigma / sqrt(1 - slope^2)
2.3 / sqrt(1 - 0.5^2)
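# a short derivation of these limits (not from the book, but standard for
# this kind of autoregressive process): at the stationary distribution, the
# mean mu must satisfy mu = intercept + slope * mu, which yields
# mu = intercept / (1 - slope); likewise, the variance v must satisfy
# v = slope^2 * v + sigma^2, which yields v = sigma^2 / (1 - slope^2), and
# hence the SD is sigma / sqrt(1 - slope^2)

# as a quick numerical check, the last generation of the simulation should
# be close to these limits
means[generations]
sds[generations]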
# plot the means and SDs over the generations and add horizontal lines for
# the values to which the means and SDs will converge
par(mfrow=c(2,1))
plot(1:generations, means, type="o", pch=21, bg="gray", xlab="Generation",
     ylab="Mean(Height)",
     panel.first=abline(h = 30 / (1 - 0.5), lty="dotted", lwd=2),
     main="Plot of the Means over Generations")
plot(1:generations, sds, type="o", pch=21, bg="gray", xlab="Generation",
     ylab="SD(Height)",
     panel.first=abline(h = 2.3 / sqrt(1 - 0.5^2), lty="dotted", lwd=2),
     main="Plot of the SDs over Generations")
par(mfrow=c(1,1))

############################################################################

## How regression to the mean can confuse people about causal inference;
## demonstration using fake data

# simulate the data for the demonstration
set.seed(1234)
ability <- rnorm(1000, mean=50, sd=10)
midterm <- ability + rnorm(1000, mean=0, sd=10)
final   <- ability + rnorm(1000, mean=0, sd=10)
dat <- data.frame(midterm, final)
rm(ability, midterm, final)
head(dat)

# regression model predicting the final exam score from the midterm exam
# score
res <- stan_glm(final ~ midterm, data=dat)
res

# plot the midterm versus final exam scores and add the regression line
plot(dat$midterm, dat$final, pch=21, bg="gray", xlab="Midterm exam score",
     ylab="Final exam score", panel.last=grid())
abline(res, lwd=5)
abline(a=0, b=1, lwd=5, lty="dotted")
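# to see how this can fool people (a sketch, not from the book): students
# who scored low on the midterm improve on average on the final, while
# students who scored high decline on average, even though nothing happened
# between the two exams (no intervention, no change in ability); this is
# pure regression to the mean
mean(dat$final[dat$midterm < 40] - dat$midterm[dat$midterm < 40])
mean(dat$final[dat$midterm > 60] - dat$midterm[dat$midterm > 60])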
############################################################################

### 7.1: Example: predicting presidential vote share from the economy

# download the dataset (only need to do this once)
#download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/ElectionsEconomy/data/hibbs.dat", destfile="hibbs.dat")

# read in the data
dat <- read.table("hibbs.dat", header=TRUE)

# look at the data
dat

# plot growth on the x-axis versus vote on the y-axis (Figure 7.2a)
plot(vote ~ growth, data=dat, xlab="Average recent growth in personal income",
     ylab="Incumbent party's vote share", pch=NA, xaxt="n", yaxt="n",
     xlim=c(-0.5,4.5), ylim=c(43,62), bty="l")
abline(h=50, col="gray")
with(dat, text(growth, vote, year, pos=4))
axis(side=1, at=0:4, labels=paste0(0:4, "%"))
axis(side=2, at=c(45,50,55,60), labels=paste0(c(45,50,55,60), "%"))
title("Forecasting the election from the economy")

# fit the model using stan_glm() (first setting the seed of the random
# number generator to make the results fully reproducible)
set.seed(1237)
res <- stan_glm(vote ~ growth, data=dat)
res

# model fitting with stan_glm() involves some aspects that are random (to
# be discussed in further detail later on in the book); since the authors
# did not set the seed in the book, we cannot exactly reproduce their
# results, but by trying out different seed values, we can get the same
# results at least when rounded to a single digit

# plot growth versus vote and add the regression line (Figure 7.2b)
plot(vote ~ growth, data=dat, xlab="Average recent growth in personal income",
     ylab="Incumbent party's vote share", pch=19, xaxt="n", yaxt="n",
     xlim=c(-0.5,4.5), ylim=c(43,62), bty="l")
abline(h=50, col="gray")
points(vote ~ growth, data=dat, pch=19)
axis(side=1, at=0:4, labels=paste0(0:4, "%"))
axis(side=2, at=c(45,50,55,60), labels=paste0(c(45,50,55,60), "%"))
title("Data and linear fit")
abline(res, lwd=3)
text(3, coef(res)[1] + coef(res)[2]*3, pos=4, offset=2,
     paste0("y = ", formatC(coef(res)[1], digits=1, format="f"), " + ",
            formatC(coef(res)[2], digits=1, format="f"), " x"))

# estimated slope plus/minus one standard error and two standard errors
round(coef(res)[2] + c(-1,1) * 1 * se(res)[2], digits=1)
round(coef(res)[2] + c(-1,1) * 2 * se(res)[2], digits=1)

# highlight the point for the 2008 election
points(vote ~ growth, data=dat, subset=year==2008, pch=19, cex=2, col="red")
text(dat$growth[dat$year==2008], dat$vote[dat$year==2008], "2008",
     pos=1, cex=0.8)

## Graphing the fitted regression line

# note: we can just do abline(res) as shown above, so we do not have to use
# abline(coef(res)) as described in the book

## Using the model to predict

# based on the results obtained above, the predicted vote share for Clinton
# in 2016 was as follows (based on growth =~ 2% for the second term of Obama)
46.3 + 2 * 3.0

# Figure 7.3
xs <- seq(35, 70, length=1000)
ys <- dnorm(xs, mean=52.3, sd=3.9)
plot(xs, ys, type="l", bty="n", lwd=3, yaxs="i", yaxt="n",
     ylim=c(0,max(ys)+.005), xlab="Clinton share of the two-party vote",
     ylab="")
xs.sub <- seq(50, 70, length=1000)
ys.sub <- dnorm(xs.sub, mean=52.3, sd=3.9)
polygon(c(xs.sub,rev(xs.sub)), c(ys.sub,rep(0,length(xs.sub))), col="gray")
text(50, dnorm(50, mean=52.3, sd=3.9)/3,
     "Predicted\n72% chance\nof Clinton victory", pos=4)
lines(xs, ys, lwd=3)

# proportion of the area above 50 in a normal distribution with mean 52.3
# and standard deviation 3.9
pnorm(50, mean=52.3, sd=3.9, lower.tail=FALSE)
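# an alternative (a sketch, not from the book at this point): instead of
# plugging point estimates into a normal distribution, we can let rstanarm
# simulate the predictive distribution for growth = 2% directly with
# posterior_predict() and compute the proportion of simulated vote shares
# above 50%; this should be close to the pnorm() value above
pred <- posterior_predict(res, newdata=data.frame(growth=2))
mean(pred > 50)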
############################################################################

### 7.2: Checking the model-fitting procedure using fake-data simulation

## Step 1: Creating the pretend world

a <- 46.3
b <- 3.0
sigma <- 3.9
x <- dat$growth
n <- length(x)

## Step 2: Simulating fake data

set.seed(1234)
y <- a + b*x + rnorm(n, mean=0, sd=sigma)
fake <- data.frame(x, y)

## Step 3: Fitting the model and comparing fitted to assumed values

res <- stan_glm(y ~ x, data=fake)
res

# extract the coefficients and standard errors
b_hat <- coef(res)["x"]
b_se  <- se(res)["x"]

# check whether the 68% and 95% CIs include the true slope
cover_68 <- (b_hat - 1*b_se) < b && (b_hat + 1*b_se) > b
cover_95 <- (b_hat - 2*b_se) < b && (b_hat + 2*b_se) > b
cover_68
cover_95

## Step 4: Embedding the simulation in a loop

set.seed(1234)

n_fake <- 1000

cover_68 <- rep(NA, n_fake)
cover_95 <- rep(NA, n_fake)

pbar <- txtProgressBar(min=0, max=n_fake, style=3)

# note: the loop below can take around ~10 minutes to finish

for (s in 1:n_fake) {
   setTxtProgressBar(pbar, s)
   y <- a + b*x + rnorm(n, mean=0, sd=sigma)
   fake <- data.frame(x, y)
   res <- stan_glm(y ~ x, data=fake, refresh=0) # suppress output on console
   b_hat <- coef(res)["x"]
   b_se  <- se(res)["x"]
   cover_68[s] <- (b_hat - 1*b_se) < b && (b_hat + 1*b_se) > b
   cover_95[s] <- (b_hat - 2*b_se) < b && (b_hat + 2*b_se) > b
}

# check the coverage of the 68% and 95% CIs
mean(cover_68)
mean(cover_95)

# rerun the simulation using critical t-values

# note: for a 68% CI, we need the value, say t_crit, from a t-distribution
# under which 84% of the area falls, since that leaves 16% above and, since
# we construct a two-sided CI, a total of 32% will fall outside the interval
# (-t_crit, t_crit); similarly, for a 95% CI, we need the value under which
# 97.5% of the area falls

t_68 <- qt(0.84,  df=n-2)
t_95 <- qt(0.975, df=n-2)

cover_68 <- rep(NA, n_fake)
cover_95 <- rep(NA, n_fake)

pbar <- txtProgressBar(min=0, max=n_fake, style=3)

for (s in 1:n_fake) {
   setTxtProgressBar(pbar, s)
   y <- a + b*x + rnorm(n, mean=0, sd=sigma)
   fake <- data.frame(x, y)
   res <- stan_glm(y ~ x, data=fake, refresh=0) # suppress output on console
   b_hat <- coef(res)["x"]
   b_se  <- se(res)["x"]
   cover_68[s] <- (b_hat - t_68*b_se) < b && (b_hat + t_68*b_se) > b
   cover_95[s] <- (b_hat - t_95*b_se) < b && (b_hat + t_95*b_se) > b
}

# check the coverage of the 68% and 95% CIs
mean(cover_68)
mean(cover_95)

############################################################################

# note: as long as the fitted model corresponds to the model used to
# simulate the data, the coverage of the 68% and 95% CIs above will always
# be nominal (i.e., 0.68 and 0.95 within simulation error), so this
# simulation does *not* tell you whether the model you are using is
# appropriate for the given data; this will even be true if the actual
# data-generating mechanism is totally different from the model you are
# fitting

# for example, suppose there is a quadratic relationship between x and y;
# let's simulate some data of this type
set.seed(1234)
x <- runif(20, -1, 1)
y <- 1*x - 4*x^2 + rnorm(20, 0, 0.4)
plot(x, y, pch=21, bg="gray")

# now let's ignore the non-linearity in the relationship and fit a simple
# linear regression model to these data
res <- lm(y ~ x)
summary(res)
abline(res, lwd=3)

# do a simulation to examine the coverage of the 68% and 95% CIs using the
# fake-data simulation approach described in the book (here, we use lm(),
# since this speeds up the model fitting a lot and the results from lm()
# and stan_glm() would be very similar anyway)

a <- coef(res)[1]
b <- coef(res)[2]
sigma <- sigma(res)
n <- length(x)

t_68 <- qt(0.84,  df=n-2)
t_95 <- qt(0.975, df=n-2)

n_fake <- 10000

cover_68 <- rep(NA, n_fake)
cover_95 <- rep(NA, n_fake)

pbar <- txtProgressBar(min=0, max=n_fake, style=3)

for (s in 1:n_fake) {
   setTxtProgressBar(pbar, s)
   y.fake <- a + b*x + rnorm(n, mean=0, sd=sigma)
   res <- lm(y.fake ~ x)
   b_hat <- coef(res)["x"]
   b_se  <- sqrt(vcov(res)[2,2])
   cover_68[s] <- (b_hat - t_68*b_se) < b && (b_hat + t_68*b_se) > b
   cover_95[s] <- (b_hat - t_95*b_se) < b && (b_hat + t_95*b_se) > b
}

# check the coverage of the 68% and 95% CIs
mean(cover_68)
mean(cover_95)

# as we can see, the coverage is nominal even though the model we are
# fitting (and using for simulating the fake data) does not correspond at
# all to the true data-generating mechanism; so this method only tells you
# whether the statistical properties of the CI work as intended, but it
# cannot tell you whether the model you are fitting is 'correct'

# however, one *can* compare the simulated fake data with the actual data
# to see whether it seems reasonable that the actual data came from the
# assumed model; note that we can also use the simulate() function to do
# the fake-data simulation (we first refit the linear model to the actual
# data, since res was overwritten inside the loop above)
res <- lm(y ~ x)
y.fake <- simulate(res)
points(x, y.fake[,1], pch=21, bg="firebrick")

# the simulated fake data look totally different from the actual data; now
# let's fit a model that corresponds to the way we generated the actual data
res2 <- lm(y ~ x + I(x^2))
y.fake <- simulate(res2)
points(x, y.fake[,1], pch=21, bg="dodgerblue")

# the simulated fake data now look much more like the actual data, which
# gives some evidence that the fitted model might be a reasonable
# approximation to the true data-generating mechanism; this is a different
# type of model check than what is discussed in section 7.2 (we will see
# this come back later in the book)
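# an aside (a sketch, not from the book at this point): for models fitted
# with stan_glm(), this type of graphical check is provided by pp_check(),
# which compares the observed data with data simulated from the fitted
# model (a 'posterior predictive check'); the name res.stan below is just
# chosen for this illustration
res.stan <- stan_glm(y ~ x + I(x^2), data=data.frame(x, y), refresh=0)
pp_check(res.stan)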
############################################################################

### 7.3: Formulating comparisons as regression models

set.seed(1235)

# simulate 20 observations from a population with mean 2.0 and SD 5.0
n_0 <- 20
y_0 <- rnorm(n_0, mean=2.0, sd=5.0)
fake_0 <- data.frame(y_0)
y_0

# compute the mean and the standard error of the mean
mean(y_0)
sd(y_0) / sqrt(n_0)

# fit a regression model with just a 'constant term' (intercept)
res0 <- stan_glm(y_0 ~ 1, data=fake_0, prior_intercept=NULL, prior=NULL,
                 prior_aux=NULL, refresh=0)
res0

# simulate 30 observations from a population with mean 8.0 and SD 5.0
n_1 <- 30
y_1 <- rnorm(n_1, mean=8.0, sd=5.0)

# compute the mean difference and the SE of the mean difference
diff <- mean(y_1) - mean(y_0)
se_0 <- sd(y_0) / sqrt(n_0)
se_1 <- sd(y_1) / sqrt(n_1)
se <- sqrt(se_0^2 + se_1^2) # recall eq. 4.1
diff
se

# put the data from the two groups together into a data frame
y <- c(y_0, y_1)
x <- rep(c(0,1), times=c(n_0, n_1))
fake <- data.frame(x, y)

# fit a regression model with the dummy variable x as predictor
res <- stan_glm(y ~ x, data=fake, prior_intercept=NULL, prior=NULL,
                prior_aux=NULL, refresh=0)
res

# Figure 7.4
plot(x, y, pch=19, cex=0.8, bty="l", xlab="Indicator x", xaxp=c(0,1,1))
abline(h=tapply(y, x, mean), lty="dashed")
abline(res, lwd=3)
text(0.01, 1.5, expression(bar(y)[0] == 2.44), pos=4)
text(0.99, 7.9, expression(bar(y)[1] == 7.31), pos=2)
text(0.5, 3.9, paste0("y = ", formatC(coef(res)[1], digits=1, format="f"),
                      " + ", formatC(coef(res)[2], digits=1, format="f"),
                      " x"))

############################################################################
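# a closing cross-check (not from the book): the Welch two-sample t-test is
# based on exactly the standard error sqrt(se_0^2 + se_1^2) computed above,
# so its test statistic should equal diff / se
t.test(y_1, y_0)
diff / se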