############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2024-09-12 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 10.6 - 10.10 # # last updated: 2024-09-19 ############################################################################ ### 10.6: Example: uncertainty in predicting congressional elections ## Background # download the data for the example if (!file.exists("congress.csv")) download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/congress.csv", destfile="congress.csv") # read in the data and inspect the first 6 rows dat <- read.csv("congress.csv") head(dat) # for the histogram, create a copy of the 'v88' variable that treats # proportions below 10% or above 90% as 'uncontested' vote shares (i.e., # change the proportions to essentially 0 or 1, respectively) dat$v88_hist <- ifelse(dat$v88 < 0.1, 0.0001, ifelse(dat$v88 > 0.9, 0.9999, dat$v88)) # Figure 10.5: proportion of the vote share for the Democratic party in the # 435 congressional districts in 1988 (using the v88_hist variable) hist(dat$v88_hist, breaks=seq(0,1,by=.05), main="Congressional elections in 1988", xlab="Democratic share of the two-party vote") # Figure 10.6a: proportion of the vote share in 1988 versus 1986 with filled # circles when the incumbent is from the Democratic party, crosses when the # incumbent is from the Republican party, and unfilled circles otherwise plot(jitter(dat$v86, amount=.01), jitter(dat$v88, amount=.01), pch=c(4,1,19)[as.numeric(factor(dat$inc88))], xlab="Democratic vote share in 1986", ylab="Democratic vote share in 1988", panel.first=abline(0,1), xlim=c(0,1), ylim=c(0,1), main="Raw Data") legend("topleft", inset=.01, pch=c(19,4,1), legend=c("Incumbent is Democrat", "Incumbent is Republican", "Otherwise")) ## Data issues # Figure 10.6b: same plot as above, but using the adjusted data as described # in the book (either 0.25 or 0.75 for uncontested elections) plot(jitter(dat$v86_adj, amount=.01), jitter(dat$v88_adj, amount=.01), pch=c(4,1,19)[as.numeric(factor(dat$inc88))], xlab="Adjusted Dem. vote share in 1986", ylab="Adjusted Dem. vote share in 1988", panel.first=abline(0,1), xlim=c(0,1), ylim=c(0,1), main="Adjusted Data") legend("topleft", inset=.01, pch=c(19,4,1), legend=c("Incumbent is Democrat", "Incumbent is Republican", "Otherwise")) ## Fitting the model # load the rstanarm package library(rstanarm) # fit the regression model set.seed(1234) dat88 <- data.frame(vote=dat$v88_adj, past_vote=dat$v86_adj, inc=dat$inc88) res <- stan_glm(vote ~ past_vote + inc, data=dat88, refresh=0) print(res, digits=2) ## Simulation for inferences and predictions of new data points # extract the sampled values from the posteriors distributions of the parameters sims88 <- as.matrix(res) head(sims88) # histogram of sampled values for each parameter par(mfrow=c(2,2)) hist(sims88[,1], main=colnames(sims88)[1], breaks=30, xlab="") hist(sims88[,2], main=colnames(sims88)[2], breaks=30, xlab="") hist(sims88[,3], main=colnames(sims88)[3], breaks=30, xlab="") hist(sims88[,4], main=colnames(sims88)[4], breaks=30, xlab="") par(mfrow=c(1,1)) # summary statistics for the sampled values for each parameter round(apply(sims88, 2, median), digits=3) round(apply(sims88, 2, mean), digits=3) round(apply(sims88, 2, sd), digits=3) # based on the model fitted above, predict the vote in 1990 based on the vote # in 1988 using the 4000 sampled parameter values dat90 <- data.frame(past_vote=dat$v88_adj, inc=dat$inc90) pred90 <- posterior_predict(res, newdata=dat90) # so we get a 4000x435 matrix dim(pred90) ## Predictive simulation for a nonlinear function of new data # determine for how many of the 435 districts was the vote share for the # Democratic party above 50% (based on each of the 4000 sampled values) dems_pred <- rowSums(pred90 > 0.5) # compute summary statistics for this count mean(dems_pred) median(dems_pred) sd(dems_pred) # or examine the entire posterior distribution for this count (note: since the # outcome is a count, instead of a histogram, we make a barplot of the # frequencies of the counts between the minimum and maximum) barplot(table(factor(dems_pred, levels=min(dems_pred):max(dems_pred))), xlab="Number of Districts Won", ylab="Frequency") # we could fit the same model using 'regular' regression and also compute # predicted values for the 1990 election based on this model and then count # how many of those predicted values are larger than 50% res.lm <- lm(vote ~ past_vote + inc, data=dat88) pred90.lm <- predict(res.lm, newdata=dat90) sum(pred90.lm > 0.5) # we get essentially the same answer as above, but it would difficult to # derive a standard error for this value; using the Bayesian approach we used # above, we automatically get a measure of the uncertainty of this estimate ## Combining simulation and analytic calculations # suppose there are 1000 people in district 147 n147 <- 1000 # then based on the predicted values for the vote share in that district, we # can compute the chances of a tied vote (500 for the Democratic candidate, # 500 for the Republican one) as follows # number of cases where the vote is not tied versus tied table(round(pred90[,147] * n147) == n147/2) # proportion of cases where the vote is tied proptie <- prop.table(table(round(pred90[,147] * n147) == n147/2))[2] proptie # now suppose there are 50 people in district 147; then we get n147 <- 50 proptie <- prop.table(table(round(pred90[,147] * n147) == n147/2))[2] proptie # note: with 50 people, a vote share just above 0.49 to just below 0.51 is # undecided, since 0.49*50 = 24.5 and 0.51*50 = 25.5 # based on this, we can compute the chances of a tied vote if the district # actually had 1000 people with proptie / (0.02 * 1000) # this differs from what we obtained above because of sampling uncertainty; # these two would be very similar to each other if we had sampled more values # from the posterior (can confirm this by adding iter=200000 to the stan_glm() # call above and rerunning the code) # however, the number of people in districts is more around 750,000 n147 <- 750000 table(round(pred90[,147] * n147) == n147/2) # then we start running into the problem that a tie never happens in the # sampled values; we could increase the number of sampled values, but this # becomes computationally very demanding; instead, we can use our observation # above and estimate the chances of a tied vote with this proptie / (0.02 * 750000) # or let's pretend there are 10 people in the district (note then that a vote # share just above 0.45 to just below 0.55 is undecided, since 0.45*10 = 4.5 # and 0.55*10 = 5.5); then we can do the estimation of a tied vote as follows n147 <- 10 proptie <- prop.table(table(round(pred90[,147] * n147) == n147/2))[2] proptie / (0.1 * 750000) # instead, we could assume the proportion of vote shares for the district are # normally distributed, with the mean and sd we obtain from the sampled values mean147 <- mean(pred90[,147]) sd147 <- sd(pred90[,147]) mean147 sd147 # then we want to know for this distribution what area falls within this interval 0.5 - 1 / (2*750000) 0.5 + 1 / (2*750000) # we can obtain this as follows, which gives us a very similar estimate for # the chances of a tied vote as we obtained above pnorm(0.5 + 1 / (2*750000), mean=mean147, sd=sd147, lower.tail=TRUE) - pnorm(0.5 - 1 / (2*750000), mean=mean147, sd=sd147, lower.tail=TRUE) # note: since n=435 here, whether we use a normal or t-distribution for this # calculations makes essentially no difference # if we would use a t-distribution, then we get essentially the same answer pt((0.5 + 1 / (2*750000) - mean147) / sd147, df=435-3, lower.tail=TRUE) - pt((0.5 - 1 / (2*750000) - mean147) / sd147, df=435-3, lower.tail=TRUE) ############################################################################ ### 10.7: Mathematical notation and statistical inference # nothing to code here ############################################################################ ### 10.8: Weighted regression # not going to do any specific examples here either ############################################################################ ### 10.9: Fitting the same model to many datasets # download the data for the example if (!file.exists("nes.txt")) download.file("https://github.com/avehtari/ROS-Examples/raw/master/NES/data/nes.txt", destfile="nes.txt") # read in the data and inspect the first 6 rows dat <- read.delim("nes.txt", sep="") head(dat) # fit the regression model for 1972, 1976, ..., 2000, and extract the # coefficients and corresponding standard errors years <- seq(1972,2000,4) res <- lapply(years, function(x) { sub <- dat[dat$year == x,] res <- stan_glm(partyid7 ~ real_ideo + race_adj + factor(age_discrete) + educ1 + female + income, data=sub, refresh=0) rbind(coef=coef(res), se=se(res)) }) # examine the resulting object res # Figure 10.9 coef_names <- c("Intercept", "Ideology", "Black", "Age_30_44", "Age_45_64", "Age_65_up", "Education", "Female", "Income") par(mfrow=c(2,5), mar=c(3,4,4,2)) for (i in 1:9) { coefs <- sapply(res, function(x) x[1,i]) ses <- sapply(res, function(x) x[2,i]) lbs <- coefs - 0.67 * ses ubs <- coefs + 0.67 * ses plot(years, coefs, pch=19, xlab="", ylab="Coefficient", bty="l", ylim=range(c(0,c(lbs,ubs))), xaxt="n") axis(side=1, at=c(1972, 1986, 2000)) abline(h=0, lty="dashed") segments(years, lbs, years, ubs, lwd=2) title(coef_names[i]) } par(mfrow=c(1,1)) ############################################################################