############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-05-04 # # Topic(s): # - Regression and Other Stories (https://avehtari.github.io/ROS-Examples/) # - Section(s): 2.3 # # last updated: 2023-05-05 ############################################################################ ### 2.3: All graphs are comparisons ## Simple scatterplots # download the dataset download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/HealthExpenditure/data/healthdata.txt", destfile="healthdata.txt") # read in the data dat <- read.table("healthdata.txt", header=TRUE) # inspect the first 6 rows of the dataset head(dat) # Figure 2.4 plot(dat$spending, dat$lifespan, type="n", bty="l", xlim=c(0,7500), xaxs="i", xlab="Health care spending (PPP US$)", ylab="Life expectancy (years)") text(dat$spending, dat$lifespan, dat$country) # in the book, they left out some countries from the figure for clarity, so # let's do the same thing and remove the countries shown below from dat dat <- dat[!is.element(dat$country, c("Netherlands", "Belgium", "Germany", "Ireland", "Iceland", "Greece", "Italy", "Sweden", "UK")),] # Figure 2.4 plot(dat$spending, dat$lifespan, type="n", bty="l", xlim=c(0,7500), xaxs="i", xlab="Health care spending (PPP US$)", ylab="Life expectancy (years)") text(dat$spending, dat$lifespan, dat$country) ## Displaying more information on a graph # the dataset for this example is not available on the book website; one # possibility would be to use digitizing software to manually extract the data # from Figure 2.5, but this would be quite tedious given the large number of # points (and multiple points may overlap, which we might not recognize from # the figure) # the book repo contains a pdf of Figure 2.5, which we can download download.file("https://github.com/avehtari/ROS-Examples/raw/master/Redistricting/redistrict.pdf", destfile="redistrict.pdf") # since a pdf is essentially just a text file that describes (using the # PostScript language: https://en.wikipedia.org/wiki/PostScript) what should # be displayed in the pdf, we can open up this pdf file with a text editor and # copy the relevant information about the position of the points to a dataset; # however, it takes a bit of extra effort to transform this information into a # dataset that we make use of in R # after doing so, we can construct the following dataset dat <- structure(list(x = c(0.007, 0.001, 0.031, 0.031, 0.037, 0.022, 0.057, 0.039, -0.021, -0.006, -0.021, -0.019, -0.018, -0.024, -0.047, -0.047, 0.012, -0.021, -0.025, 0.016, 0.031, 0.004, 0.019, -0.007, 0.002, -0.007, -0.005, 0.045, 0.037, 0.005, 0.048, -0.011, -0.003, -0.007, 0.014, 0.012, 0.003, 0.03, 0.014, -0.041, -0.016, -0.013, -0.021, -0.032, -0.039, -0.057, -0.023, -0.004, 0.013, 0.032, 0.053, -0.008, 0.001, 0.008, -0.002, 0.003, 0.015, -0.002, 0.002, -0.004, 0.036, 0.028, 0.003, 0.013, -0.02, -0.019, -0.025, -0.052, -0.052, -0.022, -0.015, 0.04, 0.053, 0.066, -0.057, 0.009, 0.025, 0.063, 0.051, 0.012, 0.063, 0.058, -0.008, -0.007, -0.029, -0.016, 0.003, -0.027, -0.022, -0.02, 0.009, 0.04, 0.066, 0.055, 0.021, 0.069, -0.026, -0.005, -0.015, 0.012, 0.005, -0.022, -0.036, -0.06, -0.014, 0.019, 0.021, 0.047, 0.043, 0.021, 0.011, 0.041, 0.024, -0.019, -0.006, -0.07, 0.058, -0.015, -0.016, -0.021, 0.001, -0.002, -0.006, -0.015, -0.003, 0.05, -0.05, -0.034, 0.03, -0.002, -0.049, -0.014, -0.003, 0, 0.015, 0.015, -0.006, -0.014, 0.009, 0.042, -0.019, -0.014, 0.058), y = c(-0.01, 0.019, 0.021, 0.026, 0.03, 0.046, 0.029, 0.021, -0.006, -0.011, -0.009, -0.008, -0.011, -0.037, -0.037, -0.039, -0.002, -0.028, 0.013, 0.028, 0.021, 0.016, -0.01, 0.029, -0.002, -0.005, 0.045, 0.037, -0.002, 0.048, -0.011, -0.011, -0.012, 0.009, 0.007, -0.008, 0.025, 0.01, 0.023, -0.037, -0.002, 0.009, -0.02, -0.028, -0.046, -0.026, -0.025, 0.006, 0.026, 0.046, 0.043, -0.006, 0.002, 0.013, 0.007, 0.019, 0.002, -0.002, 0, 0.003, 0.019, -0.005, 0.007, -0.028, -0.027, 0.047, -0.055, -0.057, -0.027, -0.02, 0.01, 0.048, 0.061, 0.061, -0.083, 0.012, 0.05, 0.038, 0.044, 0.05, 0.045, 0.059, 0, -0.023, -0.01, 0.009, -0.008, -0.016, -0.014, 0.003, 0.023, 0.049, 0.038, 0.025, 0.052, 0.035, -0.01, -0.006, 0.02, 0.013, -0.026, -0.028, -0.051, -0.015, -0.02, 0.014, 0.04, 0.036, 0.051, 0.004, 0.01, 0.012, 0.001, -0.01, 0.006, -0.004, 0, 0.014, 0.005, -0.013, -0.024, -0.007, -0.019, -0.007, -0.001, -0.014, -0.056, -0.013, 0.02, 0.005, -0.002, -0.005, -0.009, 0.029, 0.005, 0.036, 0, -0.021, -0.007, 0.005, 0.004, 0.013, -0.021), grp = rep(1:4, times = c(111,7,10,15))), row.names = c(NA, 143L), class = "data.frame") # note: for grp, the coding is: 1 = no redistricting, 2 = bipartisan # redistricting, 3 = Republican redistricting, 4 = Democratic redistricting # Figure 2.5 par(mar=c(5,10,4,10)) plot(dat$x, dat$y, lwd=3.8, xlim=c(-.08,.07), ylim=c(-.08,.07), pch=c(15,21,4,19)[dat$grp], cex=c(0.35,1.3,1,1.3)[dat$grp], xlab="Estimated partisan bias in previous election", ylab="Estimated partisam bias\n(adjusted for state)") axis(side=2, at=par("usr")[1:2], las=1, tick=FALSE, labels=c("(favors Republicans)", "(favors Democrats)")) # turn grp into a factor dat$grpfac <- factor(dat$grp, levels=1:4, labels=c("no redistricting", rep("redistricting", 3))) # create a variable that is equal to 1 for grp=4, -1 for grp=3, and 0 otherwise dat$bipartdev <- 0 dat$bipartdev[dat$grp == 4] <- 1 dat$bipartdev[dat$grp == 3] <- -1 # fit a regression model similar to what the authors must have fitted res <- lm(y ~ x + bipartdev + x:grpfac, data=dat) summary(res) # add the regression lines for the 4 groups to the plot xs <- seq(par("usr")[1], par("usr")[2], length=100) pred <- predict(res, newdata=data.frame(x=xs, bipartdev=0, grpfac="no redistricting")) lines(xs, pred) mtext(side=4, at=tail(pred, 1), "no redistricting", las=1, line=0.4) pred <- predict(res, newdata=data.frame(x=xs, bipartdev=0, grpfac="redistricting")) lines(xs, pred) mtext(side=4, at=tail(pred, 1), "bipartisan redistrict", las=1, line=0.4) pred <- predict(res, newdata=data.frame(x=xs, bipartdev=-1, grpfac="redistricting")) lines(xs, pred) mtext(side=4, at=tail(pred, 1), "Rep. redistrict", las=1, line=0.4) pred <- predict(res, newdata=data.frame(x=xs, bipartdev=1, grpfac="redistricting")) lines(xs, pred) mtext(side=4, at=tail(pred, 1), "Dem. redistrict", las=1, line=0.4) # the results are not exactly identical (I had to guess what kind of model the # authors have fitted), but the general pattern is the same ## Multiple plots # download the data download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Names/data/allnames_clean.csv", destfile="allnames_clean.csv") # read in the data dat <- read.csv("allnames_clean.csv") # inspect the first 6 lines and first 8 columns dat[1:6, 1:8] # only keep the boys in the dataset dat <- dat[dat$sex == "M",] # add the last letter of each name as a new variable to the dataset dat$lastletter <- substr(dat$name, nchar(dat$name), nchar(dat$name)) # turn lastletter into a factor with all letters as factor levels dat$lastletter <- factor(dat$lastletter, levels=letters) # function to create figures like 2.6 and 2.7 plotletters <- function(x) { # get the total frequency of each last letter in 1906 agg <- aggregate(x, list(letter = dat$lastletter), sum, drop=FALSE) # in case a letter never occurred, the resulting frequency will be NA, so # turn this into a 0 agg$x[is.na(agg$x)] <- 0 # create the barplot plot(1:26, agg$x/sum(agg$x)*100, type="h", lwd=6, ylab="Percentage of boys born", xaxt="n", bty="n", xlab="") par(xpd=TRUE) mtext(letters, side=1, at=1:26) } # Figure 2.6 par(mar=c(3,4,4,2)) plotletters(dat$X1906) title("Last letter of boys' names in 1906") # Figure 2.7 par(mfrow=c(1,2), mar=c(3,4,4,2)) plotletters(dat$X1956) title("Last letter of boys' names in 1956") plotletters(dat$X2006) title("Last letter of boys' names in 2006") par(mfrow=c(1,1), mar=c(5,4,2,2)) # get the position of variables that start with X followed by either 1 or 2 posyearvars <- grep("^X[1,2]", names(dat)) # for each variable from 'dat' that is such a variable, do the aggregation of # the frequencies for the same last letter, and turn these frequencies into # percentages, and return the resulting data frame; the resulting object will # be a matrix, with each row corresponding to a letter and each column # corresponding to a year agg <- sapply(dat[posyearvars], function(x) { agg <- aggregate(x, list(letter = dat$lastletter), sum, drop=FALSE) agg$x[is.na(agg$x)] <- 0 agg$x / sum(agg$x) * 100 }) # add the letters as row names rownames(agg) <- letters # inspect the first three columns for the resulting matrix agg[,1:3] # get the actual year values from the column names as a numeric variable years <- as.numeric(substr(colnames(agg), 2, 5)) years # Figure 2.8 (but using colors for letters d, n, y instead of line types) plot(years, rep(NA,length(years)), ylim=c(0,40), xlab="", bty="l", ylab="Percentage of all boys' names that year", yaxt="n") axis(side=2, at=c(0,20,40), labels=c("0%","20%","40%")) apply(agg, 1, function(y) lines(years, y, col="gray70")) lines(years, agg["d",], col="dodgerblue", lwd=4) lines(years, agg["n",], col="firebrick", lwd=4) lines(years, agg["y",], col="forestgreen", lwd=4) text(1936, agg["d","X1936"], "D", pos=3) text(1982, agg["n","X1982"], "N", pos=3) text(1976, agg["y","X1976"], "Y", pos=3) title("Last letters of boys' names") # read in the data again dat <- read.csv("allnames_clean.csv") # using a data frame that only contains boys' names and the year variables, do # the following for every year (i.e., every column): find the position of the # top 10 most frequent names, sum up the corresponding frequencies, and turn # this into a percentage out of the total frequency for that year perc.top10.b <- sapply(dat[dat$sex == "M",posyearvars], function(x) { pos.top10 <- order(x, decreasing=TRUE)[1:10] sum(x[pos.top10]) / sum(x) * 100 }) # do the same for the girls' names perc.top10.g <- sapply(dat[dat$sex == "F",posyearvars], function(x) { pos.top10 <- order(x, decreasing=TRUE)[1:10] sum(x[pos.top10]) / sum(x) * 100 }) # Figure 2.9 plot(years, rep(NA,length(years)), ylim=c(0,45), xlab="Year", bty="l", ylab="Percentage", yaxt="n") axis(side=2, at=c(0,20,40), labels=c("0%","20%","40%")) lines(years, perc.top10.b, lwd=3) lines(years, perc.top10.g, lwd=3) text(1930, perc.top10.b["X1930"], "Boys", pos=3) text(1930, perc.top10.g["X1930"], "Girls", pos=1) title("Total popularity of top 10 names each year, by sex") ## Grids of plots # download the data download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/1948.asc", destfile="1948.asc") download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/1950.asc", destfile="1950.asc") download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/1968.asc", destfile="1968.asc") download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/1970.asc", destfile="1970.asc") download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/1988.asc", destfile="1988.asc") download.file("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Congress/data/1990.asc", destfile="1990.asc") prepdat <- function(year) { # read in the data dat <- data.frame(year, read.table(paste0(year, ".asc"))) # give the variables in the dataset proper variable names names(dat) <- c("year", "statecode", "unknown", "inc", "votedem", "voterep") # based on the statecode variable, recreate the region variable dat$region <- floor(dat$statecode/20) + 1 # compute the percentage of votes for the democratic candidate dat$votedemperc <- with(dat, 100 * votedem / (votedem + voterep)) # return the dataset return(dat) } plotpair <- function(year1, year2, xaxis=FALSE, showregion=FALSE) { # read in the data for two years using prepdat() dat1 <- prepdat(year1) dat2 <- prepdat(year2) # combine the relevant variables from the two datasets into a single dataset dat <- data.frame(region = dat1$region, inc = abs(dat1$inc), dvote1 = dat1$votedemperc, dvote2 = dat2$votedemperc) # create the contested and voteswing variables dat$contested <- with(dat, abs(dvote1 - 50) < 30 & abs(dvote2 - 50) < 30) dat$voteswing <- with(dat, dvote2 - dvote1) # keep only rows where inc is -1, 0, or 1 and when it is a contested election dat <- dat[dat$inc %in% c(-1, 0, 1) & dat$contested,] # region names regions <- c("Northeast", "Midwest", "South", "West") # for each region, create a scatterplot for (i in 1:4) { plot(NA, NA, xlim=c(20,80), ylim=c(-35,30), xaxt="n", yaxt="n", bty="n", xlab="", ylab="") abline(h=0) with(dat[dat$inc == 1 & dat$region==i,], points(dvote1, voteswing, pch=19, col="gray60", cex=0.5)) with(dat[dat$inc == 0 & dat$region==i,], points(dvote1, voteswing, pch=19, col="black", cex=0.7)) if (xaxis) { axis(side=1, at=c(25,50,75), labels=c("25%","50%","75%")) mtext(side=1, "Dem. vote in election 1", line=2.5, cex=0.7) } axis(side=2, at=c(-25,0,25), labels=c("-25%","0%","25%")) if (showregion) mtext(side=3, regions[i], line=2) if (i == 1) { mtext(side=2, paste0(year1, "\nto\n", year2), line=7, las=1, adj=0.5) mtext(side=2, "Vote swing", line=3, cex=0.7) } } } # Figure 2.10 par(mfrow=c(3,4), mar=c(1,2,0,2), oma=c(3,8,5,0)) plotpair(1948, 1950, showregion=TRUE) plotpair(1968, 1970) plotpair(1988, 1990, xaxis=TRUE) ############################################################################