############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2021-03-11
#
# Topic(s):
# - bootstrapping
# - permutation testing
#
# last updated: 2021-03-11

############################################################################

# read in data
dat <- read.delim("data_survey_edit.txt")

# look at first 6 rows of dataset
head(dat)

# number of rows
nrow(dat)

# to make things a bit easier below, keep only participants with complete data
# on the variables age, sex, pss, rses, and lotr
dat <- dat[complete.cases(dat[c("age", "sex", "pss", "rses", "lotr")]),]

# so we lose 5 participants
nrow(dat)

# mean stress level for female versus male participants
by(dat$pss, dat$sex, mean)

# mean difference
mean(dat$pss[dat$sex == "female"]) - mean(dat$pss[dat$sex == "male"])

# t-test of H0: true mean difference is 0
# https://en.wikipedia.org/wiki/T-test#Equal_or_unequal_sample_sizes,_similar_variances_(1/2_%3C_sX1/sX2_%3C_2)
t.test(pss ~ sex, data=dat, var.equal=TRUE)

############################################################################

# bootstrapping: https://en.wikipedia.org/wiki/Bootstrapping_(statistics)

# bootstrap the mean difference

set.seed(1234)

iters <- 10000
diffs <- rep(NA, iters)
n <- nrow(dat)

for (i in 1:iters) {

   ids <- sample(n, replace=TRUE)
   sub <- dat[ids,]
   diffs[i] <- mean(sub$pss[sub$sex == "female"]) - mean(sub$pss[sub$sex == "male"])

}

hist(diffs, breaks=50)
abline(v=mean(dat$pss[dat$sex == "female"]) - mean(dat$pss[dat$sex == "male"]), lwd=5)
mean(diffs)

ci <- quantile(diffs, c(.025, .975))
ci

abline(v=ci, lwd=3)

# histogram with kernel density estimate of bootstrap distribution superimposed

hist(diffs, breaks=50, freq=FALSE)
lines(density(diffs), lwd=5)

# let's use the boot package for this

library(boot)

bootfun <- function(data, ids) {

   sub <- data[ids,]
   mean(sub$pss[sub$sex == "female"]) - mean(sub$pss[sub$sex == "male"])

}

set.seed(1234)

sav <- boot(dat, statistic=bootfun, R=10000)
sav

hist(sav$t, breaks=50)
abline(v=sav$t0, lwd=5)

cis <- boot.ci(sav)
cis

abline(v=cis$bca[4:5], lwd=3)

# note: not exactly the same, but we can ignore the difference
cis$percent[4:5]
quantile(sav$t, c(.025, .975))

# or we can get the same percentile CI with this
quantile(sav$t, c(.025, .975), type=6)

############################################################################

# bootstrap CI for the standardized mean difference

bootfun <- function(data, ids) {

   sub <- data[ids,]
   pss.f <- sub$pss[sub$sex == "female"]
   pss.m <- sub$pss[sub$sex == "male"]
   varp <- ((length(pss.f)-1)*sd(pss.f)^2 + (length(pss.m)-1)*sd(pss.m)^2) / (nrow(sub) - 2)
   (mean(pss.f) - mean(pss.m)) / sqrt(varp)

}

set.seed(1234)

sav <- boot(dat, statistic=bootfun, R=10000)
sav

hist(sav$t, breaks=50)
abline(v=sav$t0, lwd=5)

cis <- boot.ci(sav)
cis

abline(v=cis$bca[4:5], lwd=3)

############################################################################

# bootstrap CI for cor(pss,rses) - cor(pss,lotr)

cor(dat$pss, dat$rses)
cor(dat$pss, dat$lotr)
cor(dat$pss, dat$rses) - cor(dat$pss, dat$lotr)

bootfun <- function(data, ids) {
   sub <- data[ids,]
   cor(sub$pss, sub$rses) - cor(sub$pss, sub$lotr)
}

set.seed(1234)

sav <- boot(dat, statistic=bootfun, R=10000)
sav

hist(sav$t, breaks=50)
abline(v=sav$t0, lwd=5)

cis <- boot.ci(sav)
cis

abline(v=cis$bca[4:5], lwd=3)

############################################################################

# permutation testing: https://en.wikipedia.org/wiki/Resampling_(statistics)#Permutation_tests

set.seed(1234)

iters <- 10000
diffs <- rep(NA, iters)

for (i in 1:iters) {

   sub <- dat
   sub$sex <- sample(sub$sex)
   diffs[i] <- mean(sub$pss[sub$sex == "female"]) - mean(sub$pss[sub$sex == "male"])

}

hist(diffs, breaks=50)
abline(v=0, lwd=5)
obsdiff <- mean(dat$pss[dat$sex == "female"]) - mean(dat$pss[dat$sex == "male"])
abline(v=obsdiff, lwd=3)

mean(diffs >= obsdiff)
mean(abs(diffs) >= abs(obsdiff))

# compare with t-test
t.test(pss ~ sex, data=dat, var.equal=TRUE)

############################################################################

# permutation testing in the context of a regression model

res <- lm(pss ~ age + sex + rses, data=dat)
summary(res)

set.seed(1234)

iters <- 10000
coefs <- matrix(NA, nrow=iters, ncol=4)

for (i in 1:iters) {

   sub <- dat
   sub$pss <- sample(sub$pss)
   tmp <- lm(pss ~ age + sex + rses, data=sub)
   coefs[i,] <- coef(tmp)

}

pairs(coefs, pch=16, cex=0.5)

pairs(coefs, pch=16, cex=0.5, col=rgb(0,0,0,.05))

# put histograms on the diagonal
panel.hist <- function(x, ...) {
   usr <- par("usr"); on.exit(par(usr))
   par(usr = c(usr[1:2], 0, 1.5))
   h <- hist(x, plot = FALSE, breaks = 50)
   breaks <- h$breaks; nB <- length(breaks)
   y <- h$counts; y <- y/max(y)
   rect(breaks[-nB], 0, breaks[-1], y, col = "lightgray")
}

pairs(coefs, pch=20, cex=0.5, col=rgb(0,0,0,.05), diag.panel=panel.hist)

mean(abs(coefs[,1]) >= abs(coef(res)[1]))
mean(abs(coefs[,2]) >= abs(coef(res)[2]))
mean(abs(coefs[,3]) >= abs(coef(res)[3]))
mean(abs(coefs[,4]) >= abs(coef(res)[4]))

# compare with summary(lm())
summary(res)

# to avoid a p-value of 0, can do a +1 correction
(sum(abs(coefs[,4]) >= abs(coef(res)[4])) + 1) / (iters + 1)

# a CI for R^2 using bootstrapping

bootfun <- function(data, ids) {

   sub <- data[ids,]
   tmp <- lm(pss ~ age + sex + rses, data=sub)
   summary(tmp)$r.squared

}

set.seed(1234)

sav <- boot(dat, statistic=bootfun, R=10000)
sav

hist(sav$t, breaks=50)
abline(v=sav$t0, lwd=5)

cis <- boot.ci(sav)
cis

abline(v=cis$bca[4:5], lwd=3)

# to see a skewed bootstrap distribution, rerun the above but remove rses from
# the model; then R^2 is quite small (~4%) and the bootstrap distribution is
# quite right-skewed

############################################################################