############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2021-04-08
#
# Topic(s):
# - fundamentals of regression modeling with R (part 3)
#
# last updated: 2021-04-09
############################################################################
# what we covered so far (in the last 2 streams):
#
# - simple regression model with a quantitative predictor
# - simple regression model with a categorical predictor
# - changing the 'reference level' in such a model
# - what happens when you 'remove the intercept' in such a model
# - linear contrasts using linearHypothesis() from the 'car' package
# (e.g., to compare different groups against each other)
# - regression model with a quantitative and a categorical predictor
# - what happens when you 'remove the intercept' in such a model
# - linear contrasts using linearHypothesis() from the 'car' package
# - the model as above but with an interaction between the two predictors
# - how to 'parameterize' the model such that we directly get the intercepts
# and slopes of the three groups
# - contrast the model above with the approach where we fit three separate
# simple regression models (one per group)
# - how R^2 is computed for models with and without an intercept term
############################################################################
# read in the data
dat <- read.delim("data_survey_edit.txt", na.strings="", as.is=TRUE)
# look at first 6 rows of the dataset
head(dat)
# to make things a bit easier below, keep only participants with complete data
# on the variables pss, source, rses, sex, posaff, and negaff
dat <- dat[complete.cases(dat[c("pss", "source", "rses", "sex", "posaff", "negaff")]),]
# collapse the source variable down to a smaller number of levels
table(dat$source)
dat$source <- ifelse(dat$source %in% c("children", "family", "friendships", "spouse/partner"), "interpers", dat$source)
dat$source <- ifelse(dat$source %in% c("health/illness", "lack of time", "life in general", "money/finances"), "other", dat$source)
table(dat$source)
############################################################################
# a model with two categorical predictors
res <- lm(pss ~ source + sex + source:sex, data=dat)
summary(res)
# or we can use the shortcut notation
res <- lm(pss ~ source*sex, data=dat)
summary(res)
# - intercept: the average PSS value for females when source = 'interpers'
# - sourceother and sourcework: the difference in the average PSS value when
# source = 'other' or 'work' compared to source = 'interpers' for females
# - sexmale: the difference in the average PSS value between males and females
# when source = 'interpers'
# - sourceother:sexmale and sourcework:sexmale: the difference between males
# and females with respect to the difference in the average PSS value when
# source = 'other' or 'work' compared to source = 'interpers' (hence, for
# example, if I want to know how 'other' differs from 'interpers' for males,
# I need to compute: 0.7036 + -2.0880 = -1.3844)
# fit the same model, but parameterize it in such a way that we directly get
# the estimated average PSS value for each of the 6 groups
res <- lm(pss ~ 0 + source:sex, data=dat)
summary(res)
# note: 25.0600 - 26.4444 = -1.3844 is the difference between source = 'other'
# and source = 'interpers' for males (as we also calculated above, but here
# this calculation is a lot more intuitive)
# install (if necessary) and load the 'car' package
#install.packages("car")
library(car)
# test all pairwise contrasts between the source levels for the females
linearHypothesis(res, hypothesis.matrix="sourceother:sexfemale - sourceinterpers:sexfemale")
linearHypothesis(res, hypothesis.matrix="sourcework:sexfemale - sourceinterpers:sexfemale")
linearHypothesis(res, hypothesis.matrix="sourcework:sexfemale - sourceother:sexfemale")
# test all pairwise contrasts between the source levels for the males
linearHypothesis(res, hypothesis.matrix="sourceother:sexmale - sourceinterpers:sexmale")
linearHypothesis(res, hypothesis.matrix="sourcework:sexmale - sourceinterpers:sexmale")
linearHypothesis(res, hypothesis.matrix="sourcework:sexmale - sourceother:sexmale")
# how to do a Bonferroni correction for these 6 pairwise contrasts; could look
# into the 'multcomp' package which may be able to do this in a concise way;
# but we can also just collect the 6 p-values in a vector and then feed this to
# the p.adjust() function to do the Bonferroni correction 'manually'
pvals <- c(linearHypothesis(res, hypothesis.matrix="sourceother:sexfemale - sourceinterpers:sexfemale")$"Pr(>F)"[2],
linearHypothesis(res, hypothesis.matrix="sourcework:sexfemale - sourceinterpers:sexfemale")$"Pr(>F)"[2],
linearHypothesis(res, hypothesis.matrix="sourcework:sexfemale - sourceother:sexfemale")$"Pr(>F)"[2],
linearHypothesis(res, hypothesis.matrix="sourceother:sexmale - sourceinterpers:sexmale")$"Pr(>F)"[2],
linearHypothesis(res, hypothesis.matrix="sourcework:sexmale - sourceinterpers:sexmale")$"Pr(>F)"[2],
linearHypothesis(res, hypothesis.matrix="sourcework:sexmale - sourceother:sexmale")$"Pr(>F)"[2])
p.adjust(pvals, method="bonferroni")
# note: the 'unadjusted' p-values are so large to begin with that we just get
# p = 1 for all these adjusted p-values with the Bonferroni correction
############################################################################
# a model with a three-way interaction (of two categorical predictors and one
# numerical predictor)
res <- lm(pss ~ source*sex*rses, data=dat)
summary(res)
# note: one could interpret each coefficient in the manner described above,
# but this gets quite tedious, especially for the three-way interaction terms
# (e.g., the coefficient for sourceother:sexmale:rses indicates how different
# the difference between source = 'other' and source = 'interpers' is for
# males compared to females with respect to the slope of the relationship
# between rses and pss)
# we can parameterize the model in such a way that we directly get the 6
# intercepts and 6 slopes for each of the 6 groups
res <- lm(pss ~ 0 + source:sex + source:sex:rses, data=dat)
summary(res)
# now it is easy to test linear contrasts (e.g., test if the slope of the
# relationship between rses and pss differs for source = 'interpers' and
# source = 'work' for females)
linearHypothesis(res, hypothesis.matrix="sourceinterpers:sexfemale:rses - sourcework:sexfemale:rses")
############################################################################
# model with two quantitative predictors
res <- lm(pss ~ posaff + negaff, data=dat)
summary(res)
# scatterplot of pss versus posaff (with jittering)
set.seed(1234)
plot(jitter(pss, amount=0.4) ~ jitter(posaff, amount=0.4), data=dat,
xlab="Positive Affect", ylab="Perceived Stress Scale",
pch=19, cex=0.5, xlim=c(10,50), ylim=c(10,50))
# how can draw the regression line into the scatterplot for this model?
# compute the predicted average PSS value for posaff = 10 and posaff = 50 when
# negaff is equal to its mean
mean(dat$negaff)
24.32280 + -0.20631*10 + 0.48072*19.41687
24.32280 + -0.20631*50 + 0.48072*19.41687
coef(res)[1] + coef(res)[2] * 10 + coef(res)[3] * mean(dat$negaff)
coef(res)[1] + coef(res)[2] * 50 + coef(res)[3] * mean(dat$negaff)
# that gives us this line for the scatterplot
abline(a = coef(res)[1] + coef(res)[3] * mean(dat$negaff), b = coef(res)[2])
# this is sometimes called the 'marginal' regression line (also called the
# marginal association / marginal effect)
# note: if there are missing values in the data, then the subjects included in
# fitting the model above may differ from those with complete data on negaff;
# so, when computing the mean of negaff, we should only base this on those
# subjects actually included in the analysis; we can do this by computing the
# mean based on the 'model matrix'
abline(a = coef(res)[1] + coef(res)[3] * colMeans(model.matrix(res))[3], b = coef(res)[2])
# or maybe even better (so we don't have to count coefficient numbers)
abline(a = coef(res)["(Intercept)"] + coef(res)["negaff"] * colMeans(model.matrix(res))["negaff"], b = coef(res)["posaff"])
# instead of holding negaff at its mean, we could also hold it at one standard
# deviation below or above its mean
abline(a = coef(res)[1] + coef(res)[3] * (mean(dat$negaff) - sd(dat$negaff)), b = coef(res)[2], lty="dotted")
abline(a = coef(res)[1] + coef(res)[3] * (mean(dat$negaff) + sd(dat$negaff)), b = coef(res)[2], lty="dashed")
# can also use partial regression plots to illustrate the association between
# pss and posaff while 'controlling' for negaff
# https://en.wikipedia.org/wiki/Partial_regression_plot
res1 <- lm(pss ~ negaff, data=dat)
res2 <- lm(posaff ~ negaff, data=dat)
plot(residuals(res2), residuals(res1), pch=19, cex=0.5)
abline(a=0, b=coef(res)["posaff"])
# model with an interaction between two continuous predictors
res <- lm(pss ~ posaff*negaff, data=dat)
summary(res)
# scatterplot of pss versus posaff (with jittering)
set.seed(1234)
plot(jitter(pss, amount=0.4) ~ jitter(posaff, amount=0.4), data=dat,
xlab="Positive Affect", ylab="Perceived Stress Scale",
pch=19, cex=0.5, xlim=c(10,50), ylim=c(10,50))
# compute the predicted average PSS value for posaff = 10 and posaff = 50 when
# negaff is equal to its mean
20.469345 + -0.089754*10 + 0.659114*mean(dat$negaff) + -0.005543*mean(10*dat$negaff)
20.469345 + -0.089754*50 + 0.659114*mean(dat$negaff) + -0.005543*mean(50*dat$negaff)
# that would give us this line
abline(a = coef(res)[1] + coef(res)[3] * mean(dat$negaff), b = coef(res)[2] + coef(res)[4]*mean(dat$negaff))
# draw the lines when negaff is one standard deviation below or above its mean
abline(a = coef(res)[1] + coef(res)[3] * (mean(dat$negaff) - sd(dat$negaff)), b = coef(res)[2] + coef(res)[4]*(mean(dat$negaff) - sd(dat$negaff)), lty="dotted")
abline(a = coef(res)[1] + coef(res)[3] * (mean(dat$negaff) + sd(dat$negaff)), b = coef(res)[2] + coef(res)[4]*(mean(dat$negaff) + sd(dat$negaff)), lty="dashed")
# note: the lines are no longer parallel, since the slope of the relationship
# between pss and posaff depends on negaff in this model (since it includes an
# interaction between posaff and negaff)
# you might also consider checking out the emmeans package, which does this
# kind of stuff in a more automated manner
# https://cran.r-project.org/package=emmeans
############################################################################