############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-07-28
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 6.4.1 - 6.4.4
#
# last updated: 2022-08-04

############################################################################

### 6.4.2: What Goes Wrong in High Dimensions?

# simulate data like in Figure 6.22 (left panel)

set.seed(1234)

n <- 20
x <- runif(n, -2, 2)
y <- 0 + 5 * x + rnorm(n, 0, 4)

plot(x, y, pch=19, cex=1.5, col="red")

res <- lm(y ~ x)
summary(res)
abline(res, lwd=5)
resid(res)

# simulate data like in Figure 6.22 (right panel)

set.seed(1299)

n <- 2
x <- runif(n, -2, 2)
y <- 0 + 5 * x + rnorm(n, 0, 4)

plot(x, y, pch=19, cex=1.5, col="red", xlim=c(-2,2), ylim=c(-10,10))

res <- lm(y ~ x)
summary(res)
abline(res, lwd=5)
resid(res)

# since n is equal to the number of regression coefficients (intercept plus
# slope), the model fits perfectly, R^2 = 1, and all of the residuals are
# equal to 0

############################################################################

# simulate data where n = 20, p = 20, and none of the predictors are actually
# related to the outcome variable

set.seed(1319)

n <- 20
p <- 20

X <- replicate(p, runif(n, -2, 2))
y <- 0 + rnorm(n)

# simulate a large test dataset based on the same model

n.test <- 100000
X.test <- replicate(p, runif(n.test, -2, 2))
y.test <- 0 + rnorm(n.test)

# fit regression models with 0, 1, 2, ..., p predictors, get R^2, the MSE, and
# the test MSE for these models (and also the adjusted R^2, AIC, and BIC)

R2 <- rep(NA, p+1)
MSE <- rep(NA, p+1)
MSE.test <- rep(NA, p+1)
R2.adj <- rep(NA, p+1)
AIC <- rep(NA, p+1)
BIC <- rep(NA, p+1)

for (i in 0:p) {

   if (i == 0) {
      res <- lm(y ~ 0)
   } else {
      res <- lm(y ~ 0 + X[,1:i])
   }

   R2[i+1] <- summary(res)$r.squared
   MSE[i+1] <- mean((y - fitted(res))^2)
   R2.adj[i+1] <- summary(res)$adj.r.squared
   AIC[i+1] <- AIC(res)
   BIC[i+1] <- BIC(res)

   if (i == 0) {
      pred.test <- rep(0, n.test)
   } else {
      pred.test <- X.test[,1:i,drop=FALSE] %*% coef(res)
   }

   MSE.test[i+1] <- mean((y.test - pred.test)^2)

}

# like Figure 6.23 (but also include adjusted R^2, AIC, and BIC)

par(mfrow=c(3,2), mar=c(5,4.5,2,2))

plot(0:p, R2, type="l", xlab="Number of Variables", ylab=expression(R^2),
     xlim=c(0,20), lwd=3)
plot(0:p, MSE, type="l", xlab="Number of Variables", ylab="Training MSE",
     xlim=c(0,20), lwd=3)
plot(0:p, MSE.test, type="l", xlab="Number of Variables", ylab="Test MSE",
     xlim=c(0,20), lwd=3)
plot(0:p, R2.adj, type="l", xlab="Number of Variables", ylab=expression(R[adj]^2),
     xlim=c(0,20), lwd=3)
plot(0:p, AIC, type="l", xlab="Number of Variables", ylab="AIC",
     xlim=c(0,20), lwd=3)
plot(0:p, BIC, type="l", xlab="Number of Variables", ylab="BIC",
     xlim=c(0,20), lwd=3)

# note: with AIC and BIC, 'smaller is better' (i.e., models with smaller
# AIC/BIC values strike a better balance between fit and complexity)

# note: the adjusted R^2 actually works well here, suggesting that the model
# without any predictors is best; the AIC and BIC values also initially
# increase (again suggesting that no predictors should be included), but when
# p gets close to n, these values suddenly drop, giving the false impression
# that including many predictors is appropriate (so one would have to
# restrict the choice of models to those where p does not get too close to
# n); in any case, these statistics no longer work for model selection when
# n = p or p > n
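# as a quick numerical check of the note above (just a sketch based on the
# vectors computed earlier), we can ask each criterion directly which model
# size it would pick; the '- 1' turns the position in the vectors (which
# start at the model with 0 predictors) back into the number of predictors

which.max(R2.adj) - 1   # model size favored by the adjusted R^2 (larger is better)
which.min(AIC) - 1      # model size favored by the AIC (smaller is better)
which.min(BIC) - 1      # model size favored by the BIC (smaller is better)
which.min(MSE.test) - 1 # for comparison: model size with the lowest test MSE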
############################################################################

### 6.4.3: Regression in High Dimensions

# conduct a simulation similar to what is described in this section for the
# case where p = 50 (it isn't entirely clear from the description in the book
# what the authors did, so the code below is my attempt to create a similar
# simulation; also, we will include forward stepwise selection as an
# alternative model selection strategy)

# note: the code below takes a while to run, so be patient ...

# install and load the glmnet package

#install.packages("glmnet")
library(glmnet)

# install and load the leaps package

#install.packages("leaps")
library(leaps)

set.seed(5678)

iters <- 1000

n <- 100
p <- 50
beta <- 0.2

mat.dfs <- matrix(NA, nrow=iters, ncol=p+1)
mat.MSE.test <- matrix(NA, nrow=iters, ncol=p+1)
mat.MSE.test.fws <- matrix(NA, nrow=iters, ncol=p)

pbar <- txtProgressBar(min=0, max=iters, style=3)

for (i in 1:iters) {

   setTxtProgressBar(pbar, i)

   # simulate training data
   X <- replicate(p, runif(n, -2, 2))
   y <- 0 + c(X[,1:20] %*% rep(beta,20)) + rnorm(n)

   # simulate test data
   n.test <- 10000
   X.test <- replicate(p, runif(n.test, -2, 2))
   y.test <- 0 + c(X.test[,1:20] %*% rep(beta,20)) + rnorm(n.test)

   # use glmnet() to do lasso regression
   res <- glmnet(X, y)

   # note: glmnet() provides lasso fits for many different values of lambda
   # (from very strong shrinkage to essentially none)

   # compute predicted values in the test data for each lambda value
   pred <- predict(res, newx=X.test)

   # for every lambda value, compute the test MSE
   MSE.test <- apply(pred, 2, function(y.pred) mean((y.test - y.pred)^2))

   # extract the 'degrees of freedom' (i.e., the number of non-zero
   # coefficients) corresponding to the various lambda values that glmnet()
   # used
   dfs <- res$df

   # since multiple lambda values can lead to the same degrees of freedom
   # value, take for each dfs value the smallest corresponding test MSE
   # (i.e., the best test MSE among the lambda values with that number of
   # non-zero coefficients) and store this in the appropriate column of the
   # mat.MSE.test matrix
   udfs <- unique(dfs)
   mat.dfs[i,udfs+1] <- udfs
   tmp <- tapply(MSE.test, dfs, min)
   mat.MSE.test[i,udfs+1] <- tmp

   # use forward stepwise selection in the training data
   res <- regsubsets(x=X, y=y, nvmax=p, method="forward")

   # for each level of complexity, find the best model in the training data
   # and compute the test MSE based on that model
   for (j in 1:p) {
      incl <- summary(res)$which[j,][-1]
      fit <- lm(y ~ X[,incl])
      y.pred <- cbind(1,X.test[,incl]) %*% coef(fit)
      mat.MSE.test.fws[i,j] <- mean((y.test - y.pred)^2)
   }

}

close(pbar)

ylims <- c(min(cbind(mat.MSE.test, mat.MSE.test.fws), na.rm=TRUE),
           max(cbind(mat.MSE.test, mat.MSE.test.fws), na.rm=TRUE))

# like Figure 6.24 (middle panel) and also draw the analogous figure for
# forward stepwise selection

par(mfrow=c(1,2))

boxplot(mat.MSE.test, cex=0.5, xlab="Degrees of Freedom", ylab="Test MSE",
        ylim=ylims, main="Lasso")
boxplot(mat.MSE.test.fws, cex=0.5, xlab="Degrees of Freedom", ylab="Test MSE",
        ylim=ylims, main="Stepwise Selection")

# conclusions: we know that the true model has 20 predictors; however, the
# median test MSE based on lasso regression is lowest for models with a
# higher complexity (around 30-35 non-zero coefficients); on the other hand,
# the median test MSE based on forward stepwise selection is lowest for
# models with around 20 predictors, which is appropriate; however, the test
# MSE is generally lower when using lasso regression and more variable for
# forward stepwise selection
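# as a small numerical check of the conclusions above (a sketch based on the
# matrices filled in during the simulation), we can compute the median test
# MSE per model size and see where it is lowest for each approach; note that
# for the lasso, column j of mat.MSE.test corresponds to j-1 non-zero
# coefficients, while for forward stepwise selection, column j of
# mat.MSE.test.fws corresponds to j predictors

med.lasso <- apply(mat.MSE.test,     2, median, na.rm=TRUE)
med.fws   <- apply(mat.MSE.test.fws, 2, median, na.rm=TRUE)
which.min(med.lasso) - 1 # non-zero coefficients at the lowest median test MSE (lasso)
which.min(med.fws)       # predictors at the lowest median test MSE (stepwise)
min(med.lasso, na.rm=TRUE)
min(med.fws)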
# note: when running the code above with n <- 1000 (i.e., with a larger
# training dataset), forward stepwise selection outperforms lasso regression
# (the median test MSE is lowest for models with 20 predictors and is lower
# at that point than the lowest median test MSE of lasso regression, which
# still suggests a model with too high a complexity)

############################################################################
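# side note (not part of the simulation above): in the simulation, we picked,
# for each degrees of freedom value, the lambda with the lowest *test* MSE,
# which is not something one could do in practice; below is a minimal sketch
# of how one would typically choose lambda via 10-fold cross-validation with
# cv.glmnet() from the glmnet package, using the training and test data that
# are still in the workspace from the last iteration of the simulation loop

res.cv <- cv.glmnet(X, y)
res.cv$lambda.min                # value of lambda with the lowest CV error
b <- as.vector(coef(res.cv, s="lambda.min"))
sum(b[-1] != 0)                  # number of non-zero coefficients (excluding the intercept)
pred <- predict(res.cv, newx=X.test, s="lambda.min")
mean((y.test - pred)^2)          # test MSE for the model selected via cross-validation

############################################################################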