############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-08-04
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 6.5.1 - 6.5.3
#
# last updated: 2022-08-05

############################################################################

### 6.5.1: Subset Selection Methods

# install (if needed) and load the ISLR2 package

#install.packages("ISLR2")
library(ISLR2)

# copy the Hitters dataset to dat
dat <- Hitters

# examine the dataset in a spreadsheet-like view
View(dat)

# get the names of the variables in the dataset
names(dat)

# get the dimensions of the dataset (number of rows and columns)
dim(dat)

# number of players with missing salary value
sum(is.na(dat$Salary))

# examine some summary statistics for all variables
summary(dat)

# remove all rows where at least one variable is missing
dat <- na.omit(dat)

# recheck dimensions and the number of players with missing salary value
dim(dat)
sum(is.na(dat$Salary))

############################################################################

# install (if needed) and load the leaps package

#install.packages("leaps")
library(leaps)

############################################################################

# Best Subset Selection

# best subset selection predicting salary from all other variables in the
# dataset (going up to the model that includes all 19 predictors)
res <- regsubsets(Salary ~ ., data=dat, nvmax=19)

# note: could also set nvmax=Inf to automatically go up to the model that
# includes all predictors, regardless of how many there are

# note: this considers all possible models, of which there are 2^19

# examine the results
summary(res)

# the output shows what the 'best' model is (i.e., which predictors are
# included) for each model complexity (i.e., number of predictors included);
# 'best' here essentially means the model with the highest R^2 value for a
# given level of complexity

# examine the R^2 value for all 'best' models for each level of complexity (so
# the first value is the R^2 for the model that is best among all models with
# a single predictor, the second value is the R^2 for the model that is best
# among all models with two predictors, and so on)
summary(res)$rsq

# note: the R^2 values will always go up with model complexity, so they are
# not useful for selecting the overall 'best' model; instead, we can use
# statistics like the adjusted R^2, Mallows' Cp, and the BIC to compare the
# various 'best' models to find the one that is best overall

# examine the adjusted R^2 values for these models
summary(res)$adjr2

# copy the RSS, adjusted R^2, Cp, and BIC values into vectors
res.rss   <- summary(res)$rss
res.adjr2 <- summary(res)$adjr2
res.cp    <- summary(res)$cp
res.bic   <- summary(res)$bic

# largest adjusted R^2 and its position
max(res.adjr2)
which.max(res.adjr2)

# so according to the adjusted R^2, the model with 11 predictors is the best
# overall model

# get the regression coefficients for this model
coef(res, id=11)
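# as a quick check (not part of the code in the book), we can compute the
# adjusted R^2 values ourselves from the RSS values via the usual formula
# 1 - (RSS / (n - p - 1)) / (TSS / (n - 1)), where p is the number of
# predictors in a model; the values should match summary(res)$adjr2
n   <- nrow(dat)
tss <- sum((dat$Salary - mean(dat$Salary))^2)
p   <- 1:19
cbind(manual = 1 - (res.rss / (n - p - 1)) / (tss / (n - 1)), adjr2 = res.adjr2)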
# set up a 2x2 plot and then plot the RSS, adjusted R^2, Cp, and BIC values
# for all 'best' models for the different levels of complexity
par(mfrow=c(2,2))
plot(res.rss, xlab="Number of Variables", ylab="RSS",
     type="o", pch=19, xlim=c(0,20))
plot(res.adjr2, xlab="Number of Variables", ylab="Adjusted RSq",
     type="o", pch=19, xlim=c(0,20))
points(which.max(res.adjr2), max(res.adjr2), col="red", cex=2, pch=19)
plot(res.cp, xlab="Number of Variables", ylab="Cp",
     type="o", pch=19, xlim=c(0,20))
points(which.min(res.cp), min(res.cp), col="red", cex=2, pch=19)
plot(res.bic, xlab="Number of Variables", ylab="BIC",
     type="o", pch=19, xlim=c(0,20))
points(which.min(res.bic), min(res.bic), col="red", cex=2, pch=19)

# note: the RSS is also not useful for selecting the overall best model
# because it will always go down with increasing complexity

# plot which variables are included at each level of complexity and the
# corresponding R^2, adjusted R^2, Cp, and BIC values
par(mfrow=c(2,2))
plot(res, scale="r2")
plot(res, scale="adjr2")
plot(res, scale="Cp")
plot(res, scale="bic")
par(mfrow=c(1,1))

# we see that, according to the BIC, the model with 6 predictors is the
# overall best model; get the regression coefficients for this model
coef(res, id=6)

# get the full regression table for this model
X.best <- model.matrix(Salary ~ ., data=dat)[,names(coef(res, id=6))][,-1]
summary(lm(Salary ~ X.best, data=dat))

# note: the standard errors (and hence test statistics and p-values) given
# above do not properly reflect the fact that the data used in fitting this
# model were also used in selecting this model from many other models

############################################################################

# Forward and Backward Stepwise Selection

# forward stepwise selection
res <- regsubsets(Salary ~ ., data=dat, nvmax=19, method="forward")
summary(res)

# which model complexity is best overall according to the BIC
which.min(summary(res)$bic)

# get the corresponding regression coefficients
coef(res, id=6)

# backward stepwise selection
res <- regsubsets(Salary ~ ., data=dat, nvmax=19, method="backward")
summary(res)

# which model complexity is best overall according to the BIC
which.min(summary(res)$bic)

# get the corresponding regression coefficients
coef(res, id=8)

# note: an exhaustive search, forward selection, and backward selection are
# not guaranteed to lead to the same overall best model

############################################################################

# Choosing Among Models Using the Validation-Set Approach and Cross-Validation

# for full reproducibility, set the seed of the random number generator
set.seed(1)

# create variables to indicate which players are in the training/test datasets
train <- sample(c(TRUE, FALSE), nrow(dat), replace=TRUE)
test  <- !train

# number of people in the training (TRUE) and the test (FALSE) group
table(train)

# best subset selection using only the training data
res <- regsubsets(Salary ~ ., data=dat[train,], nvmax=19)
summary(res)

# set up the model matrix (with all 19 predictors) for the test data
X.test <- model.matrix(Salary ~ ., data=dat[test,])

# for the 19 'best' models of increasing complexity, compute the test MSE
test.error <- rep(NA, 19)

for (i in 1:19) {
   coefi <- coef(res, id=i)
   pred  <- X.test[,names(coefi)] %*% coefi
   test.error[i] <- mean((dat$Salary[test] - pred)^2)
}

# plot the test MSE for the models with increasing complexity
plot(test.error, xlab="Number of Variables", ylab="Test MSE",
     type="o", pch=19, xlim=c(0,20))
points(which.min(test.error), min(test.error), col="red", cex=2, pch=19)

# for which model complexity is the test MSE minimized
which.min(test.error)

# coefficients / variables for the best overall model
coef(res, id=7)
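# note: regsubsets() does not come with a predict() method, which is why we
# computed the predicted values manually above; as a sketch (along the lines
# of the lab in the book), these steps can be wrapped into a little helper
# function, which could then also be reused in the cross-validation below
predict.regsubsets <- function(object, newdata, id, ...) {
   form  <- as.formula(object$call[[2]])
   mat   <- model.matrix(form, newdata)
   coefi <- coef(object, id=id)
   mat[,names(coefi)] %*% coefi
}

# for example, this should reproduce the test MSE in test.error[7]
pred <- predict(res, dat[test,], id=7)
mean((dat$Salary[test] - pred)^2)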
# best subset selection using the full dataset going up to 7 predictors
res <- regsubsets(Salary ~ ., data=dat, nvmax=7)
summary(res)

# get the coefficients for the best 7 predictor model using the full dataset
coef(res, id=7)

# get the full regression table for this model
X.best <- model.matrix(Salary ~ ., data=dat)[,names(coef(res, id=7))][,-1]
summary(lm(Salary ~ X.best, data=dat))

# note: same issue with the standard errors as mentioned earlier

############################################################################

# choosing the overall best model using 10-fold cross-validation

k <- 10

set.seed(1)

folds <- sample(rep(1:k, length=nrow(dat)))

cv.errors <- matrix(NA, nrow=k, ncol=19)

for (j in 1:k) {

   res <- regsubsets(Salary ~ ., data=dat[folds != j,], nvmax=19)

   X.test <- model.matrix(Salary ~ ., data=dat[folds == j,])

   for (i in 1:19) {
      coefi <- coef(res, id=i)
      pred  <- X.test[,names(coefi)] %*% coefi
      cv.errors[j,i] <- mean((dat$Salary[folds==j] - pred)^2)
   }

}

test.error.cv <- apply(cv.errors, 2, mean)

# plot the test MSE for the models with increasing complexity
plot(test.error.cv, xlab="Number of Variables", ylab="Test MSE",
     type="o", pch=19, xlim=c(0,20))
points(which.min(test.error.cv), min(test.error.cv), col="red", cex=2, pch=19)

# for which model complexity is the test MSE minimized
which.min(test.error.cv)

# best subset selection using the full dataset going up to 10 predictors
res <- regsubsets(Salary ~ ., data=dat, nvmax=10)

# get the coefficients for the best 10 predictor model using the full dataset
coef(res, id=10)

# get the full regression table for this model
X.best <- model.matrix(Salary ~ ., data=dat)[,names(coef(res, id=10))][,-1]
summary(lm(Salary ~ X.best, data=dat))

# note: same issue with the standard errors as mentioned earlier

############################################################################

### 6.5.2: Ridge Regression and the Lasso

# install (if needed) and load the glmnet package

#install.packages("glmnet")
library(glmnet)

# create the model matrix (leaving out the intercept)
x <- model.matrix(Salary ~ ., data=dat)[,-1]

# put the outcome variable into y
y <- dat$Salary

# check that we get the same as OLS when lambda=0 (note: have to use a smaller
# value for the convergence threshold for this to give the same coefficients)
res <- glmnet(x, y, alpha=0, lambda=0, thresh=1e-20)
round(data.frame(glmnet = coef(res)[,1],
                 lm     = coef(lm(Salary ~ ., data=dat))), digits=5)

# ridge regression for various values of lambda
lambdas <- 10^seq(10, -2, length=100)
res <- glmnet(x, y, alpha=0, lambda=lambdas)

# coef(res) gives a matrix with the ridge regression coefficients where each
# column corresponds to one of the lambda values considered

# consider the 50th lambda value; get the corresponding coefficients and the
# l2 norm value (= the square root of the sum of the squared coefficients,
# leaving out the intercept)
res$lambda[50]
coef(res)[,50]
sqrt(sum(coef(res)[-1,50]^2))

# the same but for the 60th lambda value
res$lambda[60]
coef(res)[,60]
sqrt(sum(coef(res)[-1,60]^2))
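# to see this shrinkage across the entire lambda sequence, we can compute the
# l2 norm of the coefficients for every lambda value and plot these against
# log(lambda) (a small addition, not part of the code in the book)
l2norms <- sqrt(colSums(coef(res)[-1,]^2))
plot(log(res$lambda), l2norms, type="l", xlab="log(lambda)", ylab="l2 norm")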
# get the regression coefficients for lambda=50
pred50 <- predict(res, s=50, type="coefficients")[,1]
pred50

############################################################################

# a note about how predict() works here

# note: lambda=50 was not one of the lambda values we specified above
options(scipen=100)
lambdas
options(scipen=0)

# predict() estimates the coefficients for lambda=50 by interpolating the
# values we actually considered when we called glmnet()

# coefficient profile plot as a function of log(lambda)
plot(res, xvar="lambda")

# indicate log(50) as a vertical line in the plot
abline(v=log(50))

# however, this interpolation is not entirely accurate
res50 <- glmnet(x, y, alpha=0, lambda=50)
round(data.frame(pred50 = pred50, res50 = coef(res50)[,1]), digits=5)

# can use exact=TRUE to avoid the use of interpolation
pred50 <- predict(res, s=50, type="coefficients", exact=TRUE, x=x, y=y)[,1]
round(data.frame(pred50 = pred50, res50 = coef(res50)[,1]), digits=5)

# we still do not get the exact same coefficients; the problem is again that
# 'thresh' needs to be lowered
res    <- glmnet(x, y, alpha=0, lambda=lambdas, thresh=1e-20)
res50  <- glmnet(x, y, alpha=0, lambda=50, thresh=1e-20)
pred50 <- predict(res, s=50, type="coefficients", exact=TRUE, x=x, y=y)[,1]
round(data.frame(pred50 = pred50, res50 = coef(res50)[,1]), digits=5)

############################################################################

# validation-set approach for choosing lambda

set.seed(1)

train  <- sample(1:nrow(x), nrow(x)/2)
test   <- -train
y.test <- y[test]

# ridge regression using only the training data
res <- glmnet(x[train,], y[train], alpha=0, lambda=lambdas, thresh=1e-12)

# predict the salary for the test data setting lambda=4
pred <- predict(res, s=4, newx=x[test,])

# compute the test MSE
mean((pred-y.test)^2)

# if lambda is very large, then essentially all coefficients are shrunken
# towards 0, in which case the predicted value is just the intercept in the
# training data, which is equal to the mean salary in the training data; the
# test MSE is then equal to mean((mean(y[train]) - y.test)^2)

# same if we set lambda=10^10 (i.e., very large)
pred <- predict(res, s=10^10, newx=x[test,])
mean((pred-y.test)^2)
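# we can confirm this by computing the test MSE for the intercept-only
# prediction directly
mean((mean(y[train]) - y.test)^2)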
# predict the salary for the test data setting lambda=0 (i.e., like OLS)
pred <- predict(res, s=0, newx=x[test,], exact=TRUE, x=x[train,], y=y[train])

# compute the test MSE
mean((pred-y.test)^2)

# compare the OLS coefficients from lm() with the ones we get from glmnet()
pred0 <- predict(res, s=0, exact=TRUE, type="coefficients",
                 x=x[train,], y=y[train])
round(data.frame(lm = coef(lm(y ~ x, subset=train)), glmnet = pred0[,1]), digits=5)

# slightly different again; setting 'thresh' above to something even smaller
# (like thresh=1e-20) yields coefficients that are even closer

# compute the test MSE for various values of lambda (zooming into a range
# where we can find the minimum)

lambdas <- 10^seq(4, -2, length=100)
res <- glmnet(x[train,], y[train], alpha=0, lambda=lambdas)

test.error <- rep(NA, 100)

for (i in 1:100) {
   pred <- predict(res, s=lambdas[i], newx=x[test,])
   test.error[i] <- mean((pred-y.test)^2)
}

# plot the test MSE as a function of log(lambda)
plot(log(lambdas), test.error, type="o", pch=19,
     xlab="log(lambda)", ylab="Test MSE")

# for which lambda value is the estimated test MSE minimized
bestlam <- lambdas[which.min(test.error)]
bestlam

# indicate that value in the plot above as a vertical line
abline(v=log(bestlam))

# ridge regression with this lambda value in the full dataset
res <- glmnet(x, y, alpha=0, lambda=bestlam)
coef(res)[,1]

############################################################################

# ridge regression using only the training data
lambdas <- 10^seq(10, -2, length=100)
res <- glmnet(x[train,], y[train], alpha=0, lambda=lambdas, thresh=1e-12)

# do 10-fold cross-validation using cv.glmnet() to find the lambda value that
# minimizes the out-of-sample MSE (but only using the training data!)
set.seed(1)
cv.out <- cv.glmnet(x[train,], y[train], alpha=0)
plot(cv.out)

# for which lambda value is the out-of-sample MSE minimized
bestlam <- cv.out$lambda.min
bestlam

# compute the test MSE using this lambda value
pred <- predict(res, s=bestlam, newx=x[test,])
mean((pred-y.test)^2)

# ridge regression with this lambda value in the full dataset
res <- glmnet(x, y, alpha=0, lambda=bestlam)
coef(res)[,1]

############################################################################

# The Lasso

# lasso regression using only the training data
lambdas <- 10^seq(10, -2, length=100)
res <- glmnet(x[train,], y[train], alpha=1, lambda=lambdas)

# coefficient profile plot as a function of log(lambda)
plot(res, xvar="lambda")

# note: the range of lambda values considered in the book isn't ideal, since
# there is a wide range of larger lambda values where all coefficients are
# shrunken to 0

# 10-fold cross-validation
set.seed(1)
cv.out <- cv.glmnet(x[train,], y[train], alpha=1)
plot(cv.out)

# for which lambda value is the out-of-sample MSE minimized
bestlam <- cv.out$lambda.min
bestlam

# compute the test MSE using this lambda value
pred <- predict(res, s=bestlam, newx=x[test,])
mean((pred-y.test)^2)

# use this lambda value in the full dataset
res <- glmnet(x, y, alpha=1, lambda=bestlam)
coef(res)[,1]
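# note: a nice property of the lasso is that some of the coefficient
# estimates are exactly zero; we can pick out just the non-zero ones (as is
# also done in the lab in the book)
lasso.coef <- coef(res)[,1]
lasso.coef[lasso.coef != 0]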
# in the book, the authors use the full sequence of lambdas and then predict
# the coefficients for the best lambda value, but this yields slightly
# different results
res <- glmnet(x, y, alpha=1, lambda=lambdas)
predict(res, s=bestlam, type="coefficients")[,1]

# again, this is due to the interpolation that predict() does by default

# contrast this again with using exact=TRUE
res <- glmnet(x, y, alpha=1, lambda=lambdas)
predict(res, s=bestlam, type="coefficients", exact=TRUE, x=x, y=y)[,1]

# but we need to lower 'thresh' again to get essentially the same values
res <- glmnet(x, y, alpha=1, lambda=bestlam, thresh=1e-20)
predict(res, s=bestlam, type="coefficients")[,1]
res <- glmnet(x, y, alpha=1, lambda=lambdas, thresh=1e-20)
predict(res, s=bestlam, type="coefficients", exact=TRUE, x=x, y=y)[,1]

############################################################################

### 6.5.3: PCR and PLS Regression

# install (if needed) and load the pls package

#install.packages("pls")
library(pls)

# Principal Components Regression (PCR)

# do PCR with 10-fold cross-validation
set.seed(2)
res <- pcr(Salary ~ ., data=dat, scale=TRUE, validation="CV")
summary(res)

# plot the cross-validated MSE as a function of the number of components
validationplot(res, val.type="MSEP")

# do PCR with 10-fold cross-validation using only the training data
set.seed(1)
res <- pcr(Salary ~ ., data=dat, scale=TRUE, validation="CV", subset=train)
validationplot(res, val.type="MSEP")

# predict in the test data using 5 components
pred <- predict(res, x[test,], ncomp=5)

# compute the test MSE
mean((pred-y.test)^2)

# now fit PCR using 5 components in the full dataset
res <- pcr(Salary ~ ., data=dat, scale=TRUE, ncomp=5)
summary(res)

############################################################################

# Partial Least Squares Regression (PLSR)

# do PLSR with 10-fold cross-validation using only the training data
set.seed(1)
res <- plsr(Salary ~ ., data=dat, scale=TRUE, validation="CV", subset=train)
summary(res)
validationplot(res, val.type="MSEP")

# predict in the test data using 1 component
pred <- predict(res, x[test,], ncomp=1)

# compute the test MSE
mean((pred-y.test)^2)

# now fit PLSR using 1 component in the full dataset
res <- plsr(Salary ~ ., data=dat, scale=TRUE, ncomp=1)
summary(res)
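# a small addition: the percentage of variance in the predictors that is
# explained by the component can also be extracted directly with explvar()
# from the pls package
explvar(res)

############################################################################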