############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2022-06-02 # # Topic(s): # - An Introduction to Statistical Learning (https://www.statlearning.com) # - Section(s): 5.1.5 - 5.4 # # last updated: 2022-06-06 ############################################################################ ### 5.1.5: Cross-Validation on Classification Problems # the dataset used in this section is not available; therefore, we will use # the 'Default' dataset to try out some cross-validation methods in the # context of a classification setting # (install) and load the ISLR2 package #install.packages("ISLR2") library(ISLR2) # copy the Default dataset to dat dat <- Default # examine the first 6 rows of the dataset head(dat) # turn default into a logical variable dat$default <- dat$default == "Yes" # frequency table of the outcome variable (default) table(dat$default) # note: defaulting is a relatively rare outcome (only a little over 3% of the # customers defaulted on their credit card debt) # fit logistic regression model predicting defaulting from variables balance, # income, and student res <- glm(default ~ balance + income + student, data=dat, family=binomial) summary(res) # predict whether a customer will default based on the model (i.e., whether # the estimated probability of defaulting is larger than 0.5) and examine the # confusion matrix pred.train <- predict(res, type="response") > 0.5 table(pred.train, dat$default) # compute the training error rate errorrate.train <- mean(pred.train != dat$default) errorrate.train # do leave-one-out cross validation (LOOCV) to estimate the test error rate # (this will take a few minutes since 10,000 models need to be fitted, so use # a progress bar to indicate how things are progressing) pred.loocv <- rep(NA, nrow(dat)) pb <- txtProgressBar(max=nrow(dat), style=3) for (i in 1:nrow(dat)) { setTxtProgressBar(pb, value=i) res <- glm(default ~ balance + income + student, data=dat, family=binomial, subset=-i) pred.loocv[i] <- predict(res, newdata=dat[i,], type="response") > 0.5 } close(pb) # confusion matrix and LOOCV test error rate table(pred.loocv, dat$default) mean(pred.loocv != dat$default) # note: this is identical to the training error rate; since n=10,000, the # models are fitted with 9,999 data points each and therefore are essentially # all the same as the model fitted to all 10,000 data points; as a result, the # training error rate and the LOOCV test error rate are in this case identical # do k-fold cross-validation (CV) to estimate the test error rate (with k=10) set.seed(1234) k <- 10 grp <- sample(1:k, nrow(dat), replace=TRUE) pred.kfoldcv <- rep(NA, nrow(dat)) for (i in 1:k) { res <- glm(default ~ balance + income + student, data=dat, family=binomial, subset=grp!=i) pred.kfoldcv[grp==i] <- predict(res, newdata=dat[grp==i,], type="response") > 0.5 } # confusion matrix and k-fold CV test error rate table(pred.kfoldcv, dat$default) mean(pred.kfoldcv != dat$default) # again, the test error rate is the same as the training error rate, although # the confusion matrix is slightly different; here, the models are fitted with # ~9000 data points each and therefore are not going to differ all that much # from each other again # now fit (polynomial) models with increasing complexity and estimate the test # error rate using 10-fold cross-validation for each model k <- 10 degrees <- 1:6 errorrates.poly <- rep(0, length(degrees)) grp <- sample(1:k, nrow(dat), replace=TRUE) for (d in degrees) { for (i in 1:k) { res <- glm(default ~ poly(balance, degree=d) + poly(income, degree=d) + student, data=dat, family=binomial, subset=grp!=i) pred <- predict(res, newdata=dat[grp==i,], type="response") > 0.5 errorrates.poly[d] <- errorrates.poly[d] + sum(pred != dat$default[grp==i]) } } errorrates.poly <- errorrates.poly / nrow(dat) errorrates.poly # plot the test error rate as a function of the polynomial degree plot(degrees, errorrates.poly, type="o", pch=19, ylim=c(.025,.035), xlab="Degree of Polynomial", ylab="Error Rate") # add the line for the error rate we get if we predict that nobody defaults # (since defaulting is rare, doing so gives us a pretty low error rate) abline(h=mean(dat$default), lty="dotted") # add the line for the training error rate abline(h=errorrate.train, lty="dashed") # conclusion: no matter how high of a degree polynomial model we use, the test # error rate pretty much stays flat; so a model without any higher degree # polynomial terms seems perfectly fine for these data ############################################################################ # (install) and load the class package (to do k-nearest neighbor) #install.packages("class") library(class) # rescale the numeric variables into a 0-1 range and turn the student factor # variable into a 0/1 variable (so that when we use knn below, all predictors # have a comparable range) dat$balance <- (dat$balance - min(dat$balance)) / (max(dat$balance) - min(dat$balance)) dat$income <- (dat$income - min(dat$income)) / (max(dat$income) - min(dat$income)) dat$student <- ifelse(dat$student == "Yes", 1, 0) # split the data into a training and test dataset id <- sample(nrow(dat), round(nrow(dat)/2)) dat.train <- dat[ id,] dat.test <- dat[-id,] predvars <- c("balance", "income", "student") # knn with k=5 and compute the error rate in the test data set.seed(51358) pred <- knn(dat.train[predvars], dat.test[predvars], dat.train$default, k=5) mean(pred != dat.test$default) # knn with increasing values of k and compute the test error rate using k-fold # CV with k=10 (careful with the different meanings of k here) ks.for.knn <- c(1:15, seq(20, 50, by=5), seq(60, 100, by=10)) k.for.cv <- 10 errorrates.knn <- rep(0, length(ks.for.knn)) grp <- sample(1:k.for.cv, nrow(dat), replace=TRUE) for (j in 1:length(ks.for.knn)) { for (i in 1:k.for.cv) { pred <- knn(dat[grp!=i,predvars], dat[grp==i,predvars], dat$default[grp!=i], k=ks.for.knn[j]) errorrates.knn[j] <- errorrates.knn[j] + sum(pred != dat$default[grp==i]) } } errorrates.knn <- errorrates.knn / nrow(dat) errorrates.knn # plot the test error rate as a function of 1/k (flexibility) (use a log scale # for the x-axis to make it easier to distinguish points where 1/k is low) plot(1/ks.for.knn, errorrates.knn, type="o", pch=19, ylim=c(.025,.045), xlab="1/k (flexibility)", ylab="Error Rate", log="x") # add the line for the error rate we get if we predict that nobody defaults abline(h=mean(dat$default), lty="dotted") # add the line for the training error rate abline(h=errorrate.train, lty="dashed") # find the value of k (in knn) which minimizes the test error rate ks.for.knn[which.min(errorrates.knn)] # conclusion: now we see the 'classic' U-shape in the plot; we minimize the # test error rate for a value of around k = 25 (1/k = 0.04) for knn, but the # performance is slightly worse in these data when compared to the logistic # regression models we examined earlier ############################################################################ # while the data used in this section isn't available via the ISLR2 package # (or some other place as far as I know), we can extract the data with some # effort from Figure 2.13 (using packages like digitize, metaDigitise, or # juicr); I did this, yielding the following dataset dat <- structure(list(x1 = c(7.59, 6.67, 5.72, 5.12, 2.96, 2.25, 2.6, 2.35, 2.23, 3.15, 3.54, 3.2, 3.1, 3.32, 6.03, 5.81, 5.02, 4.37, 3.54, 4.49, 4.5, 5.16, 5.37, 5.63, 5.74, 4.89, 5.33, 5.36, 5.76, 4.44, 4.87, 4.78, 5.24, 5.87, 5.5, 4.94, 4.8, 4.65, 4.75, 5.19, 5.24, 5.42, 5.47, 5.71, 5.62, 5.79, 6.09, 6.29, 5.62, 5.89, 6.53, 6.66, 6.78, 6.67, 6.4, 6.51, 6.76, 6.93, 6.54, 6.32, 6.42, 6.6, 6.76, 7.01, 7.63, 7.28, 8.21, 8.54, 9.32, 8.97, 8.13, 8.01, 8.72, 8.88, 5.05, 5.26, 5.38, 5.4, 5.91, 5.97, 6.13, 6.33, 6.39, 6.94, 6.97, 6.08, 5.97, 5.63, 5.67, 5.96, 6.27, 6.21, 6.13, 5.6, 5.59, 5.69, 5.67, 5.34, 6.14, 5.99, 3.9, 4.77, 4.91, 5.88, 4.1, 3.06, 2.73, 3.91, 3.96, 4.85, 2.25, 1.84, 3.3, 4.13, 3.52, 1.34, 1.17, 0.64, 1.84, 2.42, 3.74, 3.79, 3.73, 1.41, 1.23, 1.24, 2.11, 1.99, 2.24, 1.93, 1.37, 1.71, 1.61, 1.51, 1.86, 1.45, 2.27, 2.4, 2.46, 3.04, 3.66, 4.2, 4.22, 4.27, 2.78, 2.27, 2.44, 2.38, 2.39, 2.66, 2.93, 3.09, 3.62, 2.77, 3.09, 3.48, 4.25, 4.69, 4.9, 5.83, 6.11, 6.55, 6, 5.59, 5.52, 5.39, 3.04, 3.53, 3.5, 3.57, 3.35, 3.34, 6.69, 4.3, 3.95, 4, 3.55, 3.84, 3.9, 3.88, 4.08, 4.37, 4.34, 4.04, 4, 4.11, 4.32, 4.18, 4.04, 4.36, 4.95, 5.47, 6.14, 5.94, 5.53, 4.7, 4.96, 4.89, 5.5, 5.5), x2 = c(2.65, 2.37, 1.62, 1.75, 1.82, 2.23, 2.62, 2.84, 3.25, 2.83, 3.34, 3.57, 3.83, 3.8, 7.98, 7.66, 8.17, 8.06, 6.6, 7.54, 7.08, 7.36, 7.37, 7.41, 7.06, 6.81, 6.77, 6.57, 6.61, 6.15, 6.14, 6.06, 5.93, 6.14, 5.86, 5.54, 5.22, 5.01, 4.61, 5.41, 5.24, 5.47, 5.57, 5.57, 5.44, 5.4, 5.5, 6.07, 5.04, 4.92, 5.77, 5.75, 5.75, 5.63, 5.04, 4.93, 5.07, 4.83, 4.63, 4.42, 4.13, 4.18, 4.28, 4.07, 4.34, 6.02, 5.63, 6.26, 6.38, 6.92, 7.16, 7.28, 7.68, 7.63, 3.72, 3.75, 3.74, 3.83, 3.82, 3.71, 3.71, 3.5, 3.32, 3.58, 3.27, 3.44, 3.42, 3.52, 3.13, 3.18, 2.9, 2.73, 2.76, 2.94, 2.84, 2.83, 2.7, 2.75, 2.48, 2.41, 9.26, 9.21, 9.03, 8.79, 9.09, 9.06, 8.97, 8.84, 8.67, 8.51, 8.55, 8.14, 8.49, 8.56, 8.42, 7.75, 7.62, 7.26, 7.63, 7.85, 8.06, 7.93, 7.81, 6.88, 6.77, 6.66, 7.54, 7.33, 7.07, 6.66, 6.25, 6.28, 6.17, 5.93, 5.91, 5.38, 5.12, 5.12, 5.4, 4.45, 4.32, 4.21, 4.23, 4.55, 5, 6.03, 6.03, 5.83, 5.75, 5.66, 5.58, 5.79, 4.72, 7.21, 7.26, 7.6, 7.96, 8.09, 8.23, 8.3, 7.97, 7.8, 7.78, 7.83, 7.81, 7.64, 6.59, 6.87, 6.9, 6.97, 6.91, 7.12, 6.75, 7.16, 7.05, 6.65, 6.45, 6.3, 6.01, 5.89, 5.83, 5.92, 5.65, 5.57, 5.46, 5.23, 5.07, 4.98, 4.82, 4.75, 4.63, 4.48, 4.38, 4, 3.8, 3.89, 3.39, 3.27, 2.86, 3.26), grp = c("blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange", "orange")), row.names = c(NA, -200L), class = "data.frame") # recreate Figure 2.13 symbols(dat$x1, dat$x2, circles=rep(.08,nrow(dat)), inches=FALSE, fg=dat$grp, lwd=4, xlim=c(0,10), ylim=c(0,10), xlab="x1", ylab="x2") # since the process of extracting data from figures isn't 100% accurate, there # might be some minor discrepancies, but that looks pretty much the same # note: we still do not know what the true model is that generated the data, # so we will not be able to compute the true test error rates as was done in # the book, but we should be able to reproduce (at least quite closely) the # estimated error rates using the given data # turn grp into a logical variable dat$grp <- dat$grp == "orange" # fit polynomial models with increasing complexity and compute the training # error rate degrees <- 1:10 errorrates.training <- rep(0, length(degrees)) for (d in degrees) { res <- glm(grp ~ poly(x1, degree=d) + poly(x2, degree=d), data=dat, family=binomial) pred <- predict(res, type="response") > 0.5 errorrates.training[d] <- mean(pred != dat$grp) } # plot the training error rate as a function of the polynomial degree plot(degrees, errorrates.training, type="l", lwd=7, col="#30b5ff", ylim=c(.11,.21), xlab="Degree of Polynomial", ylab="Error Rate") # note: the line doesn't look exactly like the one in Figure 5.8 (left panel) # but it's quite close (the discrepancy must be due to slight deviations in # the extracted data) # fit polynomial models with increasing complexity and estimate the test error # rate using k-fold cross-validation (with k=10) set.seed(261) k <- 10 degrees <- 1:10 errorrates.poly <- rep(0, length(degrees)) grp <- sample(rep(1:k, each=nrow(dat)/k)) for (d in degrees) { for (i in 1:k) { res <- glm(grp ~ poly(x1, degree=d) + poly(x2, degree=d), data=dat, family=binomial, subset=grp!=i) pred <- predict(res, newdata=dat[grp==i,], type="response") > 0.5 errorrates.poly[d] <- errorrates.poly[d] + sum(pred != dat$grp[grp==i]) } } errorrates.poly <- errorrates.poly / nrow(dat) errorrates.poly # add the test error rates to the plot lines(degrees, errorrates.poly, lwd=7) # note: I tried out many seeds, but couldn't find one where the black line # shows the U-shape as in Figure 5.8, but at least we see how the error rates # from CV are higher than the training error rates # knn with increasing values of k and compute the training error rate set.seed(544) ks.for.knn <- 1:100 errorrates.training <- rep(0, length(ks.for.knn)) predvars <- c("x1", "x2") for (j in 1:length(ks.for.knn)) { pred <- knn(dat[predvars], dat[predvars], dat$grp, k=ks.for.knn[j]) errorrates.training[j] <- mean(pred != dat$grp) } # smooth the error rates a bit errorrates.training <- predict(loess(errorrates.training ~ 1/ks.for.knn, span=0.35)) # plot the training error rate as a function of 1/k (flexibility) (use a log # scale for the x-axis to make it easier to distinguish points) plot(1/ks.for.knn, errorrates.training, type="l", lwd=7, col="#30b5ff", ylim=c(.11,.21), xlab="1/k (flexibility)", ylab="Error Rate", log="x") # knn with increasing values of k and compute the test error rate using k-fold # CV with k=10 (careful with the different meanings of k here) k.for.cv <- 10 errorrates.knn <- rep(0, length(ks.for.knn)) grp <- sample(1:k.for.cv, nrow(dat), replace=TRUE) for (j in 1:length(ks.for.knn)) { for (i in 1:k.for.cv) { pred <- knn(dat[grp!=i,predvars], dat[grp==i,predvars], dat$grp[grp!=i], k=ks.for.knn[j]) errorrates.knn[j] <- errorrates.knn[j] + sum(pred != dat$grp[grp==i]) } } errorrates.knn <- errorrates.knn / nrow(dat) # smooth the error rates a bit errorrates.knn <- predict(loess(errorrates.knn ~ 1/ks.for.knn, span=0.35)) # add the test error rates to the plot lines(1/ks.for.knn, errorrates.knn, lwd=7) # find the value of k (in knn) which minimizes the test error rate ks.for.knn[which.min(errorrates.knn)] # this looks similar to Figure 5.8 (right panel), although the error rates # need to be smoothed a bit as otherwise they look much more jagged ############################################################################ ### 5.2: The Bootstrap # let's see if we can derive equation 5.6 # # Var(aX + (1-a)Y) = a^2Var(X) + (1-a)^2Var(Y) + 2a(1-a)Cov(X,Y) # = a^2 sigma^2_X + (1-a)^2 sigma^2_Y + 2a(1-a) sigma_XY = Var(return) # # take the derivative of this with respect to a, which yields: # # d Var(return) / d a = 2 a sigma^2_X - 2 (1-a) sigma^2_Y + (2-4a) sigma_XY # = 2 a sigma^2_X - 2 sigma^2_Y + 2 a sigma^2_Y + 2 sigma_XY - 4 a sigma_XY # = a (2 sigma^2_X + 2 sigma^2_Y - 4 sigma_XY) - 2 sigma^2_Y + 2 sigma_XY # # set the derivative equal to 0 and solve for a: # # a = (2 sigma^2_Y - 2 sigma_XY) / (2 sigma^2_X + 2 sigma^2_Y - 4 sigma_XY) # = (sigma^2_Y - sigma_XY) / (sigma^2_X + sigma^2_Y - 2 sigma_XY) # simulate data like those in Figure 5.9 library(MASS) set.seed(1234) n <- 100 mu <- c(0,0) Sigma <- matrix(c(1,0.5,0.5,1.25), nrow=2, ncol=2) compalpha <- function(x, y) (var(y) - cov(x,y)) / (var(x) + var(y) - 2*cov(x,y)) # Figure 5.9 par(mfrow=c(2,2)) xy <- mvrnorm(n, mu=mu, Sigma=Sigma) plot(xy[,1], xy[,2], pch=19, col="seagreen4", xlab="X", ylab="Y") compalpha(xy[,1], xy[,2]) xy <- mvrnorm(n, mu=mu, Sigma=Sigma) plot(xy[,1], xy[,2], pch=19, col="seagreen4", xlab="X", ylab="Y") compalpha(xy[,1], xy[,2]) xy <- mvrnorm(n, mu=mu, Sigma=Sigma) plot(xy[,1], xy[,2], pch=19, col="seagreen4", xlab="X", ylab="Y") compalpha(xy[,1], xy[,2]) xy <- mvrnorm(n, mu=mu, Sigma=Sigma) plot(xy[,1], xy[,2], pch=19, col="seagreen4", xlab="X", ylab="Y") compalpha(xy[,1], xy[,2]) # estimate alpha for 1000 simulated datasets alphas <- replicate(1000, { xy <- mvrnorm(n, mu=mu, Sigma=Sigma) compalpha(xy[,1], xy[,2]) }) par(mfrow=c(1,3)) # histogram of the estimated alpha values hist(alphas, main="", col="#dba123") abline(v=0.6, lwd=5) # mean and SD of the alpha values mean(alphas) sd(alphas) ############################################################################ # generate a dataset that we will use for the bootstrapping xy <- mvrnorm(n, mu=mu, Sigma=Sigma) # compute alpha in the sample compalpha(xy[,1], xy[,2]) # number of bootstrap samples B <- 1000 alphas.boot <- double(B) for (i in 1:B) { xy.boot <- xy[sample(n, replace=TRUE),] alphas.boot[i] <- compalpha(xy.boot[,1], xy.boot[,2]) } # histogram of the bootstrap estimates of alpha hist(alphas.boot, main="", col="#30b5ff") abline(v=0.6, lwd=5) # mean and SD of the bootstrap estimates mean(alphas.boot) sd(alphas.boot) # boxplot boxplot(list(True=alphas, Bootstrap=alphas.boot), col=c("#dba123","#30b5ff")) abline(h=0.6, lwd=5) ############################################################################ # do bootstrapping using the boot package # load the boot package library(boot) compalpha.boot <- function(dat, indices) { dat.boot <- dat[indices,] x <- dat.boot[,1] y <- dat.boot[,2] (var(y) - cov(x,y)) / (var(x) + var(y) - 2*cov(x,y)) } res <- boot(xy, statistic=compalpha.boot, R=1000) res # get bootstrap CIs boot.ci(res, type=c("norm","basic","perc","bca")) # note: the percentile method is very similar to just taking the 2.5th and # 97.5th quantiles (for a 95% CI) of the bootstrap estimates quantile(res$t[,1], c(.025, .975)) ############################################################################ ### 5.3 Lab: Cross-Validation and the Bootstrap # skipped, because we pretty much covered everything already # in the lab, the cv.glm() function (from the boot package) is used to do # LOOCV and k-fold CV based on the regression models fitted with glm(); this # is of course very convenient, but this will not work in combination with # knn() from the class package; using the caret package, one can apparently do # CV in combination with knn: # # https://stats.stackexchange.com/questions/318968/knn-and-k-folding-in-r # # above, we simply programmed LOOCV and k-fold CV manually, which we can then # easily apply to essentially any model/method for making predictions # a note on page 218: the authors claim that "the standard formulas assume # (somewhat unrealistically) that the xi are fixed" (when computing the # standard errors of the regression coefficients); this isn't right; there are # indeed two types of regression models, one where the predictors are fixed # and one where the predictors are observed (i.e., random) variables; however, # inferences are made in the same way under both models ############################################################################ ### 5.4: Exercises # skipped in the interest of time ############################################################################