############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-03-31
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 4.3.5 - 4.4.2
#
# last updated: 2022-04-01

############################################################################

# some notes about the previous stream:
#
# 1) Last time, we compared linear regression with k-nearest neighbors (KNN).
#    When applying KNN to predict the outcome value for an observation, we
#    examine the (Euclidean) distance between the observation and those in
#    the training dataset and find the k nearest observations (whose mean is
#    then used as the predicted value for the new observation). When
#    computing the distances, we might first want to rescale the predictor
#    variables to a common scale (e.g., z-scores), so that a one-unit
#    distance is comparable across all predictors. In the example we used
#    last time, the predictors all used the same units (one point = $1000 in
#    advertising budget), so this was not so relevant there, but it is still
#    worth noting in case we apply KNN in other examples.
#
# 2) When we discussed linear regression, we covered the omnibus test of the
#    null hypothesis that all slope coefficients are equal to 0 (i.e., that
#    none of the predictors are related to the outcome variable). The book
#    did not discuss such a test for logistic regression, but it is of
#    course also possible in this context. An example:

# load the ISLR2 package
library(ISLR2)

# copy the Default dataset to dat
dat <- Default

# include balance, income, and student as predictors in the model (Table 4.3)
res <- glm(default ~ balance + income + student, data=dat, family=binomial)
summary(res)

# fit the 'null model' that includes no predictors
res0 <- glm(default ~ 1, data=dat, family=binomial)

# omnibus test by comparing the two models against each other
anova(res0, res, test="Chisq")

# the p-value is tiny here, so we reject the null hypothesis:
#
# H0: beta_balance = beta_income = beta_studentYes = 0
#
# and hence conclude that there does appear to be some kind of relationship
# between one or more of these predictors and the outcome

############################################################################

### 4.3.5: Multinomial Logistic Regression

# the book does not provide a concrete example for multinomial logistic
# regression that we can replicate (since the focus in the book is on
# classification for two groups/classes); for those interested in an
# example, see here:
# https://stats.oarc.ucla.edu/r/dae/multinomial-logistic-regression/

############################################################################

### 4.4: Generative Models for Classification

# an illustration of how things can go wrong when there is 'perfect
# separation' in the data and we fit a logistic regression model

set.seed(1234)
n <- 100
x <- runif(n, 0, 1)
y <- ifelse(x > 0.7, 1, 0)
plot(x, y, pch=19)

# note: everybody with an x value > 0.7 is in group y = 1 and everybody else
# is in group y = 0; hence, we can perfectly predict y from x; let's fit a
# logistic regression model with these data

res <- glm(y ~ x, family=binomial)
summary(res)

# note the large coefficient for x and also the corresponding huge SE
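
# a quick check (an addition to these notes): with such perfect separation,
# the fitted probabilities are pushed to essentially 0 and 1, which is also
# why glm() warns that fitted probabilities numerically 0 or 1 occurred
round(range(fitted(res)[y == 0]), 4)
round(range(fitted(res)[y == 1]), 4)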
# now consider an even simpler case where the predictor is dichotomous (x is
# either 0 or 1); then the logistic model is given by this:
#
# for x = 0: log(p(Y = 1 | X = 0) / p(Y = 0 | X = 0)) = beta0
# for x = 1: log(p(Y = 1 | X = 1) / p(Y = 0 | X = 1)) = beta0 + beta1
#
# now say the data are like this (again, perfect separation):
#
#          y = 1   y = 0
#  x = 1    n11      0
#  x = 0     0      n00
#
# then the model should give:
#
# for x = 0: log(0 / 1) = -Inf, so beta0 should be -Inf
# for x = 1: log(1 / 0) = +Inf, so beta0 + beta1 should be +Inf,
#            so beta1 must be +Inf to 'offset' the -Inf of beta0
#
# when we fit a model to data with perfect separation, the estimates of beta0
# and beta1 are not -Inf and +Inf (since the model fitting algorithm does not
# allow this), but the coefficients get really large

# in practice, we typically do not have perfect separation, but we might have
# something quite close to it; say the data are like in the example above
# (where everybody with x > 0.7 has y = 1 and 0 otherwise), except for two
# individuals; let's repeatedly simulate such data, fit the logistic
# regression model, and save the estimated slope for x

iters <- 1000
coefs <- rep(NA, iters)

for (i in 1:iters) {
   x <- runif(n, 0, 1)
   y <- ifelse(x > 0.7, 1, 0)
   y[1:2] <- 1 - y[1:2] # flip y for the first two individuals
   res <- glm(y ~ x, family=binomial)
   coefs[i] <- coef(res)[2]
}

# histogram of the estimated slopes
hist(coefs, breaks=50, xlab="Slope Coefficient", main="")

# note how unstable the estimated slopes are!

# now let's do the same thing, but flip y for 25 individuals (so that the
# relationship between x and y is not so strong anymore)

for (i in 1:iters) {
   x <- runif(n, 0, 1)
   y <- ifelse(x > 0.7, 1, 0)
   y[1:25] <- 1 - y[1:25] # flip y for the first 25 individuals
   res <- glm(y ~ x, family=binomial)
   coefs[i] <- coef(res)[2]
}

hist(coefs, breaks=50, xlab="Slope Coefficient", main="")

# the estimated slopes still fluctuate, but not nearly as much

# so, somewhat surprisingly, the closer we are to being able to predict y
# perfectly from the predictor(s) in logistic regression, the more unstable
# the estimated slopes become (which would make the model perform poorly in
# new/test data)

############################################################################

# now let's consider the approach for classification described on page 142;
# say there are two classes (k = 0 and k = 1), with n0 and n1 individuals in
# these two groups, and there is a single predictor, x, that we have measured
# in these individuals; we can simulate such data as follows

set.seed(1234)
n1 <- 30000
n0 <- 10000
x_y1 <- rnorm(n1, mean=5, sd=1)
x_y0 <- rnorm(n0, mean=4, sd=1)

# note: while x is simulated from a normal distribution within the two groups
# here, this is (for the purposes of this example) NOT an assumption

# prior probabilities for y = 1 and y = 0
pi1 <- n1 / (n1 + n0)
pi0 <- n0 / (n1 + n0)

# estimate the densities of x within the two classes using 'kernel density
# estimation': https://en.wikipedia.org/wiki/Kernel_density_estimation
f1x <- density(x_y1)
f0x <- density(x_y0)

# plot the two distributions
plot(f1x, lwd=5, col="blue", xlim=c(0,10), main="")
lines(f0x, lwd=5, col="red")

# say we want to know f1x and f0x when x = 6
abline(v=6, lwd=3, lty="dotted")

# so we need to determine the height of the blue and red densities when
# x = 6; we will do this by finding the value of f1x and f0x at the x value
# closest to 6 in the estimated densities

f1x_equal_to_6 <- f1x$y[which.min(abs(f1x$x - 6))]
f0x_equal_to_6 <- f0x$y[which.min(abs(f0x$x - 6))]
f1x_equal_to_6
f0x_equal_to_6

segments(-1, f1x_equal_to_6, 6, f1x_equal_to_6, lwd=3, lty="dotted", col="blue")
segments(-1, f0x_equal_to_6, 6, f0x_equal_to_6, lwd=3, lty="dotted", col="red")

# posterior probabilities of a person belonging to the blue and red classes
# when x = 6
pi1 * f1x_equal_to_6 / (pi1 * f1x_equal_to_6 + pi0 * f0x_equal_to_6)
pi0 * f0x_equal_to_6 / (pi1 * f1x_equal_to_6 + pi0 * f0x_equal_to_6)

# this illustrates the use of Bayes' theorem (equation 4.15) for this example

# hence, the estimated probability that a person comes from the blue class
# when x = 6 is around 93% (and around 7% for the red class); so, according
# to the Bayes classifier, we would assign an observation with x = 6 to the
# blue class, since this has the highest posterior probability
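
# a small sanity check (an addition to these notes): since we happen to know
# the true densities used in the simulation (normal with means 5 and 4 and
# sd=1), we can compare the kernel-density-based posterior above with the one
# based on the true densities via dnorm(); the two should be very close
pi1 * dnorm(6, mean=5, sd=1) /
   (pi1 * dnorm(6, mean=5, sd=1) + pi0 * dnorm(6, mean=4, sd=1))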
############################################################################

### 4.4.1: Linear Discriminant Analysis for p = 1

# we will replicate the example in the book (note that we cannot simulate the
# exact same data, but we can simulate data just like those in the book)

# so, there are again two classes, now with 20 people in each class

set.seed(1234)
n1 <- 20
n0 <- 20
x_y1 <- rnorm(n1, mean= 1.25, sd=1)
x_y0 <- rnorm(n0, mean=-1.25, sd=1)
x <- c(x_y1, x_y0)
y <- c(rep(1, n1), rep(0, n0))

# in practice, the dataset would look like this
data.frame(y, x)

# histogram of the data in the two groups (like Figure 4.4, right panel)
hist(x_y1, xlim=c(-4,4), ylim=c(0,8), breaks=seq(-5,5,by=0.5),
     xlab="", main="", col=rgb(.8,0,1,.2))
par(new=TRUE)
hist(x_y0, xlim=c(-4,4), ylim=c(0,8), breaks=seq(-5,5,by=0.5),
     xlab="", ylab="", main="", col=rgb(0,.6,0,.2), axes=FALSE)

# compute the means of x within the two groups and the pooled variance
m1 <- mean(x_y1)
m0 <- mean(x_y0)
s2p <- ((n1-1)*var(x_y1) + (n0-1)*var(x_y0)) / (n1 + n0 - 2)

# add the LDA decision boundary to the histogram
abline(v = (m1 + m0) / 2, lwd=5)

# add the Bayes decision boundary to the histogram
abline(v = 0, lty="dashed", lwd=5)

# LDA classifier
pred.lda <- ifelse(c(x_y1, x_y0) > (m1 + m0) / 2, 1, 0)

# Bayes classifier
pred.bayes <- ifelse(c(x_y1, x_y0) > 0, 1, 0)

# show the data together with the two classifiers
data.frame(y, x, pred.lda, pred.bayes)

# error rates for LDA and the Bayes classifier
mean(y != pred.lda)
mean(y != pred.bayes)

############################################################################

# the above gives us the performance of LDA and the Bayes classifier in the
# training data; more important is the performance in new/test data

# for this, we will repeat the above many times and in each iteration of the
# simulation, we will also simulate new/test data and compute the error rate
# for these test data

iters <- 10000

n1_test <- 100
n0_test <- 100

error.rate.lda   <- rep(NA, iters)
error.rate.bayes <- rep(NA, iters)

for (i in 1:iters) {
   x_y1 <- rnorm(n1, mean= 1.25, sd=1)
   x_y0 <- rnorm(n0, mean=-1.25, sd=1)
   m1 <- mean(x_y1)
   m0 <- mean(x_y0)
   y_test <- c(rep(1, n1_test), rep(0, n0_test))
   x_y1_test <- rnorm(n1_test, mean= 1.25, sd=1)
   x_y0_test <- rnorm(n0_test, mean=-1.25, sd=1)
   pred.lda   <- ifelse(c(x_y1_test, x_y0_test) > (m1 + m0) / 2, 1, 0)
   pred.bayes <- ifelse(c(x_y1_test, x_y0_test) > 0, 1, 0)
   error.rate.lda[i]   <- mean(y_test != pred.lda)
   error.rate.bayes[i] <- mean(y_test != pred.bayes)
}

# average test data error rate of LDA and the Bayes classifier (in percent)
round(100 * mean(error.rate.lda), 1)
round(100 * mean(error.rate.bayes), 1)
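
# a small addition (not part of the original notes): because we know the true
# distributions in this simulation, the expected error rate of the Bayes
# classifier can also be computed analytically; with equal priors, it is
# P(X < 0 | Y = 1) / 2 + P(X > 0 | Y = 0) / 2
round(100 * (pnorm(0, mean= 1.25, sd=1) / 2 +
             pnorm(0, mean=-1.25, sd=1, lower.tail=FALSE) / 2), 1)

# this should be close to the average test error rate of the Bayes classifier
# obtained in the simulation above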
############################################################################

### 4.4.2: Linear Discriminant Analysis for p > 1

# install the ISLR2 package (if needed)
#install.packages("ISLR2")

# load the ISLR2 package
library(ISLR2)

# copy the Default dataset to dat
dat <- Default

# load the MASS package
library(MASS)

# look at the documentation of the lda() function
help(lda)

# do LDA using balance and student as predictors of 'default' (yes/no)
res <- lda(default ~ balance + student, data=dat)
res

# get the predictions for the training data
pred <- predict(res)

# confusion matrix (Table 4.4)
addmargins(table(pred$class, dat$default), margin=c(1,2))

# error rate of LDA in the training data (in percent)
100 * mean(pred$class != dat$default)

# sensitivity
table((pred$class == dat$default)[dat$default == "Yes"])
round(100 * mean((pred$class == dat$default)[dat$default == "Yes"]), 1)

# specificity
table((pred$class == dat$default)[dat$default == "No"])
round(100 * mean((pred$class == dat$default)[dat$default == "No"]), 1)

# use 0.2 as the threshold for assigning people to the default = "Yes" class
pred$class <- ifelse(pred$posterior[,"Yes"] > 0.2, "Yes", "No")

# confusion matrix (Table 4.5)
addmargins(table(pred$class, dat$default), margin=c(1,2))

# error rate of LDA in the training data
100 * mean(pred$class != dat$default)

# sensitivity and specificity
round(100 * mean((pred$class == dat$default)[dat$default == "Yes"]), 1)
round(100 * mean((pred$class == dat$default)[dat$default == "No"]), 1)

############################################################################

# compute the error rate, sensitivity, and specificity for 1000 different
# threshold values between 0 and 1

thresholds <- seq(0, 1, length=1000)

error.rate  <- rep(NA, length(thresholds))
sensitivity <- rep(NA, length(thresholds))
specificity <- rep(NA, length(thresholds))

for (i in 1:length(thresholds)) {
   pred$class <- ifelse(pred$posterior[,"Yes"] > thresholds[i], "Yes", "No")
   error.rate[i]  <- mean(pred$class != dat$default)
   sensitivity[i] <- mean((pred$class == dat$default)[dat$default == "Yes"])
   specificity[i] <- mean((pred$class == dat$default)[dat$default == "No"])
}

# plot the overall error rate and the false negative and false positive
# error rates as a function of the threshold value (Figure 4.7)
plot(thresholds, error.rate, type="l", lwd=5, ylim=c(0,.8), xlim=c(.001,.55))
lines(thresholds, 1 - sensitivity, lwd=5, col="blue")
lines(thresholds, 1 - specificity, lwd=5, col="orange")
legend("right", inset=.02, lty="solid", col=c("blue", "black", "orange"),
       lwd=5, legend=c("false negative rate", "overall error rate",
                       "false positive rate"))

# ROC curve (Figure 4.8)
plot(1 - specificity, sensitivity, type="l", lwd=3, col="blue",
     xlim=c(0,1), ylim=c(0,1), xlab="False positive rate",
     ylab="True positive rate")
abline(a = 0, b = 1, lty="dotted")

############################################################################

# typically, we would not want to draw the ROC curve manually; we can use
# the pROC package to automate all of this

# install the pROC package (if needed)
#install.packages("pROC")

# load the pROC package
library(pROC)

# for completeness' sake, apply LDA again and get the posterior probabilities
res <- lda(default ~ balance + student, data=dat)
pred <- predict(res)

# use the roc() function, draw the plot, and get the area under the curve
sav <- roc(default ~ pred$posterior[,"Yes"], data=dat)
plot(sav, lwd=3)
auc(sav)

# compare LDA with logistic regression
res <- glm(default ~ balance + student, data=dat, family=binomial)
pred <- predict(res, type="response")
sav <- roc(default ~ pred, data=dat)
plot(sav, lwd=3, add=TRUE, col="red")
auc(sav)

# the performance of LDA and logistic regression is almost identical for
# these data
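
# a possible extension (an addition to these notes): the pROC package can
# also suggest a threshold, for example the one that maximizes Youden's J
# statistic (sensitivity + specificity - 1); here this is applied to 'sav',
# the ROC object for the logistic regression model from above
coords(sav, x="best", best.method="youden")

############################################################################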