############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-03-03
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 2.4 - 3.1
#
# last updated: 2022-03-25

############################################################################

###### 2.4: Exercises

### Question 7

# read in the data

dat <- read.table(header=TRUE, text = "
obs x1 x2 x3 y
1    0  3  0 red
2    2  0  0 red
3    0  1  3 red
4    0  1  2 green
5   -1  0  1 green
6    1  1  1 red")

# new data point for which we do not know y but want to make a prediction for
new <- c(7, 0, 0, 0, NA)

# (a) Compute the Euclidean distance between each observation and the test
# point, X1 = X2 = X3 = 0.

# we can combine the data with the new data point using rbind() (we only need
# columns 2 through 4)
rbind(dat, new)[2:4]

# get the matrix of all Euclidean distances; the last row of this matrix gives
# the distances between the original data points and the new data point
dist(rbind(dat, new)[2:4])

# but we want this as a matrix
as.matrix(dist(rbind(dat, new)[2:4]))

# keep only the last row and remove the last value
d <- as.matrix(dist(rbind(dat, new)[2:4]))[7,-7]

# these are the Euclidean distances
d

# double-check the Euclidean distance between observations 3 and 7
sqrt((0-0)^2 + (1-0)^2 + (3-0)^2)

# (b) What is our prediction with K = 1? Why?

# the prediction when k = 1 is simply the value of y from the data point that
# has the lowest distance to the new data point

# the data point with the lowest distance is
which.min(d)

# so take the value of y from this data point
dat$y[which.min(d)]

# (c) What is our prediction with K = 3? Why?

# we need to find the 3 points with the lowest distance and see what value of y
# is most common among those 3 points; we can do this as follows

# order() gives us the ordering permutation; that is, the 5th point has the
# lowest distance, the 6th point has the next lowest distance, and so on
order(d)

# so the first three elements of this are the points with the lowest distance
order(d)[1:3]

# get the value of y for those three points
dat$y[order(d)[1:3]]

# create a frequency table based on these 3 points
table(dat$y[order(d)[1:3]])

# so we see that 'red' is most common among those 3 points, so this is the
# predicted value for the new data point when k=3

# (d) If the Bayes decision boundary in this problem is highly non-linear,
# then would we expect the best value for K to be large or small? Why?

# K should be small, since a smaller K gives a more flexible KNN fit, which is
# what we need to capture a highly non-linear decision boundary
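# as a quick cross-check of (b) and (c) (not part of the book's solution), we
# can also use the knn() function from the 'class' package (assuming it is
# installed); this should give the same predictions as the manual approach above
#install.packages("class")
library(class)
knn(train=dat[2:4], test=c(0,0,0), cl=factor(dat$y), k=1)
knn(train=dat[2:4], test=c(0,0,0), cl=factor(dat$y), k=3)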
######################################

### Question 8

# note: the 'solution' below deviates slightly here and there from what is
# suggested in the book based on my own preferences

# need to get the data from here: https://www.statlearning.com/s/College.csv
# can also use R to download the file
#download.file("https://www.statlearning.com/s/College.csv", destfile="College.csv")

# read in the data and examine the first 6 rows
dat <- read.csv("College.csv")
head(dat)

# look at all rows where variable X includes 'Illinois'
dat[grep("Illinois", dat$X),]

# set the row names equal to variable X and remove the first variable (which is X)
rownames(dat) <- dat[,1]
dat <- dat[,-1]
head(dat)

# some summary statistics on the variables in the dataset
summary(dat)

# for character variables (i.e., Private), create a frequency table
table(dat$Private)

# scatterplot matrix of the variables in columns 2 through 11 (i.e., excluding 'Private')
pairs(dat[2:11], pch=19, cex=0.1)

# boxplot of the 'Outstate' variable for each level of 'Private'
boxplot(Outstate ~ Private, data=dat)

# create a variable that is equal to 'Yes' when Top10perc > 50 and 'No' otherwise
dat$Elite <- ifelse(dat$Top10perc > 50, "Yes", "No")

# frequency table of this variable
table(dat$Elite)

# boxplot of the 'Outstate' variable for each level of 'Elite'
boxplot(Outstate ~ Elite, data=dat)

# histograms of the 4 variables below in a single figure
numb <- 20
op <- par(mfrow=c(2,2))
hist(dat$S.F.Ratio, main="Student to Faculty Ratio", xlab="", breaks=numb)
hist(dat$Outstate, main="Out of State Tuition", xlab="", breaks=numb)
hist(dat$Top10perc, main="Top 10% HS Student Applicants", xlab="", breaks=numb)
hist(dat$PhD, main="Percent of Faculty with PhDs", xlab="", breaks=numb)
par(op)

# calculate the acceptance percentage
dat$AcceptPerc <- 100 * dat$Accept / dat$Apps

# find the 10 universities/colleges with the lowest acceptance percentage
dat[order(dat$AcceptPerc),][1:10,]

# the usual suspects ...

######################################

### Question 9

# skipped this because it is essentially the same as the lab

######################################

### Question 10

# install the ISLR2 package (not 'library' -- in R, a 'library' is a location on
# your computer where packages are installed); note: by putting a # in front of
# the command, we do not accidentally reinstall the package every time we run
# this code (so remove the # once and run the command and then put the # back)
#install.packages("ISLR2")

# load the ISLR2 package
library(ISLR2)

# to look at the help file for this dataset (same idea with the # as above)
#?Boston

# copy the dataset to 'dat' (since I always name datasets 'dat')
dat <- Boston

# check the dimensions (number of rows and columns/variables)
dim(dat)

# examine the first 6 rows
head(dat)

# scatterplot of 'tax' versus 'ptratio'
plot(dat$tax, dat$ptratio, pch=19)

# this doesn't look like 506 points; so there must be many cases where points
# are on top of each other, which we cannot see in this plot

# find all unique combinations of the two variables and how often they occurred
res <- xyTable(dat$tax, dat$ptratio)
res

# use this to create a scatterplot with the points proportional in their area to
# how often they occurred
plot(res$x, res$y, cex=sqrt(res$number/15), pch=19)

#pairs(dat, pch=19, cex=0.1)

# scatterplots of all variables versus 'crim'
op <- par(mfrow=c(4,3), mar=c(5,4,1,1))
for (i in 2:ncol(dat)) {
   plot(dat[[i]], dat$crim, pch=19, cex=0.5, xlab=names(dat)[i], ylab="crime rate")
}
par(op)

# the 10 census tracts with the highest crime rates
dat[order(dat$crim, decreasing=TRUE),][1:10,]

# compare this against summary statistics for the crime rate variable
summary(dat$crim)

# frequency table of the 'chas' variable (so 35 census tracts in this data set
# bound the Charles river)
table(dat$chas)

# median pupil-teacher ratio
median(dat$ptratio)

# the census tract with the lowest median value of owner-occupied homes
dat[order(dat$medv),][1,]

# 64 tracts average more than seven rooms per dwelling
table(dat$rm > 7)

# 13 tracts average more than eight rooms per dwelling
table(dat$rm > 8)

# all tracts that average more than eight rooms per dwelling
dat[dat$rm > 8,]

############################################################################

### 3.1: Simple Linear Regression

# the data used in this section: https://www.statlearning.com/s/Advertising.csv
#download.file("https://www.statlearning.com/s/Advertising.csv", destfile="Advertising.csv")

# read in the data and examine the first 6 rows
dat <- read.csv("Advertising.csv")
head(dat)

# plot the data
plot(sales ~ TV, data=dat, pch=19, col="red3",
     xlab="TV Advertisement Budget", ylab="Sales (in 1000s of Units)")

# fit the simple regression model
res <- lm(sales ~ TV, data=dat)

# add regression line to the plot
abline(res, lwd=6, col="blue")

# examine the model results
summary(res)

# the estimated intercept is 7.032594, the estimated slope is 0.047537;
# so the model says: Y-hat = 7.032594 + 0.047537 * TV

# if we just want the model coefficients, we can use coef()
coef(res)
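# as a quick illustration (not part of the book's code), we can also use
# predict() to obtain the predicted sales values from the fitted model for
# particular TV budgets, for example for TV budgets of 0, 100, and 200
predict(res, newdata=data.frame(TV=c(0,100,200)))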
# the estimates of the intercept and slope are those that make the sum of the
# squared residuals (the 'residual sum of squares') as small as possible

# add the residuals to the plot
segments(dat$TV, predict(res), dat$TV, dat$sales, col="gray", lwd=2)
abline(res, lwd=6, col="blue")
points(sales ~ TV, data=dat, pch=19, col="red3")

# for example, the residual for the first data point is as follows
dat$TV[1]                                                    # value of TV
coef(res)[[1]] + coef(res)[[2]] * dat$TV[1]                  # predicted value
dat$sales[1] - (coef(res)[[1]] + coef(res)[[2]] * dat$TV[1]) # residual

# so the squared residual is
(dat$sales[1] - (coef(res)[[1]] + coef(res)[[2]] * dat$TV[1]))^2

# now let's do this for all data points
(dat$sales - (coef(res)[[1]] + coef(res)[[2]] * dat$TV))^2

# the residual sum of squares is just the sum of these squared residuals
sum((dat$sales - (coef(res)[[1]] + coef(res)[[2]] * dat$TV))^2)

# we can just get the residuals directly with resid(), so a more direct way to
# compute the RSS would have been
sum(resid(res)^2)

# the coef(res) values are those that make this as small as possible; for
# example, suppose we would set the estimated intercept to 7.05 and the
# estimated slope to 0.045; then the RSS would have been slightly larger
sum((dat$sales - (7.05 + 0.045 * dat$TV))^2)

# let's calculate the RSS value for different intercept values (between 5 and 9)
# and for different slope values (between about .03 and .07)

b0s <- seq(5, 9, length=100)
b1s <- seq(.027, .068, length=100)
RSSmat <- matrix(NA, nrow=length(b0s), ncol=length(b1s))

for (i in 1:length(b0s)) {
   for (j in 1:length(b1s)) {
      RSSmat[i,j] <- sum((dat$sales - (b0s[i] + b1s[j] * dat$TV))^2)
   }
}

# contour plot of the RSS values (like Figure 3.2; apparently, the authors
# divided the RSS values by 1000, so we will also do the same)
RSSmat <- RSSmat / 1000
contour(b0s, b1s, RSSmat, levels=c(2.11,2.15,2.2,2.3,2.5,3), labcex=1, col="blue")
points(coef(res)[1], coef(res)[2], pch=19, col="red", cex=1.5)

# the red point corresponds to the actual estimated intercept and slope values
# that minimize the RSS

# we can also draw the three-dimensional plot
pmat <- persp(b0s, b1s, RSSmat, theta=30, phi=-10, border="blue", col="gray",
              xlab="beta_0", ylab="beta_1", zlab="RSS")
points(trans3d(coef(res)[1], coef(res)[2], min(RSSmat), pmat=pmat),
       pch=19, col="red", cex=1.5)
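# as a quick sanity check (not in the book's code), we can look up the grid
# cell where RSSmat is smallest; the corresponding intercept and slope values
# should be close to (but not exactly equal to) the least-squares estimates,
# since the grid is only a rough approximation
ij <- which(RSSmat == min(RSSmat), arr.ind=TRUE)
c(b0s[ij[1]], b1s[ij[2]])
coef(res)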
############################################################################

# let's simulate some data like the authors did for Figure 3.3

set.seed(1350)
n <- 100
x <- rnorm(n)
y <- 2 + 3 * x + rnorm(n, 0, 2.2)

# plot the data (like Figure 3.3, left panel)
plot(x, y)

# since we simulated the data, we know what the true intercept is (2) and what
# the true slope is (3); let's add the true regression line to the plot
abline(a=2, b=3, col="red", lwd=3)

# fit the regression model and add the estimated regression line to the plot
res <- lm(y ~ x)
summary(res)
abline(res, col="blue", lwd=3)

# the estimated regression line differs from the true regression line because of
# 'sampling error' (i.e., for each training dataset, we get a slightly different
# regression line; see Figure 3.3, right panel)
plot(NA, xlim=c(-2,2), ylim=c(-10,10), xlab="X", ylab="Y")
for (i in 1:10) {
   y <- 2 + 3 * x + rnorm(n, 0, 2.2)
   tmp <- lm(y ~ x)
   abline(tmp, col="lightblue", lwd=2)
}
abline(a=2, b=3, col="red", lwd=3)
abline(res, col="blue", lwd=3)

# let's simulate 10000 new training datasets and compute the estimated intercept
# and slope for each one of them

b0s <- rep(NA, 10000)
b1s <- rep(NA, 10000)

for (i in 1:10000) {
   y <- 2 + 3 * x + rnorm(n, 0, 2.2)
   tmp <- lm(y ~ x)
   b0s[i] <- coef(tmp)[1]
   b1s[i] <- coef(tmp)[2]
}

# draw histograms of the estimated intercepts and slopes; also add the true
# intercept and slope as vertical lines to the plots and the estimated intercept
# and slope based on our single training dataset above as dotted lines

op <- par(mfrow=c(1,2))
hist(b0s, breaks=50)
abline(v=2, lwd=6)
abline(v=coef(res)[1], lwd=6, lty="dotted")
hist(b1s, breaks=50)
abline(v=3, lwd=6)
abline(v=coef(res)[2], lwd=6, lty="dotted")
par(op)

# note that the estimated intercepts and slopes fluctuate randomly around the
# true intercept and slope; these are the 'sampling distributions' of the
# intercept and slope estimates for this particular example

# of course, in practice, we do not know the true intercept and slope and we
# also only have a single training dataset; but we know that the estimated
# intercept and slope we observe in our data have come from such sampling
# distributions (as we can see above in the histograms)

# we would like to know how accurate our estimates are; for this, we want to
# know how large the variability in the sampling distributions is; if the
# sampling distributions are really 'wide', then we can't be very confident that
# the estimates we have are close to the true intercept and slope; on the other
# hand, if the sampling distributions are narrow, then the estimates all tend to
# fall close to the true intercept and slope and hence we can be more confident
# that our estimates are also close to the true values

# since we are simulating data, we can actually compute the standard deviation
# of the intercept and slope estimates across the 10000 datasets we simulated
sd(b0s)
sd(b1s)

# these are essentially the true 'standard errors' of the intercept and slope
# estimates (we would have to simulate an infinite number of training datasets
# to really get the true SEs, but this is close enough)

# again, in practice, we cannot compute the true SEs, since we only have a
# single dataset; but based on statistical theory, we can derive equations that
# allow us to estimate the true SEs; this is what you see in the output from our
# model under the 'Std. Error' column
summary(res)

# note that these values are not exactly the same as sd(b0s) and sd(b1s) but
# quite close (so strictly speaking, these values are estimates of the true SEs,
# but in practice, we simply call them 'standard errors' and do not draw the
# distinction between the true SEs and the estimated SEs)
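# as an aside (not part of the book's code), these estimated SEs can also be
# computed 'by hand' using the formulas in equation (3.8) of the book, plugging
# in the RSS-based estimate of the error variance; the two values below should
# match the 'Std. Error' column of the summary(res) output above
sigma2 <- sum(resid(res)^2) / (n - 2)                   # estimate of the error variance
sqrt(sigma2 * (1/n + mean(x)^2 / sum((x - mean(x))^2))) # SE of the intercept
sqrt(sigma2 / sum((x - mean(x))^2))                     # SE of the slope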
# if n (the sample size) is larger, then the sampling distributions will be
# narrower and hence the SEs will be smaller (you can try this out by setting n
# above to a larger value, say 500, and rerunning the code above)

# using these SEs, we can compute 'confidence intervals' for the intercept and
# slope; as described in the book, a 95% confidence interval has the following
# property: "if we take repeated samples and construct the confidence interval
# for each sample, 95% of the intervals will contain the true unknown value of
# the parameter."

# let's check if this is true with our simulation study; we again repeatedly
# simulate new data, construct the confidence interval for the slope for each
# new sample, and then check what proportion of these intervals captured the
# true slope (which we know to be 3)

lbs <- rep(NA, 10000)
ubs <- rep(NA, 10000)

for (i in 1:10000) {
   y <- 2 + 3 * x + rnorm(n, 0, 2.2)
   tmp <- confint(lm(y ~ x))
   lbs[i] <- tmp[2,1]
   ubs[i] <- tmp[2,2]
}

mean(lbs <= 3 & ubs >= 3)

# note that this is very close to .95 (or 95%); it deviates slightly from 95%
# because again, we would have to run this for an infinite number of iterations
# to get exactly 95%, but this is close enough to demonstrate that this really
# works

############################################################################

# let's go back to the advertising data

# fit the simple regression model
res <- lm(sales ~ TV, data=dat)
summary(res)

# so 0.457843 is the SE of the intercept and 0.002691 the SE of the slope

# how are the confidence intervals actually computed? roughly with this equation:
# estimate +- 2 * SE, so for example, for the slope, we would compute
0.047537 + c(-1,1) * 2 * 0.002691

# this isn't completely accurate (the value 2 in the equation above is an
# approximation), but we can use the confint() function to compute the exact
# confidence interval (for both the intercept and the slope)
confint(res)
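# as an aside (not shown in the book), the exact interval replaces the value 2
# with the appropriate quantile from a t-distribution with n-2 degrees of
# freedom; a sketch of this computation for the slope, which should match the
# second row of the confint(res) output above
coef(summary(res))["TV","Estimate"] +
   c(-1,1) * qt(0.975, df=res$df.residual) * coef(summary(res))["TV","Std. Error"]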
# we can also think of a confidence interval (CI) as a range of values that are,
# in some sense, 'compatible' with our data; so if the 95% CI includes 0, then
# even no relationship between the predictor and outcome (which is what a slope
# of 0 implies) is also compatible with our data and we therefore shouldn't be
# too confident that there really is a relationship between the predictor and
# outcome; but in this dataset, the 95% CI for the slope excludes 0 (all values
# in the CI are positive), so a slope of 0 would not be compatible with our data

# another approach is to conduct a null hypothesis test; the idea is this:
# assume that there really is no relationship between the predictor and the
# outcome (i.e., the true slope is 0); this is our 'null hypothesis'
#
# H_0: beta_1 = 0
#
# and we want to test this against the alternative that there really is a
# relationship (i.e., the true slope is not 0); this is our 'alternative
# hypothesis' (often this is denoted H_1 instead of H_a as in the book)
#
# H_a: beta_1 != 0

# in the output, we see a 't value' for the slope (17.67); this is the 'test
# statistic' for testing the null hypothesis; it is computed by dividing the
# estimate of the slope by its SE, so we can think of the test statistic as a
# 'signal-to-noise ratio'; the test statistic will be large when the estimated
# slope is far from 0 and/or when the SE is small (i.e., when we have a precise
# estimate of the true slope)
summary(res)

# next to the test statistic is the 'p-value' (Pr(>|t|) in the output); in this
# case, the p-value is very very small (< 2 * 10^-16)

# the p-value is the probability of seeing a test statistic as large as the one
# we see in our data, or an even larger one, assuming the null hypothesis is
# true (and under the 'assumptions of the model', but the book hasn't really
# discussed these assumptions so far; I am just noting this here already for
# precision)

# so a small p-value indicates that it is very unlikely that we would see such a
# large slope (or an even larger one) when we assume that there really is no
# relationship between the predictor and the outcome
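# as a quick illustration (not part of the book's code), the t value and the
# (two-sided) p-value for the slope can also be computed 'by hand' from the
# estimate and its SE; these values should match the summary(res) output above
tval <- coef(summary(res))["TV","Estimate"] / coef(summary(res))["TV","Std. Error"]
tval
2 * pt(abs(tval), df=res$df.residual, lower.tail=FALSE)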
# in this case, we 'reject the null hypothesis'; how small does the p-value have
# to be before we do so? by convention, we reject H_0 when the p-value is <= .05,
# although it should be emphasized that this is purely an arbitrary convention

# so in this example, we reject the null hypothesis that there is no
# relationship between TV advertising budgets and sales (and therefore conclude
# that there is a relationship)

# of course that conclusion could be right or wrong; since we do not know the
# truth (i.e., the true slope), we don't know if we have drawn the right
# conclusion

# a very nice article that discusses statistical tests, p-values, and confidence
# intervals is:
#
# Greenland, S., Senn, S. J., Rothman, K. J., Carlin, J. B., Poole, C., Goodman,
# S. N., & Altman, D. G. (2016). Statistical tests, P values, confidence
# intervals, and power: A guide to misinterpretations. European Journal of
# Epidemiology, 31(4), 337-350. https://doi.org/10.1007/s10654-016-0149-3

############################################################################

# concluding that there is a relationship between X and Y does not tell us how
# accurate our model is in predicting Y from X; to assess this, we can compute
# the 'residual standard error', which is in essence the standard deviation of
# the residuals
sd(resid(res))

# in the output, we see that the residual standard error is slightly different
summary(res)

# the actual equation (3.15) divides the RSS by n-2, while sd() above divides
# the RSS by n-1; we can correct this to get the exact same value
n <- 200
sd(resid(res)) * sqrt((n-1) / (n-2))

# another way to assess the accuracy of the model is to compute R^2 (we also see
# this value in the output above); R^2 indicates how much of the variance in the
# outcome variable is 'accounted for' (or 'explained') based on the predictor

# to understand this better, let's again fit our model and compute the variance
# of the residuals
res1 <- lm(sales ~ TV, data=dat)
var(resid(res1))

# what if we had not used any predictor in our regression model? then our model
# would have been Y = beta0 + error, which we fit as follows
res0 <- lm(sales ~ 1, data=dat)
summary(res0)

# the estimated intercept is identical to the mean of the outcome variable
mean(dat$sales)

# so if we only have an intercept and no predictor in the model, then the best
# we can do in predicting Y is to just use the mean

# how accurate is this model? let's again compute the variance of the residuals
var(resid(res0))

# this is actually identical to the variance of the outcome itself
var(dat$sales)

# and note that this is quite a bit larger than var(resid(res1)) above

# so var(resid(res0)) is the amount of variance in the outcome to begin with and
# var(resid(res1)) is the variance that is *unaccounted* for based on the model
# that includes the predictor; so the following will tell us how much of the
# variance has been accounted for
var(resid(res0)) - var(resid(res1))

# now take this value and divide it by how much we started out with; that way we
# get a proportion between 0 and 1
(var(resid(res0)) - var(resid(res1))) / var(resid(res0))

# so about 61% of the variance in Y has been accounted for by X; this is R^2

# as explained in the book, this is also identical to the square of the
# correlation between X and Y
cor(dat$sales, dat$TV)^2

############################################################################