############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-01-12 # # Topic(s): # - An Introduction to Statistical Learning (https://www.statlearning.com) # - Section(s): 12.2.3 - 12.2.5 # # last updated: 2023-01-13 ############################################################################ ### 12.2.3: The Proportion of Variance Explained # copy the USArrests dataset to dat dat <- USArrests # dimensions of the dataset dim(dat) # examine the first 6 rows of the dataset head(dat) # standardize all 4 variables and put them into X X <- apply(dat, 2, scale) # after standardization, each variable has a variance of 1 apply(X, 2, var) # so the total variance (when we standardize each variable) is simply p sum(apply(X, 2, var)) ncol(X) # this is (12.8) when all variables are standardized # in the book, they are not assuming that the variables are standardized (only # mean centered), but in many cases, we do standardize the variables before a # PCA, so let's also focus on this situation # do the PCA with prcomp() res <- prcomp(X) res # the standard deviations squared are the variances of the PCs res$sdev^2 # this is essentially what they are referring to in equation (12.9) # and equation (12.10) is therefore just the variances of the PCs divided by the # total variance to begin with res$sdev^2 / ncol(X) # and the sum of the variances of the PCs is equal to p sum(res$sdev^2) # therefore we can also write res$sdev^2 / sum(res$sdev^2) # compute the cumulative sum of these values cumsum(res$sdev^2 / sum(res$sdev^2)) # so, the first PC explains about 62% of the total variance, the second PC # explains about 25%, so the first and second PC together explain roughly 87% # (and so on if we want to pay attention also to the third and fourth PC) # try out the decomposition in equation (12.11) M <- 2 phi <- res$rotation[,1:M] z <- res$x[,1:M] pred <- z %*% t(phi) n <- 50 sum(res$sdev[1:M]^2) + 1/(n-1) * sum((X - pred)^2) # # # # #################### ########################### # Var of first 2 PCs MSE of 2-dimensional approximation # note that we have to compute the last term using 1/(n-1), because scale() # standardized each variable using their variances (or really, their standard # deviations) computed using 1/(n-1) and the variances of the PCs (given by # res$sdev^2) are also computed using 1/(n-1) # Figure 13.3 par(mfrow=c(1,2)) plot(1:ncol(X), res$sdev^2 / sum(res$sdev^2), type="o", col="blue", xlab="Principal Component", ylab="Prop. Variance Explained", ylim=c(0,1), xaxt="n", lwd=3) axis(side=1, at=1:4) plot(1:ncol(X), cumsum(res$sdev^2) / sum(res$sdev^2), type="o", col="blue", xlab="Principal Component", ylab="Cumulative Prop. Variance Explained", ylim=c(0,1), xaxt="n", lwd=3) axis(side=1, at=1:4) par(mfrow=c(1,1)) # note: a more 'traditional' scree plot just puts the variances of the PCs on # the y-axis (https://en.wikipedia.org/wiki/Scree_plot) plot(1:ncol(X), res$sdev^2, type="o", col="blue", xlab="Principal Component", ylab="Variance of the PCs", xaxt="n", lwd=3) axis(side=1, at=1:4) ############################################################################ ### 12.2.4: More on PCA ## Scaling the Variables # all of the results obtained earlier actually used standardized variables, # because the 4 variables were not measured in the same units (in particular, # UrbanPop uses a different scale than the other three variables) ## Uniqueness of the Principal Components # to illustrate that the signs can be flipped, do a PCA and keep 2 components res <- prcomp(X) res M <- 2 phi <- res$rotation[,1:M] z <- res$x[,1:M] pred <- z %*% t(phi) head(pred) # now flip the sign of phi and z phi <- -res$rotation[,1:M] z <- -res$x[,1:M] pred <- z %*% t(phi) head(pred) # the predicted values are exactly the same ## Deciding How Many Principal Components to Use # we have already seen the scree plot earlier, but as noted in the book (and # see also Wikipedia), there are criticisms of this approach for choosing the # number of components to keep; an approach that is not mentioned in the book, # but that could be considered state-of-the-art, is 'parallel analysis' (see: # https://en.wikipedia.org/wiki/Parallel_analysis) # in a parallel analysis, we reshuffle each variable independently of the # other variables, which we can easily do with the sample() function, applying # this to each column in X X.s <- apply(X, 2, sample) # in such a reshuffled dataset, any relationships between the variables are # purely coincidental cor(X.s) # now we use these reshuffled data in a PCA tmp <- prcomp(X.s) tmp # this tells us what kind of variances of the PCs we might expect to see if # the variables are essentially unrelated to each other (or again, if there # are only coincidental relationships between variables) # we can repeat this process many times, saving the variances of the PCs from # each iteration set.seed(1234) iters <- 1000 evmat <- matrix(NA, nrow=iters, ncol=ncol(X)) for (i in 1:iters) { X.s <- apply(X, 2, sample) tmp <- prcomp(X.s) evmat[i,] <- tmp$sdev^2 } # and then we can see what the average value of these variances is apply(evmat, 2, mean) # we then compare the actual variances with these averages res$sdev^2 # often this is done by drawing a scree plot and adding the mean variances to it plot(1:ncol(X), res$sdev^2, type="o", col="blue", xlab="Principal Component", ylab="Variance of the PCs", xaxt="n", lwd=3) axis(side=1, at=1:4) lines(1:4, apply(evmat, 2, mean), type="o", col="red", lwd=3) legend("topright", inset=.02, lwd=3, col=c("blue","red"), legend = c("PCs of the Actual Data", "PCs of the Reshuffled Data")) # and only PCs with variances that are larger than the mean variances of the # PCs from the reshuffled data are worth keeping; so in this case, the # parallel analysis would suggest to only keep the very first principal # component # instead of doing this manually, we can use the 'psych' package for this # install the 'psych' package (if needed) #install.packages("psych") # load the 'psych' package library(psych) # run a parallel analysis fa.parallel(X, fa="pc", n.iter=1000, sim=FALSE) ############################################################################ ### 12.2.5 Other Uses for Principal Components # since we already did this in section 6.3.1, no need to repeat this here ############################################################################