############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-01-12
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 12.2.3 - 12.2.5
#
# last updated: 2023-01-13

############################################################################

### 12.2.3: The Proportion of Variance Explained

# copy the USArrests dataset to dat
dat <- USArrests

# dimensions of the dataset
dim(dat)

# examine the first 6 rows of the dataset
head(dat)

# standardize all 4 variables and put them into X
X <- apply(dat, 2, scale)

# after standardization, each variable has a variance of 1
apply(X, 2, var)

# so the total variance (when we standardize each variable) is simply p
sum(apply(X, 2, var))
ncol(X)

# this is (12.8) when all variables are standardized

# in the book, they are not assuming that the variables are standardized (only
# mean centered), but in many cases, we do standardize the variables before a
# PCA, so let's also focus on this situation

# do the PCA with prcomp()
res <- prcomp(X)
res

# the standard deviations squared are the variances of the PCs
res$sdev^2

# this is essentially what they are referring to in equation (12.9)

# and equation (12.10) is therefore just the variances of the PCs divided by the
# total variance to begin with
res$sdev^2 / ncol(X)

# and the sum of the variances of the PCs is equal to p
sum(res$sdev^2)

# therefore we can also write
res$sdev^2 / sum(res$sdev^2)

# compute the cumulative sum of these values
cumsum(res$sdev^2 / sum(res$sdev^2))

# so, the first PC explains about 62% of the total variance, the second PC
# explains about 25%, so the first and second PC together explain roughly 87%
# (and so on if we want to pay attention also to the third and fourth PC)

# try out the decomposition in equation (12.11)
M <- 2
phi <- res$rotation[,1:M]
z <- res$x[,1:M]
pred <- z %*% t(phi)
n <- 50
sum(res$sdev[1:M]^2) + 1/(n-1) * sum((X - pred)^2)
#                  #   #                         #
####################   ###########################
# Var of first 2 PCs   MSE of 2-dimensional approximation

# note that we have to compute the last term using 1/(n-1), because scale()
# standardized each variable using their variances (or really, their standard
# deviations) computed using 1/(n-1) and the variances of the PCs (given by
# res$sdev^2) are also computed using 1/(n-1)

# Figure 13.3
par(mfrow=c(1,2))
plot(1:ncol(X), res$sdev^2 / sum(res$sdev^2), type="o", col="blue",
     xlab="Principal Component", ylab="Prop. Variance Explained",
     ylim=c(0,1), xaxt="n", lwd=3)
axis(side=1, at=1:4)
plot(1:ncol(X), cumsum(res$sdev^2) / sum(res$sdev^2), type="o", col="blue",
     xlab="Principal Component", ylab="Cumulative Prop. Variance Explained",
     ylim=c(0,1), xaxt="n", lwd=3)
axis(side=1, at=1:4)
par(mfrow=c(1,1))

# note: a more 'traditional' scree plot just puts the variances of the PCs on
# the y-axis (https://en.wikipedia.org/wiki/Scree_plot)
plot(1:ncol(X), res$sdev^2, type="o", col="blue",
     xlab="Principal Component", ylab="Variance of the PCs", xaxt="n", lwd=3)
axis(side=1, at=1:4)

############################################################################

### 12.2.4: More on PCA

## Scaling the Variables

# all of the results obtained earlier actually used standardized variables,
# because the 4 variables were not measured in the same units (in particular,
# UrbanPop uses a different scale than the other three variables)

## Uniqueness of the Principal Components

# to illustrate that the signs can be flipped, do a PCA and keep 2 components

res <- prcomp(X)
res

M <- 2
phi <- res$rotation[,1:M]
z <- res$x[,1:M]
pred <- z %*% t(phi)
head(pred)

# now flip the sign of phi and z
phi <- -res$rotation[,1:M]
z <- -res$x[,1:M]
pred <- z %*% t(phi)
head(pred)

# the predicted values are exactly the same

## Deciding How Many Principal Components to Use

# we have already seen the scree plot earlier, but as noted in the book (and
# see also Wikipedia), there are criticisms of this approach for choosing the
# number of components to keep; an approach that is not mentioned in the book,
# but that could be considered state-of-the-art, is 'parallel analysis' (see:
# https://en.wikipedia.org/wiki/Parallel_analysis)

# in a parallel analysis, we reshuffle each variable independently of the
# other variables, which we can easily do with the sample() function, applying
# this to each column in X
X.s <- apply(X, 2, sample)

# in such a reshuffled dataset, any relationships between the variables are
# purely coincidental
cor(X.s)

# now we use these reshuffled data in a PCA
tmp <- prcomp(X.s)
tmp

# this tells us what kind of variances of the PCs we might expect to see if
# the variables are essentially unrelated to each other (or again, if there
# are only coincidental relationships between variables)

# we can repeat this process many times, saving the variances of the PCs from
# each iteration

set.seed(1234)

iters <- 1000

evmat <- matrix(NA, nrow=iters, ncol=ncol(X))

for (i in 1:iters) {
   X.s <- apply(X, 2, sample)
   tmp <- prcomp(X.s)
   evmat[i,] <- tmp$sdev^2
}

# and then we can see what the average value of these variances is
apply(evmat, 2, mean)

# we then compare the actual variances with these averages
res$sdev^2

# often this is done by drawing a scree plot and adding the mean variances to it
plot(1:ncol(X), res$sdev^2, type="o", col="blue",
     xlab="Principal Component", ylab="Variance of the PCs",
     xaxt="n", lwd=3)
axis(side=1, at=1:4)
lines(1:4, apply(evmat, 2, mean), type="o", col="red", lwd=3)
legend("topright", inset=.02, lwd=3, col=c("blue","red"),
       legend = c("PCs of the Actual Data", "PCs of the Reshuffled Data"))

# and only PCs with variances that are larger than the mean variances of the
# PCs from the reshuffled data are worth keeping; so in this case, the
# parallel analysis would suggest to only keep the very first principal
# component

# instead of doing this manually, we can use the 'psych' package for this

# install the 'psych' package (if needed)
#install.packages("psych")

# load the 'psych' package
library(psych)

# run a parallel analysis
fa.parallel(X, fa="pc", n.iter=1000, sim=FALSE)

############################################################################

### 12.2.5 Other Uses for Principal Components

# since we already did this in section 6.3.1, no need to repeat this here

############################################################################