############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-12-13
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 12.2.1 - 12.2.2
#
# last updated: 2023-01-13

############################################################################

### 12.2.1: What Are Principal Components?

# to illustrate the idea of constructing the first principal component, let's
# simulate the simplest case where we only have two variables (that are
# correlated to some extent); we will also standardize the two variables

set.seed(1235)
n <- 100
x1 <- scale(rnorm(n))[,1]
x2 <- scale(0.8 * x1 + rnorm(n))[,1]

# correlation between x1 and x2
cor(x1, x2)

# plot the data
plot(x1, x2, pch=19, xlim=range(c(x1,x2)), ylim=range(c(x1,x2)))

# try out some linear combinations that fulfill the restriction that the sum
# of the squared coefficients is equal to 1 and compute the corresponding
# variance of the linear combination

phi1 <- c(sqrt(.5), sqrt(.5))
sum(phi1^2)
var(phi1[1]*x1 + phi1[2]*x2)

phi1 <- c(.8, .6)
sum(phi1^2)
var(phi1[1]*x1 + phi1[2]*x2)

phi1 <- c(.6, .8)
sum(phi1^2)
var(phi1[1]*x1 + phi1[2]*x2)

phi1 <- c(.6, -.8)
sum(phi1^2)
var(phi1[1]*x1 + phi1[2]*x2)

# now we want to find the two coefficients that maximize the variance of the
# linear combination; let's do this by trial and error by computing the
# variance for a very large number of linear combinations we can form

phi11 <- seq(-1, 1, length=10000)
varcomb1 <- rep(NA_real_, length(phi11))
varcomb2 <- rep(NA_real_, length(phi11))

for (i in 1:length(phi11)) {
   phi21 <- sqrt(1 - phi11[i]^2) # this must be the value of phi21 given phi11
   varcomb1[i] <- var(phi11[i] * x1 + phi21 * x2) # variance of the linear combination
   phi21 <- -1*phi21 # but phi21 could also be negative
   varcomb2[i] <- var(phi11[i] * x1 + phi21 * x2) # variance of the linear combination
}

# plot phi11 against the variance of the linear combination
plot(phi11, varcomb1, type="l", lwd=3, ylab="Variance of Linear Combination")
lines(phi11, varcomb2, lwd=3, lty="dotted")

# find the value of phi11 for which the variance is as large as possible
phi11 <- phi11[which.max(varcomb1)]

# compute the corresponding value of phi21
phi21 <- sqrt(1 - phi11^2)

# print the values
c(phi11, phi21)

# note: there are technically two possibilities; either phi11 and phi21 are
# both positive or they are both negative (which would give the same
# maximized variance, as shown in the plot above); so in essence, the signs
# of the coefficients could be flipped

# add a vertical line at phi11 and a horizontal line at the maximum variance
abline(v=phi11, lty="dotted")
abline(h=max(varcomb1), lty="dotted")

# compute the first principal component (PC)
z1 <- phi11 * x1 + phi21 * x2
z1

# this newly derived variable has the largest possible variance
var(z1)

# in practice, we wouldn't want to find the 'loadings' by hand this way

# while in principle, one could formulate the problem of finding the loadings
# as an optimization problem as shown in (12.3), this is actually slightly
# tricky, since this is a non-convex optimization problem; however, it turns
# out that we don't need to do this via optimization at all, since we can use
# an eigenvalue-eigenvector decomposition to find them
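# as a small aside (this sketch is not part of the original script): in the
# two-variable case, the optimization problem in (12.3) can still be solved
# numerically; since the constraint phi11^2 + phi21^2 = 1 describes the unit
# circle, one way to handle it is to write the loadings as (cos(theta),
# sin(theta)) and optimize over the single angle theta (this angle
# parameterization is just one possible choice); optim() minimizes, so we
# minimize the negative of the variance

negvarfun <- function(theta, x1, x2)
   -var(cos(theta)*x1 + sin(theta)*x2)

sketch <- optim(0, negvarfun, x1=x1, x2=x2, method="Brent", lower=-pi, upper=pi)

# the implied loadings (the signs may be flipped) and the maximized variance
c(cos(sketch$par), sin(sketch$par))
-sketch$value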
# put x1 and x2 into matrix X
X <- cbind(x1, x2)

# then var(X) gives us the variance-covariance matrix of the two variables
var(X)

# since x1 and x2 are standardized, var(X) is actually the correlation matrix
cor(X)

# now do the eigenvalue-eigenvector decomposition of this correlation matrix
evd <- eigen(cor(X))
evd

# show that cor(X) is identical to the eigenvector matrix times the diagonal
# eigenvalue matrix times the inverse of the eigenvector matrix
cor(X)
evd$vectors %*% diag(evd$values) %*% solve(evd$vectors)

# note that the first column of evd$vectors contains the loadings for the
# first PC (these values are slightly different from the values we found
# earlier using trial and error because that approach is not as accurate)

# recompute the first PC based on the more accurate loadings from eigen()
phi1 <- evd$vectors[,1]
z1 <- phi1[1] * x1 + phi1[2] * x2
z1

# the second column of evd$vectors contains the loadings for the second PC
phi2 <- evd$vectors[,2]
z2 <- phi2[1] * x1 + phi2[2] * x2
z2

# double-check that the sums of the squared loadings for the two PCs are 1
sum(phi1^2)
sum(phi2^2)

# check that the first and second PC are uncorrelated
cor(z1, z2)

# note: the eigenvalues are identical to the variances of the two PCs
evd$values
c(var(z1), var(z2))

# plot the original data again and next to it the PCs
par(mfrow=c(1,2), pty="s")
plot(x1, x2, pch=19, xlim=range(c(x1,x2)), ylim=range(c(x1,x2)))
abline(a=0, b=evd$vectors[1,1] / evd$vectors[2,1], lty="dotted")
abline(a=0, b=evd$vectors[1,2] / evd$vectors[2,2], lty="dotted")
plot(z1, z2, pch=19, xlim=range(c(z1,z2)), ylim=range(c(z1,z2)))
abline(h=0, lty="dotted")
abline(v=0, lty="dotted")

############################################################################

# as in the book, we will now use the USArrests dataset for a PCA

# copy the USArrests dataset to dat
dat <- USArrests

# dimensions of the dataset
dim(dat)

# examine the first 6 rows of the dataset
head(dat)

# standardize all 4 variables and put them into X
X <- apply(dat, 2, scale)

# add the states as row names to X
rownames(X) <- rownames(dat)

# correlation matrix
cor(X)

# get the eigenvalues and eigenvectors
evd <- eigen(cor(X))
evd

# the columns in evd$vectors give the loadings for the 4 PCs; note that the
# signs of the eigenvectors we obtain above may actually be flipped compared
# to what is shown in Table 12.1 in the book (this can depend on your CPU and
# the linear algebra routines that R uses)

# to ensure that the signs are the same as in the book, check the sign of the
# element in position (1,1); if it is negative, flip the sign of the entire
# eigenvector matrix
if (evd$vectors[1,1] < 0)
   evd$vectors <- -1 * evd$vectors

# put the eigenvectors into a separate object
evec <- evd$vectors

# now compute the first and second PC
z1 <- c(X %*% evec[,1])
z2 <- c(X %*% evec[,2])

# plot the first PC against the second PC
par(mfrow=c(1,1), mar=c(5,5,3,3))
plot(z1, z2, cex=0, xlim=range(c(z1,z2)), ylim=range(c(z1,z2)),
     xlab="First Principal Component", ylab="Second Principal Component")
abline(h=0, lty="dotted")
abline(v=0, lty="dotted")
text(z1, z2, rownames(dat), col="dodgerblue")

# add arrows for the loadings on the first and second PC
par(usr=c(-1.2,1.2,-1.2,1.2))
arrows(0, 0, evec[,1], evec[,2], col="orange", lwd=2)
axis(side=3, col.axis="orange", col.ticks="orange")
axis(side=4, col.axis="orange", col.ticks="orange")
text(1.25*evec[,1], 1.25*evec[,2], colnames(dat), col="orange", cex=1.2, font=2)

# create a loading plot (a separate plot of the loadings instead of adding
# them to the 'biplot' above)
plot(evec[,1], evec[,2], pch=19, xlim=c(-1,1), ylim=c(-1,1),
     xlab="PC1 Loading", ylab="PC2 Loading")
text(evec[,1], evec[,2], colnames(dat), pos=4)
abline(h=0, lty="dotted")
abline(v=0, lty="dotted")
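# before moving on, a quick sanity check (this check is not in the original
# script): all four PCs can be computed in one step via matrix
# multiplication; the variance-covariance matrix of the resulting scores
# should then be diagonal, with the eigenvalues along the diagonal

Z <- X %*% evec
round(cov(Z), 6)
evd$values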
############################################################################

# in practice, we don't want to do a PCA by hand as we did above; R comes
# with the princomp() and prcomp() functions for doing a PCA

# use princomp() to do a PCA
res <- princomp(X, cor=TRUE)
res

# note: since X is already standardized, we could leave out cor=TRUE

# print the loadings (note: by default, loadings below 0.1 (in absolute
# value) are not shown; setting 'cutoff' to 0 shows all loadings)
print(loadings(res), digits=7, cutoff=0)

# show the first 6 rows of the PCs
head(res$scores)

# use the prcomp() function to do a PCA
res <- prcomp(X, center=TRUE, scale.=TRUE)
res

# note: since X is already standardized, we don't need to set scale.=TRUE
# (and in fact, center=TRUE by default anyway, so we can also leave this out)
res <- prcomp(X)
res

# note that the loadings have flipped signs again; let's flip their signs
res$rotation <- -1 * res$rotation

# element 'x' contains the actual PCs; flip their signs as well
res$x <- -1 * res$x

# print again
res

# show the first 6 rows of the PCs
head(res$x)

# create the biplot
biplot(res, col=c("dodgerblue","orange"), scale=0)

# note: the values of the PCs are slightly different when using princomp()
# versus prcomp(); this has to do with how variances are computed in the two
# functions: prcomp() uses the divisor N-1, while princomp() uses N; also, as
# noted in help(prcomp), this function uses a singular value decomposition
# for getting the loadings, which is "the preferred method for numerical
# accuracy"

# the 'psych' package also provides a function for doing a PCA

# install the 'psych' package (if needed)
#install.packages("psych")

# load the 'psych' package
library(psych)

# use psych::principal() to do a PCA
res <- principal(X, nfactors=ncol(X), rotate="none")
res

# note that the values shown are 'standardized loadings', that is, these are
# the loadings rescaled by the square roots of the eigenvalues; we can undo
# this as follows
t(t(res$loadings[,]) / sqrt(res$values))

# show the first 6 rows of the PCs
head(res$scores)

# again, these are different from the ones we found earlier, because these
# are standardized PC scores; we can undo this as follows
res$scores <- t(t(res$scores) * sqrt(res$values))
head(res$scores)

############################################################################

### 12.2.2: Another Interpretation of Principal Components

# say M = 2 and we want to compute the approximate values of the 'Murder'
# variable based on the first two PCs; pull out the loading matrix and the
# principal component scores from 'res' and use the same notation as in the
# book
res <- prcomp(X)
phi <- res$rotation[,1:2]
z <- res$x[,1:2]
pred.murder <- z[,1] * phi[1,1] + z[,2] * phi[1,2]
pred.murder

# plot the 'predicted' against the actual values for this variable
plot(pred.murder, X[,"Murder"], pch=19, xlim=c(-2,2.5), ylim=c(-2,2.5),
     xlab="Predicted Values Based on PC1 and PC2", ylab="Actual Value")

# predict the values of all 4 variables based on the first two PCs
pred <- z %*% t(phi)

# compute the squared distance between the actual and predicted value for
# each variable, sum up these squared distances over the states for each
# variable, and then sum these sums over the 4 variables
sum(apply((X - pred)^2, 2, sum))

# this is identical to just taking the difference between each element in X
# and pred, squaring these differences, and summing them all up
sum((X - pred)^2)

# the scores and loadings for the first two PCs minimize this value
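# as a quick check of this interpretation (this check is not in the original
# script): with M = p = 4, that is, when using all four PCs, the
# reconstruction of X is exact up to numerical noise, so the discrepancy
# above is entirely due to leaving out the last two PCs
pred.full <- res$x %*% t(res$rotation)
max(abs(X - pred.full))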
# try to obtain the scores and loadings by framing this as an optimization
# problem as shown in (12.6)

optfun <- function(par, X) {
   z <- matrix(par[1:100], nrow=50, ncol=2)
   phi <- matrix(par[101:108], nrow=4, ncol=2)
   pred <- z %*% t(phi)
   sum(apply((X - pred)^2, 2, sum))
}

opt <- nlminb(rep(1,108), optfun, X=X)
opt

# note that the value of the objective function is the same as above

# get the scores and loadings from 'opt'
z.opt <- matrix(opt$par[1:100], nrow=50, ncol=2)
phi.opt <- matrix(opt$par[101:108], nrow=4, ncol=2)

# predict the values of all 4 variables based on the first two PCs from 'opt'
pred.opt <- z.opt %*% t(phi.opt)
rownames(pred.opt) <- rownames(dat)
colnames(pred.opt) <- colnames(dat)

# compare the two sets of predicted values (they are not exactly identical
# because the optimization approach introduces some slight inaccuracies, but
# this actually works)
head(pred)
head(pred.opt)

# note: the values of z.opt and phi.opt do not match up with z and phi
# because there is no unique solution (see also footnote 4), but as we see
# above, the predicted values (or the 'projections' of the original data onto
# the plane defined by the first two PCs that best fits the data) are the
# same

############################################################################

# the optimization approach above can be simplified, since the principal
# component scores in z are a function of phi (the loadings) and X; so we do
# not really need to optimize over the scores

optfun <- function(par, X) {
   phi <- matrix(par[1:8], nrow=4, ncol=2)
   z <- X %*% phi
   pred <- z %*% t(phi)
   sum(apply((X - pred)^2, 2, sum))
}

opt <- nlminb(rep(1,8), optfun, X=X)
opt

phi.opt <- matrix(opt$par, nrow=4, ncol=2)
z.opt <- X %*% phi.opt
pred.opt <- z.opt %*% t(phi.opt)
rownames(pred.opt) <- rownames(dat)
colnames(pred.opt) <- colnames(dat)
head(pred)
head(pred.opt)

# this again gives us the same predicted values
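# to back up the premise behind this simplification with the prcomp() results
# themselves (this check is not in the original script): the scores returned
# by prcomp() are indeed just X multiplied with the loading matrix
all.equal(res$x, X %*% res$rotation, check.attributes=FALSE)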
############################################################################

# the above suggests that we may be able to use the same optimization
# approach to conduct a PCA in general; so instead of restricting the
# optimization to M (with M < p) PCs, we could also do the optimization over
# all p PCs and try to find their loadings; but we also need some
# restrictions, namely that the sum of the squared loadings for each PC is
# equal to 1 and that the variance of the scores on the first PC is as large
# as possible, followed by making the variances of the scores on the other
# PCs as large as possible under the restriction that the PC scores are
# uncorrelated; after some trial and error, it appears that it is sufficient
# to check for the uncorrelatedness of the scores

optfun <- function(par, X) {
   p <- ncol(X)
   phi <- matrix(par[1:p^2], nrow=p, ncol=p)
   z <- X %*% phi
   pred <- z %*% t(phi)
   # the first part of this sum should be zero, since the difference between
   # the observed and predicted values must be zero, and the second part
   # should be zero, since the correlation matrix of the principal component
   # scores should be an identity matrix
   sum(apply((X - pred)^2, 2, sum)) + sum((cor(z) - diag(p))^2)
}

# do the PCA (and flip the signs again) and extract the loadings for all 4 PCs
res <- prcomp(X)
res$rotation <- -1 * res$rotation
res$x <- -1 * res$x
phi <- res$rotation
z <- res$x

# now try to do the same via the optimization approach above
p <- ncol(X)
opt <- nlminb(rep(1,p^2), optfun, X=X)
opt

# extract the estimates (loadings) and compute the PC scores
phi.opt <- matrix(opt$par[1:p^2], nrow=p, ncol=p)
z.opt <- X %*% phi.opt

# reorder the columns based on the variances of the PC scores
phi.opt <- phi.opt[,order(apply(z.opt, 2, var), decreasing=TRUE)]
z.opt <- X %*% phi.opt

# add dimension names to phi.opt and z.opt
dimnames(phi.opt) <- dimnames(phi)
colnames(z.opt) <- colnames(z)

# compare the results
phi
phi.opt

# compare the variances of the PC scores
apply(z, 2, var)
apply(z.opt, 2, var)

# check that the correlations are 0
round(cor(z), 6)
round(cor(z.opt), 6)

# check that the sums of the squared loadings are equal to 1
apply(phi, 2, function(x) sum(x^2))
apply(phi.opt, 2, function(x) sum(x^2))

# this works! it is not clear how well this generalizes, but I tried this out
# for several different datasets and it worked for all of them (sometimes I
# had to increase the number of iterations to obtain convergence)

############################################################################
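# as a small illustration of the generalization claim above (this example is
# not part of the original script): we can try the same approach on another
# standardized dataset, say the four numeric variables in the iris dataset;
# note that this is a non-convex problem, so the optimization need not reach
# the global optimum and may require raising the iteration limits

Xi <- scale(iris[,1:4])
p.i <- ncol(Xi)
opt.i <- nlminb(rep(1,p.i^2), optfun, X=Xi,
                control=list(iter.max=10000, eval.max=10000))
phi.i <- matrix(opt.i$par, nrow=p.i, ncol=p.i)
z.i <- Xi %*% phi.i
phi.i <- phi.i[,order(apply(z.i, 2, var), decreasing=TRUE)]

# if the optimization converged to the right solution, the variances of the
# PC scores should match those from prcomp()
apply(Xi %*% phi.i, 2, var)
prcomp(Xi)$sdev^2

############################################################################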