############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By: Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-04-07
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 4.4.3 - 4.6.1
#
# last updated: 2022-04-29
############################################################################

### 4.4.3: Quadratic Discriminant Analysis

# install the ISLR2 package
#install.packages("ISLR2")

# load the ISLR2 package
library(ISLR2)

# copy the Default dataset to dat
dat <- Default

# load the MASS package
library(MASS)

# look at the documentation of the qda() function
help(qda)

# do LDA using balance and student as predictors for 'default' (yes/no)
res1 <- lda(default ~ balance + student, data=dat)
res1

# do QDA using balance and student as predictors for 'default' (yes/no)
res2 <- qda(default ~ balance + student, data=dat)
res2

# get predictions based on LDA and QDA for the training data
pred1 <- predict(res1)
pred2 <- predict(res2)

# confusion matrices for LDA and QDA
addmargins(table(pred1$class, dat$default), margin=c(1,2))
addmargins(table(pred2$class, dat$default), margin=c(1,2))

# error rate of LDA and QDA in the training data
100 * mean(pred1$class != dat$default)
100 * mean(pred2$class != dat$default)

# install and load the pROC package
#install.packages("pROC")
library(pROC)

# draw the ROC curve based on LDA
sav1 <- roc(default ~ pred1$posterior[,"Yes"], data=dat)
plot(sav1, lwd=3, col="blue")

# add the curve for QDA to the plot
sav2 <- roc(default ~ pred2$posterior[,"Yes"], data=dat)
plot(sav2, lwd=3, add=TRUE, col="red")

# add legend
legend("top", inset=.06, lty="solid", col=c("blue","red"), lwd=3,
       legend=c("LDA","QDA"), bty="n")

# compute the area under the curve for both methods
auc(sav1)
auc(sav2)

############################################################################

### 4.4.4: Naive Bayes

# install and load the mvtnorm package
#install.packages("mvtnorm")
library(mvtnorm)

# compute the density for p=2 predictors where x1=1 and x2=2 for a bivariate
# normal distribution with true means equal to 0 for both variables, true
# variances equal to 1 for both variables, and true correlation equal to 0.7
dmvnorm(c(1,2), mean=c(0,0), sigma=matrix(c(1,0.7,0.7,1), nrow=2, ncol=2))

# same, but now assume the true means are 0 and 2 for the two variables
dmvnorm(c(1,2), mean=c(0,2), sigma=matrix(c(1,0.7,0.7,1), nrow=2, ncol=2))

# show for the case of a bivariate normal distribution that f(x1,x2) is equal
# to f(x1) * f(x2) when the two variables are independent (i.e., this is a
# demonstration of equation 4.29)
dmvnorm(c(1,2), mean=c(0,0), sigma=matrix(c(1,0,0,1), nrow=2, ncol=2))
dnorm(1, mean=0, sd=1) * dnorm(2, mean=0, sd=1)
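
# as a quick check of what dmvnorm() is computing, we can also evaluate the
# bivariate normal density 'by hand' using the standard formula
# f(x) = 1/(2*pi*sqrt(det(Sigma))) * exp(-(x-mu)' Sigma^-1 (x-mu) / 2);
# the values of x, mu, and Sigma below simply mirror the first example above
x     <- c(1,2)
mu    <- c(0,0)
Sigma <- matrix(c(1,0.7,0.7,1), nrow=2, ncol=2)
1 / (2*pi*sqrt(det(Sigma))) * exp(-0.5 * t(x - mu) %*% solve(Sigma) %*% (x - mu))
dmvnorm(x, mean=mu, sigma=Sigma)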
############################################################################

# simulate data like in the example on page 156

set.seed(1234)

n1 <- 10000
n2 <- 10000

x11 <- rnorm(n1, mean=0, sd=1)
x12 <- rnorm(n1, mean=2, sd=0.5)
x13 <- sample(1:3, n1, replace=TRUE, prob=c(4/18, 6/18, 8/18))

x21 <- rnorm(n2, mean=-1.8, sd=0.75)
x22 <- rnorm(n2, mean=0, sd=1)
x23 <- sample(1:3, n2, replace=TRUE, prob=c(11/18, 1/18, 6/18))

# histograms of variables x1 and x2 and barplot of x3 in each class (Figure 4.10)
par(mfrow=c(2,3))
hist(x11, xlab="", xlim=c(-4,4), breaks=seq(-5,5,by=.5), col="seagreen3")
hist(x12, xlab="", xlim=c(-4,4), breaks=seq(-5,5,by=.5), col="skyblue2")
barplot(table(x13), col="darkorange2", main="Barplot of x13", space=2)
hist(x21, xlab="", xlim=c(-4,4), breaks=seq(-5,5,by=.5), col="seagreen3")
hist(x22, xlab="", xlim=c(-4,4), breaks=seq(-5,5,by=.5), col="skyblue2")
barplot(table(x23), col="darkorange2", main="Barplot of x23", space=2)
par(mfrow=c(1,1))

# compute pi_1 * f_1(x1=0.4) * f_1(x2=1.5) * f_1(x3=1) (for class 1)
f1 <- n1 / (n1 + n2) * dnorm(0.4, mean=mean(x11), sd=sd(x11)) *
      dnorm(1.5, mean=mean(x12), sd=sd(x12)) * mean(x13 == 1)

# compute pi_2 * f_2(x1=0.4) * f_2(x2=1.5) * f_2(x3=1) (for class 2)
f2 <- n2 / (n1 + n2) * dnorm(0.4, mean=mean(x21), sd=sd(x21)) *
      dnorm(1.5, mean=mean(x22), sd=sd(x22)) * mean(x23 == 1)

# posterior probability for class 1 and class 2
f1 / (f1 + f2)
f2 / (f1 + f2)

# note: cannot reproduce the values on page 156 exactly, since we do not know
# the exact parameter values of the distributions from which the authors
# simulated the data (and there would also be some discrepancy simply due to
# chance)

############################################################################

# naive Bayes for the 'Default' dataset

# copy the Default dataset to dat
dat <- Default

# do LDA using balance and student as predictors for 'default' (yes/no)
res <- lda(default ~ balance + student, data=dat)
res

# confusion matrix for LDA (Table 4.4)
pred <- predict(res)
addmargins(table(pred$class, dat$default), margin=c(1,2))

# function to do naive Bayes for this example
nb <- function(x1, x2, x3, outcome, balance, student, income) {

   f.y <- mean(outcome == "Yes") *
          dnorm(x1, mean=mean(balance[outcome == "Yes"]), sd=sd(balance[outcome == "Yes"])) *
          mean(student[outcome == "Yes"] == x2) *
          dnorm(x3, mean=mean(income[outcome == "Yes"]), sd=sd(income[outcome == "Yes"]))

   f.n <- mean(outcome == "No") *
          dnorm(x1, mean=mean(balance[outcome == "No"]), sd=sd(balance[outcome == "No"])) *
          mean(student[outcome == "No"] == x2) *
          dnorm(x3, mean=mean(income[outcome == "No"]), sd=sd(income[outcome == "No"]))

   post.y <- f.y / (f.y + f.n)
   post.n <- f.n / (f.y + f.n)

   return(c(post.y, post.n))

}

head(dat)

# posterior probabilities based on naive Bayes for the first subject
nb(x1=729.5265, x2="No", x3=44361.625, outcome=dat$default,
   balance=dat$balance, student=dat$student, income=dat$income)

# compute posterior probabilities for all subjects
dat$post.y <- NA
dat$post.n <- NA

for (i in 1:nrow(dat)) {
   sav <- nb(x1=dat$balance[i], x2=dat$student[i], x3=dat$income[i],
             outcome=dat$default, balance=dat$balance,
             student=dat$student, income=dat$income)
   dat$post.y[i] <- sav[1]
   dat$post.n[i] <- sav[2]
}

dat$pred <- ifelse(dat$post.y > 0.5, "Yes", "No")

head(dat)

# confusion matrix (Table 4.8)
addmargins(table(dat$pred, dat$default), margin=c(1,2))

# error rate
100 * mean(dat$pred != dat$default)

# note: in LDA (Table 4.4), the authors only used balance and student as
# predictors, but for applying naive Bayes, they also used income

# also add income for LDA
res <- lda(default ~ balance + student + income, data=dat)
pred <- predict(res)
addmargins(table(pred$class, dat$default), margin=c(1,2))
100 * mean(pred$class != dat$default)

# use a threshold of 0.2 for classifying when using naive Bayes (Table 4.9)
dat$pred <- ifelse(dat$post.y > 0.2, "Yes", "No")
addmargins(table(dat$pred, dat$default), margin=c(1,2))
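
# as a cross-check (assuming the e1071 package is available), the naiveBayes()
# function from that package fits the same kind of model (Gaussian densities
# for the numeric predictors, empirical frequencies for student); its posterior
# probabilities should closely match the ones computed by hand above
#install.packages("e1071")
library(e1071)
res.nb  <- naiveBayes(default ~ balance + student + income, data=dat)
pred.nb <- predict(res.nb, newdata=dat, type="raw")
head(pred.nb)

# confusion matrix based on naiveBayes() with a 0.5 threshold
pred.class <- ifelse(pred.nb[,"Yes"] > 0.5, "Yes", "No")
addmargins(table(pred.class, dat$default), margin=c(1,2))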
############################################################################

### 4.5: A Comparison of Classification Methods

### 4.5.1: An Analytical Comparison

# this section is theoretical, so no code here

### 4.5.2: An Empirical Comparison

# let's try to replicate the results for scenario 1; note: we will leave out
# KNN-CV, since cross-validation has not been covered in the book yet

set.seed(1234)

# simulate the test data

n1.test <- 10000
n0.test <- 10000

x11.test <- rnorm(n1.test, 1, 1)
x12.test <- rnorm(n1.test, 1, 1)
x01.test <- rnorm(n0.test, 0, 1) # set sd to 2 for scenario 6
x02.test <- rnorm(n0.test, 0, 1) # set sd to 2 for scenario 6

y.test <- c(rep(1, n1.test), rep(0, n0.test))
X.test <- data.frame(x1 = c(x11.test, x01.test), x2 = c(x12.test, x02.test))

n1 <- 10 # set to 6 for scenario 6
n0 <- 10 # set to 6 for scenario 6

iter <- 100

errorrate.knn <- rep(NA, iter)
errorrate.lda <- rep(NA, iter)
errorrate.glm <- rep(NA, iter)
errorrate.nba <- rep(NA, iter)
errorrate.qda <- rep(NA, iter)

# function to do KNN-1
knn <- function(x, x1, x2, y) {
   d <- sqrt((x[1] - x1)^2 + (x[2] - x2)^2)
   y[which.min(d)]
}

# function for naive Bayes with two predictors
nb <- function(x1.test, x2.test, x1, x2, y) {

   f.0 <- mean(y == 0) *
          dnorm(x1.test, mean=mean(x1[y == 0]), sd=sd(x1[y == 0])) *
          dnorm(x2.test, mean=mean(x2[y == 0]), sd=sd(x2[y == 0]))

   f.1 <- mean(y == 1) *
          dnorm(x1.test, mean=mean(x1[y == 1]), sd=sd(x1[y == 1])) *
          dnorm(x2.test, mean=mean(x2[y == 1]), sd=sd(x2[y == 1]))

   post.0 <- f.0 / (f.1 + f.0)
   post.1 <- f.1 / (f.1 + f.0)

   return(cbind(post.0, post.1))

}

for (i in 1:iter) {

   # simulate the training data
   x11 <- rnorm(n1, 1, 1)
   x12 <- rnorm(n1, 1, 1)
   x01 <- rnorm(n0, 0, 1) # set sd to 2 for scenario 6
   x02 <- rnorm(n0, 0, 1) # set sd to 2 for scenario 6
   y <- c(rep(1, n1), rep(0, n0))
   x1 <- c(x11, x01)
   x2 <- c(x12, x02)

   # KNN-1
   pred <- apply(X.test, 1, function(x) knn(x, x1=x1, x2=x2, y=y))
   errorrate.knn[i] <- mean(pred != y.test)

   # LDA
   res <- lda(y ~ x1 + x2)
   pred <- predict(res, newdata=X.test)
   errorrate.lda[i] <- mean(pred$class != y.test)

   # logistic regression model
   res <- glm(y ~ x1 + x2, family=binomial)
   pred <- predict(res, newdata=X.test, type="response")
   errorrate.glm[i] <- mean(ifelse(pred > 0.5, 1, 0) != y.test)

   # naive Bayes
   pred <- nb(X.test[,1], X.test[,2], x1=x1, x2=x2, y=y)
   errorrate.nba[i] <- mean(ifelse(pred[,2] > 0.5, 1, 0) != y.test)

   # QDA
   res <- qda(y ~ x1 + x2)
   pred <- predict(res, newdata=X.test)
   errorrate.qda[i] <- mean(pred$class != y.test)

}

# show boxplots of the 100 error rates for each method (Figure 4.11, left panel)
boxplot(cbind(errorrate.knn, errorrate.lda, errorrate.glm, errorrate.nba, errorrate.qda),
        names=c("KNN-1", "LDA", "Logistic", "NBayes", "QDA"),
        col=c("skyblue", "seagreen", "seagreen", "pink", "darkorange"))
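
# a quick numerical summary of the simulation results (this is not part of
# Figure 4.11): mean and median test error rate (in percent) for each method
errorrates <- cbind(errorrate.knn, errorrate.lda, errorrate.glm, errorrate.nba, errorrate.qda)
round(100 * apply(errorrates, 2, mean), digits=1)
round(100 * apply(errorrates, 2, median), digits=1)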
############################################################################

### 4.6: Generalized Linear Models

# look at the help file for the Bikeshare dataset
help(Bikeshare)

# copy the Bikeshare dataset to dat
dat <- Bikeshare

# outcome:                bikers
# categorical predictors: mnth, hr, weathersit
# dichotomous predictor:  workingday
# quantitative predictor: temp

### 4.6.1: Linear Regression on the Bikeshare Data

# fit regression model (Table 4.10)
res <- lm(bikers ~ workingday + temp + weathersit + relevel(mnth, ref="Dec") + hr, data=dat)
round(coef(summary(res))[1:6,], digits=2)

# note: the authors do not indicate which level they made the reference level
# for the mnth and hr predictors; it seems like they used Dec(ember) for mnth,
# but for hr none of the levels yields the intercept as shown in Table 4.10

# plot the coefficients for mnth (Figure 4.13, left panel)
b <- coef(res)
b <- c(b[grep("mnth", names(b))], 0)
plot(b, type="o", pch=19, col="blue", xlab="Month", ylab="Coefficient", xaxt="n")
axis(side=1, at=1:12, labels=substr(levels(dat$mnth), 1, 1))
abline(h=0, lty="dotted")

# plot the coefficients for hr (Figure 4.13, right panel)
b <- coef(res)
b <- c(0, b[grep("hr", names(b))])
plot(b, type="o", pch=19, col="blue", xlab="Hour", ylab="Coefficient", xaxt="n")
axis(side=1, at=1:24, labels=0:23)
abline(h=0, lty="dotted")

# note: the reference level gets a coefficient of 0; since we do not know how
# the authors chose the reference level for hr, the y-axis values are not the
# same as in the figure, but the pattern is exactly the same

# addendum: the coding the authors used is explained in section 4.7.7

# percentage of fitted values that are below 0
round(100 * mean(predict(res) < 0), digits=1)

# plot of hr (jittered) versus bikers
plot(bikers ~ jitter(as.numeric(as.character(dat$hr))), data=dat, pch=19,
     cex=0.2, col="skyblue2", xlab="Hour", ylab="Number of Bikers")

# to diagnose potential problems with the regression model, we can use the
# built-in diagnostic plots that R provides
par(mfrow=c(2,2))
plot(res, cex=0.2, pch=19, col="skyblue2")
par(mfrow=c(1,1))

# plot of hr (jittered) versus log(bikers)
plot(log(bikers) ~ jitter(as.numeric(as.character(dat$hr))), data=dat, pch=19,
     cex=0.2, col="skyblue2", xlab="Hour", ylab="Log(Number of Bikers)")

# fit regression model with log(bikers) as the outcome
res <- lm(log(bikers) ~ workingday + temp + weathersit + relevel(mnth, ref="Dec") + hr, data=dat)
round(coef(summary(res))[1:6,], digits=2)

# diagnostic plots
par(mfrow=c(2,2))
plot(res, cex=0.2, pch=19, col="skyblue2")
par(mfrow=c(1,1))

# by modeling log(bikers), the predicted (average) number of bikers is always
# going to be positive when we back-transform via exponentiation
100 * mean(exp(predict(res)) < 0)

############################################################################
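
# addendum (a small sketch, not a figure from the book): to visualize that the
# back-transformed predictions from the log(bikers) model are always positive,
# redraw the scatterplot of hour versus bikers and add the average
# back-transformed predicted value per hour
pred.hr <- tapply(exp(predict(res)), dat$hr, mean)
plot(bikers ~ jitter(as.numeric(as.character(dat$hr))), data=dat, pch=19,
     cex=0.2, col="skyblue2", xlab="Hour", ylab="Number of Bikers")
lines(0:23, pred.hr, lwd=3, col="darkorange2")

############################################################################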