############################################################################
# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-09-22
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 7.6 - 7.8
#
# last updated: 2022-09-23
############################################################################

### brief recap of smoothing splines (section 7.5) and some additions

# install the ISLR2 package (if necessary)
#install.packages("ISLR2")

# load the ISLR2 package
library(ISLR2)

# load the splines package
library(splines)

# copy the Wage dataset to 'dat'
dat <- Wage

# plot the data
plot(wage ~ age, data=dat, pch=21, col="darkgray", bg="lightgray", cex=0.7,
     xlab="Age", ylab="Wage", bty="l")

############################################################################

# natural spline model with knots at every unique interior value of age (the
# smallest and largest values serve as the boundary knots)
knots <- sort(unique(dat$age))
knots <- knots[-c(1, length(knots))]
res0 <- lm(wage ~ ns(age, knots=knots), data=dat)

# add the regression line/curve to the plot
newdat <- data.frame(age=seq(18,80,length=1000))
pred <- predict(res0, newdata=newdat)
lines(newdat$age, pred, col="blue", lwd=8)

# fit a smoothing spline, setting the effective degrees of freedom to 16
# (which implies a certain value for lambda) and ensure that all unique
# values of age are used as the knot positions
res1 <- smooth.spline(dat$age, dat$wage, df=16, all.knots=TRUE)
res1

# add the regression line/curve to the plot
lines(res1, col="red", lwd=8)

# smoothing spline with fewer effective degrees of freedom (so more smoothing)
res2 <- smooth.spline(dat$age, dat$wage, df=8, all.knots=TRUE)
res2

# add the regression line/curve to the plot
lines(res2, col="green4", lwd=8)

# try to manually fit the smoothing spline

X  <- cbind(1, ns(dat$age, knots=knots))
y  <- dat$wage
Xd <- cbind(1, ns(seq(min(dat$age), max(dat$age), length=10000), knots=knots))

fitfun <- function(par, X, y, Xd, lambda=0) {
   # residual sum of squares for the given coefficients
   pred <- c(X %*% par)
   rss <- sum((y - pred)^2)
   # crude version of the penalty term: the sum of the squared second
   # differences of the predicted values over a fine grid of age values
   pred <- c(Xd %*% par)
   penalty <- sum((diff(diff(pred)))^2)
   # the penalized least squares criterion
   rss + lambda * penalty
}

# first try to replicate the natural spline model (like res0)
sav0 <- nlminb(c(mean(dat$wage), rep(0,ncol(X)-1)), fitfun, X=X, y=y, Xd=Xd)
pred0 <- c(cbind(1, ns(newdat$age, knots=knots)) %*% sav0$par)
lines(newdat$age, pred0, col="lightblue", lwd=4)

# now set lambda to a value that gives essentially the same result as the
# smoothing spline with df=16 (like res1) (note: the computation of the
# penalty term in fitfun() is really simplistic and the scaling of lambda is
# very different, but with some trial and error, we can find a lambda value
# here that gives essentially the same level of smoothing)
sav1 <- nlminb(sav0$par, fitfun, X=X, y=y, Xd=Xd, lambda=12^8)
pred1 <- c(cbind(1, ns(newdat$age, knots=knots)) %*% sav1$par)
lines(newdat$age, pred1, col="orange", lwd=4)

# smoothing spline with a higher level of smoothing (like res2)
sav2 <- nlminb(sav1$par, fitfun, X=X, y=y, Xd=Xd, lambda=18^8)
pred2 <- c(cbind(1, ns(newdat$age, knots=knots)) %*% sav2$par)
lines(newdat$age, pred2, col="lightgreen", lwd=4)
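############################################################################

# aside (not part of the original stream code): instead of fixing the
# effective degrees of freedom ourselves, smooth.spline() can also choose
# lambda via generalized cross-validation (cv=FALSE, the default) or via
# leave-one-out cross-validation (cv=TRUE, although the latter is
# questionable here, since age has many tied values); a minimal sketch
# using GCV
res.gcv <- smooth.spline(dat$age, dat$wage, all.knots=TRUE)
res.gcv

# add the regression line/curve to the plot
lines(res.gcv, col="purple", lwd=4)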
############################################################################

### 7.6: Local Regression

# plot the data
plot(wage ~ age, data=dat, pch=21, col="darkgray", bg="lightgray", cex=0.7,
     xlab="Age", ylab="Wage", bty="l")

# fit a local regression model using a span of 0.2 (note: by default, loess()
# uses a quadratic polynomial for the local regression models, while the book
# describes using a linear model in Algorithm 7.1)
res1 <- loess(wage ~ age, data=dat, span=0.2)
res1

# add the smoother line to the plot
newdat <- data.frame(age=seq(18,80,length=1000))
pred1 <- predict(res1, newdata=newdat)
lines(newdat$age, pred1, col="red", lwd=8)

# increase the span to 0.7
res2 <- loess(wage ~ age, data=dat, span=0.7)
res2

# add the smoother line to the plot
pred2 <- predict(res2, newdata=newdat)
lines(newdat$age, pred2, col="blue", lwd=8)

# add a legend
legend("topright", inset=.02, lty="solid", col=c("red","blue"), lwd=5,
       legend=c("Span is 0.2 (16.4 Degrees of Freedom)",
                "Span is 0.7 (5.3 Degrees of Freedom)"), bg="white")

# try to obtain the smoother manually without using loess()

span <- 0.2
xs <- seq(18,80,length=100)
pred <- rep(NA, length(xs))

for (i in 1:length(xs)) {
   # tricube weights for the observations within the span around xs[i]
   dist <- abs(dat$age - xs[i])
   incl <- dist <= quantile(dist, prob=span)
   weights <- rep(0, nrow(dat))
   weights[incl] <- (1 - (dist[incl] / max(dist[incl]))^3)^3
   # weighted least squares fit of a local quadratic polynomial
   tmp <- lm(wage ~ age + I(age^2), data=dat, weights=weights)
   pred[i] <- predict(tmp, newdata=data.frame(age = xs[i]))
}

lines(xs, pred, lwd=4, col="orange")

# this gets us quite close to the red line; the orange line seems a bit more
# wiggly, so there might be some additional implementation details that are
# not described in the book / help page for loess()

############################################################################

# local regression using unit weights (so all nearby points are weighted
# equally) and using a simple regression model with just an intercept

# plot the data
plot(wage ~ age, data=dat, pch=21, col="darkgray", bg="lightgray", cex=0.7,
     xlab="Age", ylab="Wage", bty="l")

span <- 0.2
xs <- seq(18,80,length=1000)
pred <- rep(NA, length(xs))

for (i in 1:length(xs)) {
   # give unit weight to the observations within the span around xs[i]
   dist <- abs(dat$age - xs[i])
   incl <- dist <= quantile(dist, prob=span)
   weights <- rep(0, nrow(dat))
   weights[incl] <- 1
   # weighted fit of an intercept-only model (i.e., a local mean)
   tmp <- lm(wage ~ 1, data=dat, weights=weights)
   pred[i] <- predict(tmp, newdata=data.frame(age = xs[i]))
}

lines(xs, pred, lwd=8, col="darkgreen")

# function to do k-nearest neighbor smoothing for a single predictor x
knn <- function(x, y, x0, k) {
   dist <- (x - x0)^2
   mean(y[order(dist)[1:k]])
}

for (i in 1:length(xs)) {
   pred[i] <- knn(dat$age, dat$wage, x0=xs[i], k=nrow(dat)*span)
}

lines(xs, pred, lwd=4, col="green")

# so doing local 'regression' with unit weights and an intercept-only model
# is essentially the same as doing k-nearest neighbors with k = n * span;
# note that the two implementations above are not exactly identical: in the
# local regression, the fraction of the sample that receives non-zero weight
# can differ slightly from the desired span, and knn() simply takes the first
# k nearest neighbors in the dataset even when further points are tied at the
# same smallest distance, but in principle these two approaches are the same
# in this special case
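############################################################################

# aside (not part of the original stream code): base R also provides
# ksmooth() for simple kernel regression; with a box kernel, this is again a
# moving average, but one based on a fixed bandwidth instead of a fixed span,
# so it is similar in spirit (but not identical) to the unit-weight smoother
# above; a minimal sketch (the bandwidth of 10 is just an arbitrary choice)
lines(ksmooth(dat$age, dat$wage, kernel="box", bandwidth=10), lwd=4, col="red")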
############################################################################

# 10-fold cross-validation to choose the span value

spans <- seq(0.2, 1, length=100)

set.seed(1234)
split <- sample(rep_len(1:10, nrow(dat)))
pred <- rep(NA, nrow(dat))
mse <- rep(NA, length(spans))

for (i in 1:length(spans)) {
   for (j in 1:10) {
      # fit the model to all folds except fold j and predict for fold j
      res <- loess(wage ~ age, data=dat, span=spans[i], subset=split!=j)
      pred[split == j] <- predict(res, newdata=dat[split==j,])
   }
   mse[i] <- mean((dat$wage - pred)^2)
}

plot(spans, mse, type="o", pch=19, lwd=2, xlab="Value for span", ylab="MSE")

# the value of 'spans' that gives the lowest MSE
span <- spans[which.min(mse)]
span
abline(v=span, lty="dotted")

# fit the local regression model with this value of span to all data
res <- loess(wage ~ age, data=dat, span=span)

# plot the data
plot(wage ~ age, data=dat, pch=21, col="darkgray", bg="lightgray", cex=0.7,
     xlab="Age", ylab="Wage", bty="l")

# add the line based on this model
newdat <- data.frame(age=seq(18,80,length=1000))
pred <- predict(res, newdata=newdat)
lines(newdat$age, pred, col="red", lwd=8)

############################################################################

### 7.7: Generalized Additive Models

# 7.7.1: GAMs for Regression Problems

# install the gam package (if necessary)
#install.packages("gam")

# load the gam package
library(gam)

# fit a GAM with natural cubic splines for year and age (with 4 and 5
# degrees of freedom, respectively) and education as a factor
res1 <- lm(wage ~ ns(year, df=4) + ns(age, df=5) + education, data=dat)
summary(res1)

# plot the marginal relationships
par(mfrow=c(1,3))
plot.Gam(res1, se=TRUE, col="red", lwd=6, scale=70)

############################################################################

# it is not 100% clear to me how plot.Gam() does this (especially the scaling
# of the y-axis is strange), but we can do the same thing manually using, for
# example, the emmeans package

# install the emmeans package (if necessary)
#install.packages("emmeans")

# load the emmeans package
library(emmeans)

# predict wage for different values of year, holding age constant at its mean
# and giving weights to the different education levels in accordance with the
# frequencies observed in our data (which we can think of as holding
# education constant at the mean education level of the sample)
newdat <- seq(min(dat$year), max(dat$year), length=1000)
pred <- emmeans(res1, specs = ~ year, at=list(year=newdat), weights="proportional")
pred <- summary(pred)
par(mfrow=c(1,3), mar=c(5,4.5,4,2))
plot(newdat, pred$emmean, type="l", lwd=4, col="red", xlab="year",
     ylab=expression(f[1](year)), ylim=c(90,150))
lines(newdat, pred$lower.CL, lty="dotted")
lines(newdat, pred$upper.CL, lty="dotted")

# do the same thing with age (holding year constant at its mean and holding
# education constant as described above)
newdat <- seq(min(dat$age), max(dat$age), length=1000)
pred <- emmeans(res1, specs = ~ age, at=list(age=newdat), weights="proportional")
pred <- summary(pred)
plot(newdat, pred$emmean, type="l", lwd=4, col="red", xlab="age",
     ylab=expression(f[2](age)), ylim=c(70,130))
lines(newdat, pred$lower.CL, lty="dotted")
lines(newdat, pred$upper.CL, lty="dotted")

# do the same thing with education (holding age and year constant at their means)
newdat <- levels(dat$education)
pred <- emmeans(res1, specs = ~ education, at=list(education=newdat), weights="proportional")
pred <- summary(pred)
plot(1:5, pred$emmean, pch=19, xlab="education", ylab=expression(f[3](education)),
     xlim=c(0.5,5.5), ylim=c(80,170))
arrows(1:5, pred$lower.CL, 1:5, pred$upper.CL, angle=90, code=3, length=0.1)

############################################################################

# fit a GAM with smoothing splines for year and age (with 4 and 5 degrees of
# freedom, respectively) and education as a factor
res2 <- gam(wage ~ s(year, df=4) + s(age, df=5) + education, data=dat)
summary(res2)

# plot the marginal relationships
par(mfrow=c(1,3))
plot(res2, se=TRUE, col="blue", lwd=6, scale=70)
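############################################################################

# aside (not part of the original stream code): as in the lab in the book, we
# can use F-tests to compare nested GAMs, for example to examine whether wage
# is related to year at all and, if so, whether a smooth function of year is
# needed or a linear term suffices; a minimal sketch
res2a <- gam(wage ~ s(age, df=5) + education, data=dat)
res2b <- gam(wage ~ year + s(age, df=5) + education, data=dat)
anova(res2a, res2b, res2, test="F")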
############################################################################

# 7.7.2: GAMs for Classification Problems

# dichotomize wage into high wage (more than 250k) versus not
dat$highwage <- ifelse(dat$wage > 250, 1, 0)

# logistic regression model with year as a linear term, age as a smoothing
# spline (with 5 degrees of freedom), and education as a factor
res3 <- gam(highwage ~ year + s(age, df=5) + education, data=dat, family=binomial)
summary(res3)

# plot the marginal relationships
par(mfrow=c(1,3))
plot(res3, se=TRUE, col="darkgreen", lwd=6, scale=10)

# exclude the people with the lowest education level
res4 <- gam(highwage ~ year + s(age, df=5) + education, data=dat,
            family=binomial, subset=education!="1. < HS Grad")
summary(res4)

# plot the marginal relationships
par(mfrow=c(1,3))
plot(res4, se=TRUE, col="darkgreen", lwd=6, scale=10)

############################################################################

### 7.8 Lab: Non-linear Modeling

# most things are skipped, because we have already covered many of them while
# working through the chapter (and at times at an even deeper level)

# but let's fit a GAM with a smoothing spline for year and a local regression
# smoother for age (and education as a factor, as before)
res5 <- gam(wage ~ s(year, df=4) + lo(age, span=0.7) + education, data=dat)
summary(res5)

# plot the marginal relationships
par(mfrow=c(1,3))
plot(res5, se=TRUE, col="darkgreen", lwd=6, scale=70)

# GAM with a local regression surface for year and age together
res6 <- gam(wage ~ lo(year, age, span=0.5) + education, data=dat)
summary(res6)

# plot the marginal relationships
par(mfrow=c(1,2))
plot(res6, se=TRUE, scale=70)

############################################################################

### Extra: Non-Linear Regression (not covered in the book)

# https://en.wikipedia.org/wiki/Nonlinear_regression

# simulate some data from a Michaelis–Menten model
set.seed(1234)
n <- 250
x <- runif(n, 0, 50)
y <- 5 * x / (3 + x) + rnorm(n, 0, 0.2)
dat <- data.frame(x=x, y=y)
rm(x,y)

# plot the data
plot(y ~ x, data=dat, pch=19, col="darkgray", ylim=c(0,6), bty="l")

# try to capture the relationship between x and y using a quadratic polynomial
res <- lm(y ~ x + I(x^2), data=dat)
summary(res)
newdat <- data.frame(x = seq(0, 50, length=1000))
pred <- predict(res, newdata=newdat)
lines(newdat$x, pred, lwd=6, col="blue")

# try a cubic polynomial
res <- lm(y ~ x + I(x^2) + I(x^3), data=dat)
summary(res)
pred <- predict(res, newdata=newdat)
lines(newdat$x, pred, lwd=6, col="red")

# how about a smoothing spline?
res <- smooth.spline(dat$x, dat$y, df=8)
lines(res, col="darkgreen", lwd=6)

# add a legend
legend("topleft", inset=.02, lty="solid", col=c("blue","red","darkgreen"), lwd=5,
       legend=c("Quadratic Polynomial", "Cubic Polynomial", "Smoothing Spline"))

# now fit a non-linear regression model where the functional form of the
# model matches the underlying true model
res <- nls(y ~ beta1 * x / (beta2 + x), start = list(beta1=2, beta2=1), data=dat)
summary(res)
pred <- predict(res, newdata=newdat)
plot(y ~ x, data=dat, pch=19, col="darkgray", ylim=c(0,6), bty="l")
lines(newdat$x, pred, lwd=6, col="blue")

# beta1 is the upper asymptote of y as x goes to infinity
# beta2 is the value of x where y is halfway between 0 and beta1
abline(h=coef(res)[1], lty="dotted")
abline(v=coef(res)[2], lty="dotted")
abline(h=coef(res)[1]/2, lty="dotted")

############################################################################
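# aside (not part of the original stream code): R also provides the
# self-starting Michaelis–Menten model SSmicmen(), which determines suitable
# starting values automatically, so we do not have to specify them ourselves;
# a minimal sketch that should give essentially the same fit as the nls()
# call above (Vm corresponds to beta1 and K to beta2)
res.ss <- nls(y ~ SSmicmen(x, Vm, K), data=dat)
summary(res.ss)

############################################################################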