############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2022-11-24
#
# Topic(s):
# - An Introduction to Statistical Learning (https://www.statlearning.com)
# - Section(s): 10.3 - 10.4
#
# last updated: 2022-12-02

############################################################################

# to supress info and warning messages when loading tensorflow
Sys.setenv(TF_CPP_MIN_LOG_LEVEL = "2")

### 10.3: Convolutional Neural Networks

# load the keras package (see the script from the last session on how to set
# up keras, which requires some additional software)
library(keras)

# the CIFAR100 dataset: https://www.cs.toronto.edu/~kriz/cifar.html
#
# with the dataset_cifar100() function, we can download the CIFAR100 dataset
# and have it directly available for processing with R
cifar100 <- dataset_cifar100()

# note: this downloads the dataset into the ~/.keras/datasets directory

# cifar100 is a list with two elements called 'train' and 'test'

# check the variable names in the dataset
names(cifar100)

# each of these elements is a list with elements 'x' and 'y' for the image
# data and the labels
str(cifar100$train)

# put the image data into x_train and the labels into g_train for the training data
x_train <- cifar100$train$x
g_train <- cifar100$train$y

# do the same for the test data
x_test  <- cifar100$test$x
g_test  <- cifar100$test$y

# x_train is an array where the first dimension corresponds to the 50,000
# images, the second and third dimensions are the 32x32 pixel locations, and
# the fourth dimension provides the colors intensities for red-green-blue
dim(x_train)

# rescale the color intensities to the 0-1 range
x_train <- x_train / 255
x_test  <- x_test  / 255

# install the jpeg package
#install.packages("jpeg")

# load the jpeg package
library(jpeg)

# random sample 25 numbers from 1:50000
index <- sample(50000, 25)

# draw the image for these 25 randomly selected images
par(mar=c(0,0,0,0), mfrow=c(5,5))
for (i in index)
   plot(as.raster(x_train[i,,,]))

# there are 100 classes and for each there are 500 images
table(g_train)

# turn the class labels into 100 dummy variables
y_train <- to_categorical(g_train, 100)
dim(y_train)

# set the seed to make results reproducible
#use_session_with_seed(42) # this will be deprecated in the future
tensorflow::set_random_seed(42)

# set up the model
model <- keras_model_sequential() |>
   layer_conv_2d(input_shape=c(32,32,3), filters=32, kernel_size=c(3,3), padding="same", activation="relu") |>
   layer_max_pooling_2d(pool_size=c(2,2)) |>
   layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
   layer_max_pooling_2d(pool_size=c(2,2)) |>
   layer_conv_2d(filters=128, kernel_size=c(3,3), padding="same", activation="relu") |>
   layer_max_pooling_2d(pool_size=c(2,2)) |>
   layer_conv_2d(filters=256, kernel_size=c(3,3), padding="same", activation="relu") |>
   layer_max_pooling_2d(pool_size=c(2,2)) |>
   layer_flatten() |>
   layer_dropout(rate=0.5) |>
   layer_dense(units=512, activation="relu") |>
   layer_dense(units=100, activation="softmax") |>
   compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics="accuracy")

# summary information about the model
summary(model)

# fit model (this might take several minutes)
system.time(res <- fit(model, x_train, y_train, epochs=30, batch_size=128, validation_split=0.2))

# predicted probabilities for each class in the test data
pred <- predict(model, x_test)

# predicted class
pred <- apply(pred, 1, which.max) - 1

# test error rate (about 54%)
mean(g_test != pred)

# accuracy (about 46%)
mean(g_test == pred)

############################################################################

# compare this to multinomial logistic regression where we use each of the
# 32*32*3 = 3072 features as a predictor (it is silly to even consider this,
# but let's see what happens)

# set the seed to make results reproducible
tensorflow::set_random_seed(42)

# set up the model
model <- keras_model_sequential() |>
   layer_flatten(input_shape=c(32,32,3)) |>
   layer_dense(units=100, activation="softmax") |>
   compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics="accuracy")

# summary information about the model
summary(model)

# fit model (this might take several minutes)
system.time(res <- fit(model, x_train, y_train, epochs=30, batch_size=128, validation_split=0.2))

# predicted probabilities for each class in the test data
pred <- predict(model, x_test)

# predicted class
pred <- apply(pred, 1, which.max) - 1

# accuracy (about 10%, which is better than randomly chosen one of the 100
# labels, which would yield an accuracy of 1%)
mean(g_test == pred)

# accuracy when randomly choosing one of the 100 labels (about 1%)
mean(g_test == sample(100, 10000, replace=TRUE) - 1)

############################################################################

# try an even simpler multionomial logistic regression model that only uses
# the mean of the 32x32 pixels for each color as predictors (so in essence,
# these variables reflect how much red, green, and blue is in an image)

# compute the mean for each color
x_train_col <- apply(x_train, c(1,4), mean)
x_test_col  <- apply(x_test,  c(1,4), mean)

# examine the first 6 rows of x_train
head(x_train_col)

# set the seed to make results reproducible
tensorflow::set_random_seed(42)

# set up the model
model <- keras_model_sequential() |>
   layer_dense(units=100, input_shape=3, activation="softmax") |>
   compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics="accuracy")

# summary information about the model
summary(model)

# fit model
system.time(res <- fit(model, x_train_col, y_train, epochs=30, batch_size=128, validation_split=0.2))

# accuracy (about 3-4%)
pred <- predict(model, x_test_col)
pred <- apply(pred, 1, which.max) - 1
mean(g_test == pred)

############################################################################

### 10.4: Document Classification

# we will try some document classification with the IMDB dataset

# dataset_imdb() can be used to download the dataset
imdb <- dataset_imdb(num_words=10000)

# note: only the 10,000 most frequent words are included

# a list with elements 'train and 'test'
names(imdb)

# each contains elements 'x' and 'y' with the features (the words used in a
# particular review indexed by numbers) and the labels (whether the review was
# a positive or negative review)

x_train <- imdb$train$x
y_train <- imdb$train$y

x_test <- imdb$test$x
y_test <- imdb$test$y

# for example, the first review used these words
x_train[[1]]

# and it was a positive review (1 = positive, 0 = negative)
y_train[1]

# get the word index
word_index <- dataset_imdb_word_index()

# the numbers of the words are such that 1 is the most commonly used word, 2
# is the second most commonly used word, and so on; so we can examine the 100
# most commonly used words as follows (note: have to add 3 because that is how
# the index is set up)
sort(unlist(word_index)+3)[1:100]

# function to decode a review
decode_review <- function(text, word_index) {
   word <- names(word_index)
   idx <- unlist(word_index, use.names = FALSE)
   word <- c("<PAD>", "<START>", "<UNK>", "<UNUSED>", word)
   idx <- c(0:3, idx + 3)
   words <- word[match(text, idx, 2)]
   paste(words, collapse = " ")
}

# do this for the first review
decode_review(x_train[[1]], word_index)

# load the Matrix package
library(Matrix)

# function to 'one-hot' encode the reviews
one_hot <- function(sequences, dimension) {
   seqlen <- sapply(sequences, length)
   n <- length(seqlen)
   rowind <- rep(1:n, seqlen)
   colind <- unlist(sequences)
   sparseMatrix(i = rowind, j = colind, dims = c(n, dimension))
}

# do this for the training and test data
x_train_1h <- one_hot(x_train, 10000)
x_test_1h  <- one_hot(x_test, 10000)

# so each of these is now a matrix with 10000 columns where each column
# corresponds to one of the words and it indicates whether that word was used
# in a review or not
dim(x_train_1h)

# proportion of entries that are not zero
nnzero(x_train_1h) / (25000 * 10000)
sum(x_train_1h > 0) / (25000 * 10000)

# install the glmnet package
#install.packages("glmnet")

# load the glmnet package
library(glmnet)

# set the seed to make things reproducible
set.seed(1234)

# 10-fold cross-validation to determine an appropriate value for lambda
res.cv <- cv.glmnet(x_train_1h, y_train, family="binomial", standardize=FALSE,
                    type.measure="class", trace.it=1)

# plot the results
plot(res.cv)

# lambda value with the lowest cross-validated misclassification rate
res.cv$lambda.min

# fit a lasso logistic regression model to these data with the chosen lambda
res <- glmnet(x_train_1h, y_train, family="binomial", standardize=FALSE,
              lambda=res.cv$lambda.min)
res

# predict the class for the test data
pred <- predict(res, x_test_1h, type="class")

# accuracy (about 88%)
mean(y_test == pred)

# for comparison, fit the same model but using a non-sparse feature matrix

x_train_1h_non_sparse <- lapply(x_train, function(x) {
   mat <- rep(0,10000)
   mat[x] <- 1
   return(mat)
})
x_train_1h_non_sparse <- do.call(rbind, x_train_1h_non_sparse)

# this takes about 35 seconds on my computer (with the sparse feature matrix
# above, it only took 2-3 seconds)
system.time(res <- glmnet(x_train_1h_non_sparse, y_train, family="binomial",
                          standardize=FALSE, lambda=res.cv$lambda.min))
pred <- predict(res, x_test_1h, type="class")
mean(y_test == pred)

############################################################################

# now use a neural network with two hidden layers (with 16 units each)

# set the seed to make results reproducible
tensorflow::set_random_seed(42)

# set up the model
model <- keras_model_sequential() %>%
   layer_dense(input_shape=10000, units=16, activation="relu") |>
   layer_dense(units=16, activation="relu") |>
   layer_dense(units=1, activation="sigmoid") |>
   compile(optimizer="rmsprop", loss="binary_crossentropy", metrics="accuracy")

# save the initial weights
weights <- get_weights(model)

# fit the model (setting 20% of the data aside for validation)
res <- fit(model, x_train_1h, y_train, epochs=20, batch_size=512, validation_split=0.2)

# plot the loss/accuracy as a function of epochs
plot(res)

# according to this graph, the accuracy estimated from the validation data
# actually starts to go down after 5 epochs; so reset the weights to the
# initial ones and refit the model using only 5 epochs (using all data)
set_weights(model, weights)
res <- fit(model, x_train_1h, y_train, epochs=5, batch_size=512)

# predicted probabilities of being a positive review for the test data
pred <- predict(model, x_test_1h)

# accuracy (about 88%)
mean((pred > 0.5) == y_test)

############################################################################