IAML Unit 10: Discussion

keras vs. nnet

  • keras vs. keras in tidymodels (compare the engine sketches below)
    • tidymodels is a unified wrapper around many R modeling packages
    • setting the keras engine in tidymodels calls the keras R package under the hood
    • keras is available in both R and Python
    • keras is itself a wrapper around TensorFlow
  • nnet, the default engine for mlp() in tidymodels
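
The same model specification can be pointed at either engine. A minimal sketch, assuming tidymodels (parsnip) is installed and, for the keras engine, that keras/TensorFlow is configured; the argument values are illustrative only.

Code
library(tidymodels)

# keras engine: fits the network through the keras R package (and TensorFlow)
mlp_keras <- mlp(hidden_units = 8, epochs = 50, activation = "relu") %>%
  set_engine("keras") %>%
  set_mode("classification")

# nnet engine: single-hidden-layer network from the nnet package
mlp_nnet <- mlp(hidden_units = 8, penalty = 0.01) %>%
  set_engine("nnet") %>%
  set_mode("classification")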

All the terms and concepts!

  • layers
  • units
  • weights, biases
  • activation functions
  • gradient descent
  • forward and back propagation
  • learning rate
  • epochs
  • stochastic and mini-batch gradient descent (and Adam)
  • batches
  • drop out and regularization
  • deep learning
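
Most of these terms map directly onto arguments in a keras model definition. A minimal, illustrative sketch, assuming the keras R package with a TensorFlow backend; x_train and y_train are placeholder objects, not data from this unit.

Code
library(keras)

# layers, units, activation functions, dropout
model <- keras_model_sequential() %>%
  layer_dense(units = 16, activation = "relu", input_shape = 4) %>%  # hidden layer with 16 units
  layer_dropout(rate = 0.2) %>%                                      # dropout regularization
  layer_dense(units = 3, activation = "softmax")                     # output layer (3 classes)

# learning rate and Adam (a refinement of mini-batch stochastic gradient descent)
model %>% compile(
  optimizer = optimizer_adam(learning_rate = 0.001),
  loss = "categorical_crossentropy",
  metrics = "accuracy"
)

# epochs and batch size control how many passes are made and how large each mini-batch is
# history <- model %>% fit(x_train, y_train, epochs = 20, batch_size = 32)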

A multi-layer neural network

Code
library(ggplot2)

# Define node positions
input_layer <- data.frame(x = rep(1, 6), y = seq(1, 6))                   # 6 input units
hidden_layer1 <- data.frame(x = rep(2, 4), y = seq(1.5, 4.5, length.out = 4))  # 4 units in the first hidden layer
hidden_layer2 <- data.frame(x = rep(3, 3), y = seq(2, 4, length.out = 3))      # 3 units in the second hidden layer
output_layer <- data.frame(x = rep(4, 1), y = 3)                               # 1 output unit

# Combine layers into one data frame
nodes <- rbind(
  cbind(input_layer, layer = "Input"),
  cbind(hidden_layer1, layer = "Hidden1"),
  cbind(hidden_layer2, layer = "Hidden2"),
  cbind(output_layer, layer = "Output")
)

# Define connections (edges)
# From input to first hidden layer
edges_input_hidden1 <- data.frame(
  x_start = rep(1, 6 * 4), y_start = rep(input_layer$y, each = 4),
  x_end = rep(2, 6 * 4), y_end = rep(hidden_layer1$y, times = 6)
)

# From first hidden layer to second hidden layer
edges_hidden1_hidden2 <- data.frame(
  x_start = rep(2, 4 * 3), y_start = rep(hidden_layer1$y, each = 3),
  x_end = rep(3, 4 * 3), y_end = rep(hidden_layer2$y, times = 4)
)

# From second hidden layer to output layer
edges_hidden2_output <- data.frame(
  x_start = rep(3, 3), y_start = hidden_layer2$y,
  x_end = rep(4, 3), y_end = rep(output_layer$y, times = 3)
)

# Combine edges
edges <- rbind(edges_input_hidden1, edges_hidden1_hidden2, edges_hidden2_output)

# Plot using ggplot
ggplot() +
  geom_point(data = nodes, aes(x = x, y = y, color = layer), size = 3) +
  geom_segment(data = edges, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
               alpha = 0.2, color = "grey") +
  scale_color_manual(values = c("Input" = "blue", "Hidden1" = "green", "Hidden2" = "orange", "Output" = "red")) +
  labs(title = "Neural Network Diagram") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    panel.grid = element_blank(),        # Remove grid lines
    axis.text = element_blank(),         # Remove axis text (numbers)
    axis.title = element_blank(),        # Remove axis titles
    axis.ticks = element_blank()         # Remove axis ticks
  )
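
The diagram above shows 6 input units, two hidden layers with 4 and 3 units, and one output unit. A minimal forward-propagation sketch through that same 6-4-3-1 architecture, using randomly initialized weights and biases (illustrative values only):

Code
set.seed(1)
relu <- function(z) pmax(0, z)

x  <- rnorm(6)                                          # one observation with 6 features
W1 <- matrix(rnorm(4 * 6), nrow = 4); b1 <- rnorm(4)    # weights and biases: input -> hidden 1
W2 <- matrix(rnorm(3 * 4), nrow = 3); b2 <- rnorm(3)    # hidden 1 -> hidden 2
W3 <- matrix(rnorm(1 * 3), nrow = 1); b3 <- rnorm(1)    # hidden 2 -> output

h1  <- relu(W1 %*% x + b1)    # first hidden layer activations (4 units)
h2  <- relu(W2 %*% h1 + b2)   # second hidden layer activations (3 units)
out <- W3 %*% h2 + b3         # single linear output unit (e.g., regression)
out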

A single neural network unit

Code
library(grid)  # for arrow() and unit() used in the plot below

# Create input positions
inputs <- data.frame(
  x = rep(1, 3),
  y = seq(1, 3),
  label = c("X1", "X2", "X3")
)

# Create circle position for the neural network unit
unit <- data.frame(
  x = 2,
  y = 2,
  label = "Unit"
)

# Create output arrow position
output <- data.frame(
  x = 3,
  y = 2,
  label = "Output"
)

# Create arrows (connections)
arrows <- rbind(
  data.frame(x_start = rep(1, 3), y_start = seq(1, 3), x_end = 2, y_end = rep(2, 3)),
  data.frame(x_start = 2, y_start = 2, x_end = 3, y_end = 2)
)

# Plot using ggplot
ggplot() +
  # Plot inputs
  geom_point(data = inputs, aes(x = x, y = y), size = 4, color = "blue") +
  geom_text(data = inputs, aes(x = x - 0.1, y = y, label = label), hjust = 1, color = "black") +
  
  # Plot neural network unit (circle)
  geom_point(data = unit, aes(x = x, y = y), size = 8, shape = 21, fill = "green", color = "black") +
  geom_text(data = unit, aes(x = x, y = y + 0.4, label = label), color = "black") +
  
  # Plot output arrow
  geom_point(data = output, aes(x = x, y = y), size = 4, color = "red") +
  geom_text(data = output, aes(x = x + 0.1, y = y, label = label), hjust = 0, color = "black") +
  
  # Add arrows
  geom_segment(data = arrows, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
               arrow = arrow(length = unit(0.2, "cm")), color = "grey") +
  
  # Customize plot appearance
  theme_minimal() +
  theme(
    panel.grid = element_blank(),        # Remove grid lines
    axis.text = element_blank(),         # Remove axis numbers
    axis.title = element_blank(),        # Remove axis labels
    axis.ticks = element_blank()         # Remove axis ticks
  ) +
  labs(title = "Neural Network Diagram")
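
The unit in the diagram computes a weighted sum of its inputs plus a bias and passes the result through an activation function. A minimal sketch with made-up weights and bias:

Code
x <- c(0.5, -1.2, 2.0)    # inputs X1, X2, X3
w <- c(0.8, 0.1, 0.4)     # weights (illustrative values)
b <- 0.2                  # bias

z <- sum(w * x) + b       # weighted sum plus bias
output <- max(0, z)       # ReLU activation
output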

Activation functions

  • step/perceptron (not used in practice; historically significant)
  • linear (generally not used in hidden layers; can be used in the output layer for regression)
  • sigmoid (rarely used in hidden layers now; historically significant)
  • softmax (used in the output layer for multi-class classification; sketched after the plots below)
  • relu (the most common choice for hidden layers)
Code
library(gridExtra)

# Define activation functions
step_function <- function(x) {
  ifelse(x > 0, 1, 0)
}

linear <- function(x) {
  x
}

sigmoid <- function(x) {
  1 / (1 + exp(-x))
}

relu <- function(x) {
  ifelse(x > 0, x, 0)
}

# Generate input range
x <- seq(-10, 10, length.out = 100)

# Create individual data frames for each function
df_step <- data.frame(x = x, y = step_function(x))
df_linear <- data.frame(x = x, y = linear(x))
df_sigmoid <- data.frame(x = x, y = sigmoid(x))
df_relu <- data.frame(x = x, y = relu(x))

# Create individual ggplot objects for each function
plot_step <- ggplot(df_step, aes(x = x, y = y)) +
  geom_line(color = "blue", size = 1) +
  labs(title = "Step Function", x = "Input", y = "Output") +
  theme_minimal()

plot_sigmoid <- ggplot(df_sigmoid, aes(x = x, y = y)) +
  geom_line(color = "green", size = 1) +
  labs(title = "Sigmoid Function", x = "Input", y = "Output") +
  theme_minimal()

plot_relu <- ggplot(df_relu, aes(x = x, y = y)) +
  geom_line(color = "red", size = 1) +
  labs(title = "ReLU Function", x = "Input", y = "Output") +
  theme_minimal()

plot_linear <- ggplot(df_linear, aes(x = x, y = y)) +
  geom_line(color = "purple", size = 1) +
  labs(title = "Linear Function", x = "Input", y = "Output") +
  theme_minimal()

# Arrange the plots in a 2x2 grid
grid.arrange(plot_step, plot_sigmoid, plot_relu, plot_linear, nrow = 2, ncol = 2)
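
Softmax is not in the grid above because it acts on a whole vector of scores rather than a single input, turning them into probabilities that sum to 1. A minimal sketch:

Code
softmax <- function(z) {
  exp_z <- exp(z - max(z))   # subtract the max for numerical stability
  exp_z / sum(exp_z)
}

softmax(c(2.0, 1.0, 0.1))   # roughly 0.66, 0.24, 0.10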

Gradient descent

Code
# Load plotly library
library(plotly)

# Generate a grid of values for b1 and b2
b1 <- seq(-10, 10, length.out = 100)
b2 <- seq(-10, 10, length.out = 100)
grid <- expand.grid(b1 = b1, b2 = b2)

# Define the paraboloid function
grid$z <- with(grid, b1^2 + b2^2)

# Convert grid to matrix for plotting
z_matrix <- matrix(grid$z, nrow = length(b1), ncol = length(b2))

# Create the 3D plot
plot_ly(
  x = ~b1, y = ~b2, z = ~z_matrix,
  type = "surface"
) %>%
  layout(
    title = "3D Bowl-like Paraboloid",
    scene = list(
      xaxis = list(title = "b1"),
      yaxis = list(title = "b2"),
      zaxis = list(title = "Loss")
    )
  )
Code
# Define a sample loss function (e.g., a quadratic function)
loss_function <- function(b1, b2) {
  (b1 - 3)^2 + (b2 - 2)^2
}

# Generate a grid of values for b1 and b2
b1 <- seq(-10, 10, length.out = 100)
b2 <- seq(-10, 10, length.out = 100)
grid <- expand.grid(b1 = b1, b2 = b2)
grid$loss <- with(grid, loss_function(b1, b2))

# Gradient descent simulation
gradient_descent <- function(initial_b1, initial_b2, learning_rate, steps) {
  path <- data.frame(b1 = numeric(steps + 1), b2 = numeric(steps + 1), loss = numeric(steps + 1))
  b1 <- initial_b1
  b2 <- initial_b2
  for (i in 1:(steps + 1)) {
    path[i, ] <- c(b1, b2, loss_function(b1, b2))
    # Compute gradients (partial derivatives)
    grad_b1 <- 2 * (b1 - 3)
    grad_b2 <- 2 * (b2 - 2)
    # Update parameters
    b1 <- b1 - learning_rate * grad_b1
    b2 <- b2 - learning_rate * grad_b2
  }
  return(path)
}

# Perform gradient descent
path <- gradient_descent(initial_b1 = -8, initial_b2 = 8, learning_rate = 0.2, steps = 15)

# Plot contour and gradient descent steps
ggplot(grid, aes(x = b1, y = b2, z = loss)) +
  geom_contour_filled(aes(fill = after_stat(level)), alpha = 0.8) +
  geom_point(data = path, aes(x = b1, y = b2), color = "red", size = 3) +
  geom_path(data = path, aes(x = b1, y = b2), color = "red", linewidth = 1.2) +
  labs(
    title = "Gradient Descent on a Loss Function",
    x = expression(b[1]),
    y = expression(b[2])
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# A second run from a different starting point with a smaller learning rate,
# reusing the loss function, grid, and gradient_descent() defined above
path2 <- gradient_descent(initial_b1 = 5, initial_b2 = 5, learning_rate = 0.1, steps = 10)

# Plot contour and gradient descent steps
ggplot(grid, aes(x = b1, y = b2, z = loss)) +
  geom_contour_filled(aes(fill = after_stat(level)), alpha = 0.8) +
  geom_point(data = path2, aes(x = b1, y = b2), color = "red", size = 3) +
  geom_path(data = path2, aes(x = b1, y = b2), color = "red", linewidth = 1) +
  labs(
    title = "Gradient Descent with a Smaller Learning Rate",
    x = expression(b[1]),
    y = expression(b[2])
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Addressing overfitting

  • L2 regularization
  • Dropout
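
Both techniques appear as single additions to a model definition. A minimal sketch, assuming the keras R package; the input shape, penalty, and dropout rate are illustrative placeholders.

Code
library(keras)

model <- keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu", input_shape = 10,
              kernel_regularizer = regularizer_l2(0.001)) %>%   # L2 penalty on this layer's weights
  layer_dropout(rate = 0.3) %>%                                 # randomly drop 30% of units each training step
  layer_dense(units = 1)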

Autoencoders

Code
# Define node positions
input_layer <- data.frame(x = rep(1, 32), y = seq(1, 32))
hidden_layer <- data.frame(x = rep(2, 2), y = seq(15, 16))  # Two units in the hidden layer
output_layer <- data.frame(x = rep(3, 32), y = seq(1, 32))

# Combine layers into one data frame
nodes <- rbind(
  cbind(input_layer, layer = "Input"),
  cbind(hidden_layer, layer = "Hidden"),
  cbind(output_layer, layer = "Output")
)

# Define connections (edges)
edges <- data.frame(
  x_start = rep(1, 32 * 2), y_start = rep(input_layer$y, each = 2),
  x_end = rep(2, 32 * 2), y_end = rep(hidden_layer$y, times = 32)
)
edges_out <- data.frame(
  x_start = rep(2, 2 * 32), y_start = rep(hidden_layer$y, each = 32),
  x_end = rep(3, 2 * 32), y_end = rep(output_layer$y, times = 2)
)
edges <- rbind(edges, edges_out)

# Plot using ggplot
ggplot() +
  geom_point(data = nodes, aes(x = x, y = y, color = layer), size = 3) +
  geom_segment(data = edges, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
               alpha = 0.2, color = "grey") +
  scale_color_manual(values = c("Input" = "blue", "Hidden" = "green", "Output" = "red")) +
  labs(title = "Autoencoder Diagram (32-2-32)") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    panel.grid = element_blank(),        # Remove grid lines
    axis.text = element_blank(),         # Remove axis text (numbers)
    axis.title = element_blank(),        # Remove axis titles
    axis.ticks = element_blank()         # Remove axis ticks
  )

  • Overview/example (see the keras sketch below)
  • Costs/benefits vs. PCA
    • autoencoders can capture non-linear structure
    • many autoencoder variants target different data types (images, time series)
    • autoencoders are very flexible (many parameters), which makes them prone to overfitting
    • autoencoders are computationally costly
    • autoencoders require expertise to set up and train
    • PCA may be more interpretable (each component is a linear combination of the original features)
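
A minimal keras sketch matching the 32-2-32 diagram above: the inputs are squeezed through a 2-unit bottleneck and reconstructed, with the inputs themselves used as the training targets. Assumes the keras R package; x stands in for a 32-column numeric matrix.

Code
library(keras)

autoencoder <- keras_model_sequential() %>%
  layer_dense(units = 2, activation = "relu", input_shape = 32) %>%  # encoder: 2-unit bottleneck
  layer_dense(units = 32, activation = "linear")                     # decoder: reconstruct the 32 inputs

autoencoder %>% compile(optimizer = "adam", loss = "mse")

# Train the network to reproduce its own inputs
# history <- autoencoder %>% fit(x, x, epochs = 50, batch_size = 32)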

Comparison of statistical algorithms

  • linear regression (linear model)
  • logistic regression (a generalized linear model)
  • glmnet (lasso and ridge regularization)
  • KNN
  • LDA, QDA
  • decision trees
  • bagged decision trees
  • random forest
  • MLPs (multi-layer perceptrons, i.e., neural networks)