IAML Unit 10: Discussion

keras vs. nnet

  • keras vs. keras in tidymodels (compare the engine sketches below)
    • tidymodels is a unified wrapper around many R modeling packages
    • setting the keras engine in tidymodels calls the keras R package under the hood
    • keras is available in both R and Python
    • keras is itself a wrapper around TensorFlow
  • nnet, the default engine for mlp() in tidymodels
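
The same model specification can be pointed at either engine. A minimal sketch, assuming tidymodels (parsnip) is installed and, for the keras engine, that keras/TensorFlow is configured; the argument values are illustrative only.

Code
library(tidymodels)

# keras engine: fits the network through the keras R package (and TensorFlow)
mlp_keras <- mlp(hidden_units = 8, epochs = 50, activation = "relu") %>%
  set_engine("keras") %>%
  set_mode("classification")

# nnet engine: single-hidden-layer network from the nnet package
mlp_nnet <- mlp(hidden_units = 8, penalty = 0.01) %>%
  set_engine("nnet") %>%
  set_mode("classification")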

All the terms and concepts!

  • layers
  • units
  • weights, biases
  • activation functions
  • gradient descent
  • forward and back propagation
  • learning rate
  • epochs
  • stochastic and mini-batch gradient descent (and Adam)
  • batches
  • drop out and regularization
  • deep learning
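
Most of these terms map directly onto arguments in a keras model definition. A minimal, illustrative sketch, assuming the keras R package with a TensorFlow backend; x_train and y_train are placeholder objects, not data from this unit.

Code
library(keras)

# layers, units, activation functions, dropout
model <- keras_model_sequential() %>%
  layer_dense(units = 16, activation = "relu", input_shape = 4) %>%  # hidden layer with 16 units
  layer_dropout(rate = 0.2) %>%                                      # dropout regularization
  layer_dense(units = 3, activation = "softmax")                     # output layer (3 classes)

# learning rate and Adam (a refinement of mini-batch stochastic gradient descent)
model %>% compile(
  optimizer = optimizer_adam(learning_rate = 0.001),
  loss = "categorical_crossentropy",
  metrics = "accuracy"
)

# epochs and batch size control how many passes are made and how large each mini-batch is
# history <- model %>% fit(x_train, y_train, epochs = 20, batch_size = 32)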

A multi-layer neural network

Code
library(ggplot2)

# Define node positions
input_layer <- data.frame(x = rep(1, 6), y = seq(1, 6))                   # 6 input units
hidden_layer1 <- data.frame(x = rep(2, 4), y = seq(1.5, 4.5, length.out = 4))  # 4 units in the first hidden layer
hidden_layer2 <- data.frame(x = rep(3, 3), y = seq(2, 4, length.out = 3))      # 3 units in the second hidden layer
output_layer <- data.frame(x = rep(4, 1), y = 3)                               # 1 output unit

# Combine layers into one data frame
nodes <- rbind(
  cbind(input_layer, layer = "Input"),
  cbind(hidden_layer1, layer = "Hidden1"),
  cbind(hidden_layer2, layer = "Hidden2"),
  cbind(output_layer, layer = "Output")
)

# Define connections (edges)
# From input to first hidden layer
edges_input_hidden1 <- data.frame(
  x_start = rep(1, 6 * 4), y_start = rep(input_layer$y, each = 4),
  x_end = rep(2, 6 * 4), y_end = rep(hidden_layer1$y, times = 6)
)

# From first hidden layer to second hidden layer
edges_hidden1_hidden2 <- data.frame(
  x_start = rep(2, 4 * 3), y_start = rep(hidden_layer1$y, each = 3),
  x_end = rep(3, 4 * 3), y_end = rep(hidden_layer2$y, times = 4)
)

# From second hidden layer to output layer
edges_hidden2_output <- data.frame(
  x_start = rep(3, 3), y_start = hidden_layer2$y,
  x_end = rep(4, 3), y_end = rep(output_layer$y, times = 3)
)

# Combine edges
edges <- rbind(edges_input_hidden1, edges_hidden1_hidden2, edges_hidden2_output)

# Plot using ggplot
ggplot() +
  geom_point(data = nodes, aes(x = x, y = y, color = layer), size = 3) +
  geom_segment(data = edges, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
               alpha = 0.2, color = "grey") +
  scale_color_manual(values = c("Input" = "blue", "Hidden1" = "green", "Hidden2" = "orange", "Output" = "red")) +
  labs(title = "Neural Network Diagram") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    panel.grid = element_blank(),        # Remove grid lines
    axis.text = element_blank(),         # Remove axis text (numbers)
    axis.title = element_blank(),        # Remove axis titles
    axis.ticks = element_blank()         # Remove axis ticks
  )
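
The diagram above shows 6 input units, two hidden layers with 4 and 3 units, and one output unit. A minimal forward-propagation sketch through that same 6-4-3-1 architecture, using randomly initialized weights and biases (illustrative values only):

Code
set.seed(1)
relu <- function(z) pmax(0, z)

x  <- rnorm(6)                                          # one observation with 6 features
W1 <- matrix(rnorm(4 * 6), nrow = 4); b1 <- rnorm(4)    # weights and biases: input -> hidden 1
W2 <- matrix(rnorm(3 * 4), nrow = 3); b2 <- rnorm(3)    # hidden 1 -> hidden 2
W3 <- matrix(rnorm(1 * 3), nrow = 1); b3 <- rnorm(1)    # hidden 2 -> output

h1  <- relu(W1 %*% x + b1)    # first hidden layer activations (4 units)
h2  <- relu(W2 %*% h1 + b2)   # second hidden layer activations (3 units)
out <- W3 %*% h2 + b3         # single linear output unit (e.g., regression)
out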

A single neural network unit

Code
library(grid)  # for arrow() and unit() used in the plot below

# Create input positions
inputs <- data.frame(
  x = rep(1, 3),
  y = seq(1, 3),
  label = c("X1", "X2", "X3")
)

# Create circle position for the neural network unit
unit <- data.frame(
  x = 2,
  y = 2,
  label = "Unit"
)

# Create output arrow position
output <- data.frame(
  x = 3,
  y = 2,
  label = "Output"
)

# Create arrows (connections)
arrows <- rbind(
  data.frame(x_start = rep(1, 3), y_start = seq(1, 3), x_end = 2, y_end = rep(2, 3)),
  data.frame(x_start = 2, y_start = 2, x_end = 3, y_end = 2)
)

# Plot using ggplot
ggplot() +
  # Plot inputs
  geom_point(data = inputs, aes(x = x, y = y), size = 4, color = "blue") +
  geom_text(data = inputs, aes(x = x - 0.1, y = y, label = label), hjust = 1, color = "black") +
  
  # Plot neural network unit (circle)
  geom_point(data = unit, aes(x = x, y = y), size = 8, shape = 21, fill = "green", color = "black") +
  geom_text(data = unit, aes(x = x, y = y + 0.4, label = label), color = "black") +
  
  # Plot output arrow
  geom_point(data = output, aes(x = x, y = y), size = 4, color = "red") +
  geom_text(data = output, aes(x = x + 0.1, y = y, label = label), hjust = 0, color = "black") +
  
  # Add arrows
  geom_segment(data = arrows, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
               arrow = arrow(length = unit(0.2, "cm")), color = "grey") +
  
  # Customize plot appearance
  theme_minimal() +
  theme(
    panel.grid = element_blank(),        # Remove grid lines
    axis.text = element_blank(),         # Remove axis numbers
    axis.title = element_blank(),        # Remove axis labels
    axis.ticks = element_blank()         # Remove axis ticks
  ) +
  labs(title = "Neural Network Diagram")
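
The unit in the diagram computes a weighted sum of its inputs plus a bias and passes the result through an activation function. A minimal sketch with made-up weights and bias:

Code
x <- c(0.5, -1.2, 2.0)    # inputs X1, X2, X3
w <- c(0.8, 0.1, 0.4)     # weights (illustrative values)
b <- 0.2                  # bias

z <- sum(w * x) + b       # weighted sum plus bias
output <- max(0, z)       # ReLU activation
output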

Activation functions

  • step/perceptron (not used in practice; historically significant)
  • linear (generally not used in hidden layers; can be used in the output layer for regression)
  • sigmoid (rarely used in hidden layers now; historically significant)
  • softmax (used in the output layer for multi-class classification; sketched after the plots below)
  • relu (the most common choice for hidden layers)
Code
library(gridExtra)

# Define activation functions
step_function <- function(x) {
  ifelse(x > 0, 1, 0)
}

linear <- function(x) {
  x
}

sigmoid <- function(x) {
  1 / (1 + exp(-x))
}

relu <- function(x) {
  ifelse(x > 0, x, 0)
}

# Generate input range
x <- seq(-10, 10, length.out = 100)

# Create individual data frames for each function
df_step <- data.frame(x = x, y = step_function(x))
df_linear <- data.frame(x = x, y = linear(x))
df_sigmoid <- data.frame(x = x, y = sigmoid(x))
df_relu <- data.frame(x = x, y = relu(x))

# Create individual ggplot objects for each function
plot_step <- ggplot(df_step, aes(x = x, y = y)) +
  geom_line(color = "blue", size = 1) +
  labs(title = "Step Function", x = "Input", y = "Output") +
  theme_minimal()

plot_sigmoid <- ggplot(df_sigmoid, aes(x = x, y = y)) +
  geom_line(color = "green", size = 1) +
  labs(title = "Sigmoid Function", x = "Input", y = "Output") +
  theme_minimal()

plot_relu <- ggplot(df_relu, aes(x = x, y = y)) +
  geom_line(color = "red", size = 1) +
  labs(title = "ReLU Function", x = "Input", y = "Output") +
  theme_minimal()

plot_linear <- ggplot(df_linear, aes(x = x, y = y)) +
  geom_line(color = "purple", size = 1) +
  labs(title = "Linear Function", x = "Input", y = "Output") +
  theme_minimal()

# Arrange the plots in a 2x2 grid
grid.arrange(plot_step, plot_sigmoid, plot_relu, plot_linear, nrow = 2, ncol = 2)
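
Softmax is not in the grid above because it acts on a whole vector of scores rather than a single input, turning them into probabilities that sum to 1. A minimal sketch:

Code
softmax <- function(z) {
  exp_z <- exp(z - max(z))   # subtract the max for numerical stability
  exp_z / sum(exp_z)
}

softmax(c(2.0, 1.0, 0.1))   # roughly 0.66, 0.24, 0.10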

Gradient descent

Code
# Load plotly library
library(plotly)

# Generate a grid of values for b1 and b2
b1 <- seq(-10, 10, length.out = 100)
b2 <- seq(-10, 10, length.out = 100)
grid <- expand.grid(b1 = b1, b2 = b2)

# Define the paraboloid function
grid$z <- with(grid, b1^2 + b2^2)

# Convert grid to matrix for plotting
z_matrix <- matrix(grid$z, nrow = length(b1), ncol = length(b2))

# Create the 3D plot
plot_ly(
  x = ~b1, y = ~b2, z = ~z_matrix,
  type = "surface"
) %>%
  layout(
    title = "3D Bowl-like Paraboloid",
    scene = list(
      xaxis = list(title = "b1"),
      yaxis = list(title = "b2"),
      zaxis = list(title = "Loss")
    )
  )
Code
# Define a sample loss function (e.g., a quadratic function)
loss_function <- function(b1, b2) {
  (b1 - 3)^2 + (b2 - 2)^2
}

# Generate a grid of values for b1 and b2
b1 <- seq(-10, 10, length.out = 100)
b2 <- seq(-10, 10, length.out = 100)
grid <- expand.grid(b1 = b1, b2 = b2)
grid$loss <- with(grid, loss_function(b1, b2))

# Gradient descent simulation
gradient_descent <- function(initial_b1, initial_b2, learning_rate, steps) {
  path <- data.frame(b1 = numeric(steps + 1), b2 = numeric(steps + 1), loss = numeric(steps + 1))
  b1 <- initial_b1
  b2 <- initial_b2
  for (i in 1:(steps + 1)) {
    path[i, ] <- c(b1, b2, loss_function(b1, b2))
    # Compute gradients (partial derivatives)
    grad_b1 <- 2 * (b1 - 3)
    grad_b2 <- 2 * (b2 - 2)
    # Update parameters
    b1 <- b1 - learning_rate * grad_b1
    b2 <- b2 - learning_rate * grad_b2
  }
  return(path)
}

# Perform gradient descent
path <- gradient_descent(initial_b1 = -8, initial_b2 = 8, learning_rate = 0.2, steps = 15)

# Plot contour and gradient descent steps
ggplot(grid, aes(x = b1, y = b2, z = loss)) +
  geom_contour_filled(aes(fill = after_stat(level)), alpha = 0.8) +
  geom_point(data = path, aes(x = b1, y = b2), color = "red", size = 3) +
  geom_path(data = path, aes(x = b1, y = b2), color = "red", linewidth = 1.2) +
  labs(
    title = "Gradient Descent on a Loss Function",
    x = expression(b[1]),
    y = expression(b[2])
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# A second run from a different starting point with a smaller learning rate,
# reusing the loss function, grid, and gradient_descent() defined above
path2 <- gradient_descent(initial_b1 = 5, initial_b2 = 5, learning_rate = 0.1, steps = 10)

# Plot contour and gradient descent steps
ggplot(grid, aes(x = b1, y = b2, z = loss)) +
  geom_contour_filled(aes(fill = after_stat(level)), alpha = 0.8) +
  geom_point(data = path2, aes(x = b1, y = b2), color = "red", size = 3) +
  geom_path(data = path2, aes(x = b1, y = b2), color = "red", linewidth = 1) +
  labs(
    title = "Gradient Descent with a Smaller Learning Rate",
    x = expression(b[1]),
    y = expression(b[2])
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Addressing overfitting

  • L2 regularization
  • Dropout
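
Both techniques appear as single additions to a model definition. A minimal sketch, assuming the keras R package; the input shape, penalty, and dropout rate are illustrative placeholders.

Code
library(keras)

model <- keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu", input_shape = 10,
              kernel_regularizer = regularizer_l2(0.001)) %>%   # L2 penalty on this layer's weights
  layer_dropout(rate = 0.3) %>%                                 # randomly drop 30% of units each training step
  layer_dense(units = 1)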

Autoencoders

Code
# Define node positions
input_layer <- data.frame(x = rep(1, 32), y = seq(1, 32))
hidden_layer <- data.frame(x = rep(2, 2), y = seq(15, 16))  # Two units in the hidden layer
output_layer <- data.frame(x = rep(3, 32), y = seq(1, 32))

# Combine layers into one data frame
nodes <- rbind(
  cbind(input_layer, layer = "Input"),
  cbind(hidden_layer, layer = "Hidden"),
  cbind(output_layer, layer = "Output")
)

# Define connections (edges)
edges <- data.frame(
  x_start = rep(1, 32 * 2), y_start = rep(input_layer$y, each = 2),
  x_end = rep(2, 32 * 2), y_end = rep(hidden_layer$y, times = 32)
)
edges_out <- data.frame(
  x_start = rep(2, 2 * 32), y_start = rep(hidden_layer$y, each = 32),
  x_end = rep(3, 2 * 32), y_end = rep(output_layer$y, times = 2)
)
edges <- rbind(edges, edges_out)

# Plot using ggplot
ggplot() +
  geom_point(data = nodes, aes(x = x, y = y, color = layer), size = 3) +
  geom_segment(data = edges, aes(x = x_start, y = y_start, xend = x_end, yend = y_end),
               alpha = 0.2, color = "grey") +
  scale_color_manual(values = c("Input" = "blue", "Hidden" = "green", "Output" = "red")) +
  labs(title = "Autoencoder Diagram (32-2-32)") +
  theme_minimal() +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    panel.grid = element_blank(),        # Remove grid lines
    axis.text = element_blank(),         # Remove axis text (numbers)
    axis.title = element_blank(),        # Remove axis titles
    axis.ticks = element_blank()         # Remove axis ticks
  )

  • Overview/example (see the keras sketch below)
  • Costs/benefits vs. PCA
    • autoencoders can capture non-linear structure
    • many autoencoder variants target different data types (images, time series)
    • autoencoders are very flexible (many parameters), which makes them prone to overfitting
    • autoencoders are computationally costly
    • autoencoders require expertise to set up and train
    • PCA may be more interpretable (each component is a linear combination of the original features)
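
A minimal keras sketch matching the 32-2-32 diagram above: the inputs are squeezed through a 2-unit bottleneck and reconstructed, with the inputs themselves used as the training targets. Assumes the keras R package; x stands in for a 32-column numeric matrix.

Code
library(keras)

autoencoder <- keras_model_sequential() %>%
  layer_dense(units = 2, activation = "relu", input_shape = 32) %>%  # encoder: 2-unit bottleneck
  layer_dense(units = 32, activation = "linear")                     # decoder: reconstruct the 32 inputs

autoencoder %>% compile(optimizer = "adam", loss = "mse")

# Train the network to reproduce its own inputs
# history <- autoencoder %>% fit(x, x, epochs = 50, batch_size = 32)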

Comparison of statistical algorithms

  • linear regression (linear model)
  • logistic regression (a generalized linear model)
  • glmnet (lasso and ridge regularization)
  • KNN
  • LDA, QDA
  • decision trees
  • bagged decision trees
  • random forest
  • MLPs (multi-layer perceptrons, i.e., neural networks)