3 Gradients

How are we finding the model that best fits the data? Through optimization! Here we estimate the parameters beta_0 and beta_1 by gradient descent on the log-loss and by gradient ascent on the log-likelihood, and compare the results with the function glm(), which already does all of this for us.

# Load necessary libraries
library(mlbench)
library(dplyr)

library(ggplot2)

# Load the Pima Indians Diabetes dataset
data("PimaIndiansDiabetes")
PimaIndiansDiabetes <- PimaIndiansDiabetes %>%
  select(glucose, diabetes) %>%
  mutate(diabetes_dummy = ifelse(diabetes == "pos", 1, 0))
# Data preparation
x <- PimaIndiansDiabetes$glucose
y <- PimaIndiansDiabetes$diabetes_dummy
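Before writing any optimizer, a quick look at what we just prepared (purely a sanity check; nothing new is computed here):

# x holds the raw glucose readings; y is 1 for "pos" and 0 for "neg"
str(PimaIndiansDiabetes)
table(PimaIndiansDiabetes$diabetes, PimaIndiansDiabetes$diabetes_dummy)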
# Sigmoid function
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}
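A quick check that the squashing behaves as expected (the probe values are arbitrary):

# sigmoid() maps 0 to 0.5 and pushes large |z| toward 0 or 1
sigmoid(0)           # 0.5
sigmoid(c(-10, 10))  # approximately 0 and 1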
# Log-loss function (for gradient descent); codes the equation from the slides
log_loss <- function(beta_0, beta_1, x, y) {
  z <- beta_0 + beta_1 * x
  y_pred <- sigmoid(z)
  -sum(y * log(y_pred) + (1 - y) * log(1 - y_pred))
}
# Log-likelihood function (for gradient ascent)
log_likelihood <- function(beta_0, beta_1, x, y) {
  z <- beta_0 + beta_1 * x
  y_pred <- sigmoid(z)
  sum(y * log(y_pred) + (1 - y) * log(1 - y_pred))
}
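For reference, a sketch of what these two functions compute, and the gradients used in the loops below. Note that log_loss sums over the n observations while the gradient steps use means; the factor 1/n only rescales the step size and does not move the optimum.

$$\ell(\beta_0,\beta_1)=\sum_{i=1}^{n}\Big[y_i\log\hat y_i+(1-y_i)\log(1-\hat y_i)\Big],\qquad \hat y_i=\sigma(\beta_0+\beta_1 x_i),\qquad \text{log-loss}=-\ell.$$

$$\frac{\partial}{\partial\beta_0}\Big(\tfrac{1}{n}\,\text{log-loss}\Big)=\frac{1}{n}\sum_{i=1}^{n}(\hat y_i-y_i),\qquad \frac{\partial}{\partial\beta_1}\Big(\tfrac{1}{n}\,\text{log-loss}\Big)=\frac{1}{n}\sum_{i=1}^{n}(\hat y_i-y_i)\,x_i.$$

Gradient descent then updates $\beta\leftarrow\beta-\eta\,\nabla$ on the log-loss, and gradient ascent updates $\beta\leftarrow\beta+\eta\,\nabla$ on $\ell$, with learning rate $\eta$.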
# Gradient descent implementation
gradient_descent <- function(x, y, learning_rate, iterations) { # we can play with these two!
  store_beta_0 <- numeric(iterations)
  store_beta_1 <- numeric(iterations)
  beta_0 <- -5  # starting value for the intercept; feel free to experiment
  beta_1 <- 1   # starting value for the slope; feel free to experiment
  for (i in 1:iterations) {
    z <- beta_0 + beta_1 * x   # linear predictor (the logit)
    y_pred <- sigmoid(z)       # logistic regression probabilities
    grad_beta_0 <- mean(y_pred - y)        # partial derivative w.r.t. the intercept
    grad_beta_1 <- mean((y_pred - y) * x)  # partial derivative w.r.t. the slope
    beta_0 <- beta_0 - learning_rate * grad_beta_0  # the gradient descent update!
    beta_1 <- beta_1 - learning_rate * grad_beta_1
    store_beta_0[i] <- beta_0
    store_beta_1[i] <- beta_1
  }
  data.frame(beta_1 = store_beta_1, beta_0 = store_beta_0)
}
gd_result <- gradient_descent(x, y, learning_rate = 0.0001, iterations = 10000)
#View(gd_result)

Parameter values after 10000 iterations (glucose is unscaled, so the gradients are large and the learning rate has to be small; the slope approaches the glm estimate below fairly quickly, while the intercept drifts toward it much more slowly):

paste0("Glucose/Slope: ", gd_result$beta_1[nrow(gd_result)])
paste0("Intercept: ", gd_result$beta_0[nrow(gd_result)])
#ggplot(gd_result %>% mutate(iteration = row_number()), aes(iteration, beta_0)) +
#  geom_point()
#ggplot(gd_result %>% mutate(iteration = row_number()), aes(iteration, beta_1)) +
#  geom_point()
# Gradient ascent implementation
gradient_ascent <- function(x, y, learning_rate = 0.0001, iterations = 1000) {
  store_beta_0 <- numeric(iterations)
  store_beta_1 <- numeric(iterations)
  beta_0 <- 0  # starting value for the intercept
  beta_1 <- 0  # starting value for the slope
  for (i in 1:iterations) {
    z <- beta_0 + beta_1 * x
    y_pred <- sigmoid(z)
    grad_beta_0 <- mean(y - y_pred)        # gradient of the (mean) log-likelihood
    grad_beta_1 <- mean((y - y_pred) * x)
    beta_0 <- beta_0 + learning_rate * grad_beta_0  # note the +: we climb uphill
    beta_1 <- beta_1 + learning_rate * grad_beta_1
    store_beta_0[i] <- beta_0
    store_beta_1[i] <- beta_1
  }
  data.frame(beta_1 = store_beta_1, beta_0 = store_beta_0)
}

What do you notice as the difference? The sign! Because the log-likelihood is just the negative of the log-loss, climbing one is exactly descending the other, as the quick check below confirms.
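A quick numerical check of that relationship, evaluated at an arbitrary illustrative pair of parameter values:

# log_likelihood() is exactly -log_loss(), so climbing one is descending the other
log_loss(-5, 0.03, x, y)
-log_likelihood(-5, 0.03, x, y)  # prints the same number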

ga_result <- gradient_ascent(x, y, learning_rate = 0.0001, iterations = 10000)
#View(ga_result)
paste0("Glucose/Slope: ", ga_result$beta_1[nrow(ga_result)])
paste0("Intercept: ", ga_result$beta_0[nrow(ga_result)])

The same! Up to small numerical differences from the finite number of iterations, both routes land on the same estimates: maximizing the log-likelihood is exactly minimizing the log-loss.

If we fit the logistic regression model with glm() as before, we get:

# Fit a logistic regression model using glm
glm_model <- glm(diabetes_dummy ~ glucose, data = PimaIndiansDiabetes, family = binomial)
coef(glm_model)
(Intercept)     glucose 
-5.35008039  0.03787304 
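As a quick use of these coefficients, we can turn them into a predicted probability; the glucose value of 150 is just an arbitrary illustration, and the two lines below should agree:

# Probability of diabetes at glucose = 150: by hand via the sigmoid,
# and via predict() on the fitted model
b <- coef(glm_model)
sigmoid(unname(b["(Intercept)"] + b["glucose"] * 150))
predict(glm_model, newdata = data.frame(glucose = 150), type = "response")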