3 Gradients

How are we finding the model that best fits the data? Through optimization! Here we estimate the parameters beta_0 and beta_1 by gradient descent on the log-loss and by gradient ascent on the log-likelihood, and compare the results with the function glm(), which already does all of this for us.

# Load necessary libraries
library(mlbench)
library(dplyr)

library(ggplot2)

# Load the Pima Indians Diabetes dataset
data("PimaIndiansDiabetes")
PimaIndiansDiabetes <- PimaIndiansDiabetes %>%
  select(glucose, diabetes) %>%
  mutate(diabetes_dummy = ifelse(diabetes == "pos", 1, 0))
# Data preparation
x <- PimaIndiansDiabetes$glucose
y <- PimaIndiansDiabetes$diabetes_dummy
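Before writing any optimizer, a quick look at what we just prepared (purely a sanity check; nothing new is computed here):

# x holds the raw glucose readings; y is 1 for "pos" and 0 for "neg"
str(PimaIndiansDiabetes)
table(PimaIndiansDiabetes$diabetes, PimaIndiansDiabetes$diabetes_dummy)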
# Sigmoid function
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}
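A quick check that the squashing behaves as expected (the probe values are arbitrary):

# sigmoid() maps 0 to 0.5 and pushes large |z| toward 0 or 1
sigmoid(0)           # 0.5
sigmoid(c(-10, 10))  # approximately 0 and 1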
# Log-loss function (for gradient descent); codes the equation from the slides
log_loss <- function(beta_0, beta_1, x, y) {
  z <- beta_0 + beta_1 * x
  y_pred <- sigmoid(z)
  -sum(y * log(y_pred) + (1 - y) * log(1 - y_pred))
}
# Log-likelihood function (for gradient ascent)
log_likelihood <- function(beta_0, beta_1, x, y) {
  z <- beta_0 + beta_1 * x
  y_pred <- sigmoid(z)
  sum(y * log(y_pred) + (1 - y) * log(1 - y_pred))
}
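For reference, a sketch of what these two functions compute, and the gradients used in the loops below. Note that log_loss sums over the n observations while the gradient steps use means; the factor 1/n only rescales the step size and does not move the optimum.

$$\ell(\beta_0,\beta_1)=\sum_{i=1}^{n}\Big[y_i\log\hat y_i+(1-y_i)\log(1-\hat y_i)\Big],\qquad \hat y_i=\sigma(\beta_0+\beta_1 x_i),\qquad \text{log-loss}=-\ell.$$

$$\frac{\partial}{\partial\beta_0}\Big(\tfrac{1}{n}\,\text{log-loss}\Big)=\frac{1}{n}\sum_{i=1}^{n}(\hat y_i-y_i),\qquad \frac{\partial}{\partial\beta_1}\Big(\tfrac{1}{n}\,\text{log-loss}\Big)=\frac{1}{n}\sum_{i=1}^{n}(\hat y_i-y_i)\,x_i.$$

Gradient descent then updates $\beta\leftarrow\beta-\eta\,\nabla$ on the log-loss, and gradient ascent updates $\beta\leftarrow\beta+\eta\,\nabla$ on $\ell$, with learning rate $\eta$.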
# Gradient descent implementation
gradient_descent <- function(x, y, learning_rate, iterations) { # we can play with these two!
  store_beta_0 <- numeric(iterations)
  store_beta_1 <- numeric(iterations)
  beta_0 <- -5  # starting value for the intercept; feel free to experiment
  beta_1 <- 1   # starting value for the slope; feel free to experiment
  for (i in 1:iterations) {
    z <- beta_0 + beta_1 * x   # linear predictor (the logit)
    y_pred <- sigmoid(z)       # logistic regression probabilities
    grad_beta_0 <- mean(y_pred - y)        # partial derivative w.r.t. the intercept
    grad_beta_1 <- mean((y_pred - y) * x)  # partial derivative w.r.t. the slope
    beta_0 <- beta_0 - learning_rate * grad_beta_0  # the gradient descent update!
    beta_1 <- beta_1 - learning_rate * grad_beta_1
    store_beta_0[i] <- beta_0
    store_beta_1[i] <- beta_1
  }
  data.frame(beta_1 = store_beta_1, beta_0 = store_beta_0)
}
gd_result <- gradient_descent(x, y, learning_rate = 0.0001, iterations = 10000)
#View(gd_result)

Parameter values after 10000 iterations (glucose is unscaled, so the gradients are large and the learning rate has to be small; the slope approaches the glm estimate below fairly quickly, while the intercept drifts toward it much more slowly):

paste0("Glucose/Slope: ", gd_result$beta_1[nrow(gd_result)])
paste0("Intercept: ", gd_result$beta_0[nrow(gd_result)])
#ggplot(gd_result %>% mutate(iteration = row_number()), aes(iteration, beta_0)) +
#  geom_point()
#ggplot(gd_result %>% mutate(iteration = row_number()), aes(iteration, beta_1)) +
#  geom_point()
# Gradient ascent implementation
gradient_ascent <- function(x, y, learning_rate = 0.0001, iterations = 1000) {
  store_beta_0 <- numeric(iterations)
  store_beta_1 <- numeric(iterations)
  beta_0 <- 0  # starting value for the intercept
  beta_1 <- 0  # starting value for the slope
  for (i in 1:iterations) {
    z <- beta_0 + beta_1 * x
    y_pred <- sigmoid(z)
    grad_beta_0 <- mean(y - y_pred)        # gradient of the (mean) log-likelihood
    grad_beta_1 <- mean((y - y_pred) * x)
    beta_0 <- beta_0 + learning_rate * grad_beta_0  # note the +: we climb uphill
    beta_1 <- beta_1 + learning_rate * grad_beta_1
    store_beta_0[i] <- beta_0
    store_beta_1[i] <- beta_1
  }
  data.frame(beta_1 = store_beta_1, beta_0 = store_beta_0)
}

What do you notice as the difference? The sign! Because the log-likelihood is just the negative of the log-loss, climbing one is exactly descending the other, as the quick check below confirms.
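A quick numerical check of that relationship, evaluated at an arbitrary illustrative pair of parameter values:

# log_likelihood() is exactly -log_loss(), so climbing one is descending the other
log_loss(-5, 0.03, x, y)
-log_likelihood(-5, 0.03, x, y)  # prints the same number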

ga_result <- gradient_ascent(x, y, learning_rate = 0.0001, iterations = 10000)
#View(ga_result)
paste0("Glucose/Slope: ", ga_result$beta_1[nrow(ga_result)])
paste0("Intercept: ", ga_result$beta_0[nrow(ga_result)])

The same! Up to small numerical differences from the finite number of iterations, both routes land on the same estimates: maximizing the log-likelihood is exactly minimizing the log-loss.

If we fit the logistic regression model with glm() as before, we get:

# Fit a logistic regression model using glm
glm_model <- glm(diabetes_dummy ~ glucose, data = PimaIndiansDiabetes, family = binomial)
coef(glm_model)
(Intercept)     glucose 
-5.35008039  0.03787304 
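As a quick use of these coefficients, we can turn them into a predicted probability; the glucose value of 150 is just an arbitrary illustration, and the two lines below should agree:

# Probability of diabetes at glucose = 150: by hand via the sigmoid,
# and via predict() on the fitted model
b <- coef(glm_model)
sigmoid(unname(b["(Intercept)"] + b["glucose"] * 150))
predict(glm_model, newdata = data.frame(glucose = 150), type = "response")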