Logistic Regression Practical

Classes PowerPoint Presentation

Load the packages used throughout: dplyr for data manipulation and piping, caret for the train/test split, and ggplot2 for plotting.

library(dplyr)
library(caret)
library(ggplot2)

Load the data and remove NAs

data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
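na.omit() keeps only the complete cases, which discards a sizeable share of the original 768 records (insulin and triceps have many missing values). A quick check of what remains (a minimal sketch, output not shown):

# rows left after dropping incomplete cases
nrow(PimaIndiansDiabetes2)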

Inspect the data

sample_n(PimaIndiansDiabetes2, 3)
    pregnant glucose pressure triceps insulin mass pedigree age diabetes
520        6     129       90       7     326 19.6    0.582  60      neg
214        0     140       65      26     130 42.6    0.431  24      pos
225        1     100       66      15      56 23.6    0.666  26      neg
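sample_n() shows a few random rows; for variable types and ranges, str() gives a fuller picture (sketch, output omitted):

str(PimaIndiansDiabetes2)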

Split the data into training and test sets

set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
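createDataPartition() samples within each level of the outcome, so the proportions of pos and neg should be roughly equal in both subsets. A quick check (sketch, output not shown):

prop.table(table(train.data$diabetes))
prop.table(table(test.data$diabetes))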

Fit a logistic regression model using all predictors (the formula diabetes ~ . includes every other column)

model <- glm(diabetes ~ ., data = train.data, family = binomial)

Summarize the model

summary(model)

Call:
glm(formula = diabetes ~ ., family = binomial, data = train.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.5832  -0.6544  -0.3292   0.6248   2.5968  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.053e+01  1.440e+00  -7.317 2.54e-13 ***
pregnant     1.005e-01  6.127e-02   1.640  0.10092    
glucose      3.710e-02  6.486e-03   5.719 1.07e-08 ***
pressure    -3.876e-04  1.383e-02  -0.028  0.97764    
triceps      1.418e-02  1.998e-02   0.710  0.47800    
insulin      5.940e-04  1.508e-03   0.394  0.69371    
mass         7.997e-02  3.180e-02   2.515  0.01190 *  
pedigree     1.329e+00  4.823e-01   2.756  0.00585 ** 
age          2.718e-02  2.020e-02   1.346  0.17840    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 398.80  on 313  degrees of freedom
Residual deviance: 267.18  on 305  degrees of freedom
AIC: 285.18

Number of Fisher Scoring iterations: 5
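The estimates are on the log-odds scale; exponentiating gives odds ratios, which are easier to report. For example, each additional unit of glucose multiplies the odds of diabetes by about exp(0.0371) ≈ 1.04. A minimal sketch (Wald intervals via confint.default()):

# odds ratios with 95% confidence intervals
exp(cbind(OR = coef(model), confint.default(model)))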

Make predictions

probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")

Model accuracy

mean(predicted.classes == test.data$diabetes)
[1] 0.7564103
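Accuracy alone hides the direction of the errors. A confusion matrix (a minimal sketch using base table(); caret::confusionMatrix() adds sensitivity and specificity) shows how the roughly 76% accuracy splits across the two classes:

# rows = predicted class, columns = observed class
table(Predicted = predicted.classes, Observed = test.data$diabetes)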
A simpler model uses glucose as the only predictor:

model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
summary(model)$coef
               Estimate  Std. Error   z value     Pr(>|z|)
(Intercept) -6.15882009 0.700096646 -8.797100 1.403974e-18
glucose      0.04327234 0.005341133  8.101716 5.418949e-16
Predict diabetes status for two new glucose values:

newdata <- data.frame(glucose = c(20, 180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
    1     2 
"neg" "pos" 
Visualize the fitted probability curve over the training data:

train.data %>%
  mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
  ggplot(aes(glucose, prob)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(
    title = "Logistic Regression Model", 
    x = "Plasma Glucose Concentration",
    y = "Probability of being diabete-pos"
  )
`geom_smooth()` using formula = 'y ~ x'

A multiple logistic regression uses several predictors at once:

model <- glm(diabetes ~ glucose + mass + pregnant,
             data = train.data, family = binomial)
summary(model)$coef
               Estimate  Std. Error   z value     Pr(>|z|)
(Intercept) -9.32369818 1.125997285 -8.280391 1.227711e-16
glucose      0.03886154 0.005404219  7.190962 6.433636e-13
mass         0.09458458 0.023529905  4.019760 5.825738e-05
pregnant     0.14466661 0.045125729  3.205857 1.346611e-03
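To see what this model implies for a single new observation, apply predict() to a one-row data frame (the values below are hypothetical, chosen only for illustration):

# hypothetical patient: glucose 150, BMI 35, 2 pregnancies
new.patient <- data.frame(glucose = 150, mass = 35, pregnant = 2)
predict(model, new.patient, type = "response")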
Refit the full model with all predictors:

model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coef
                 Estimate  Std. Error     z value     Pr(>|z|)
(Intercept) -1.053400e+01 1.439679266 -7.31690975 2.537464e-13
pregnant     1.005031e-01 0.061266974  1.64041157 1.009196e-01
glucose      3.709621e-02 0.006486093  5.71934633 1.069346e-08
pressure    -3.875933e-04 0.013826185 -0.02803328 9.776356e-01
triceps      1.417771e-02 0.019981885  0.70952823 4.779967e-01
insulin      5.939876e-04 0.001508231  0.39383055 6.937061e-01
mass         7.997447e-02 0.031798907  2.51500698 1.190300e-02
pedigree     1.329149e+00 0.482291020  2.75590704 5.852963e-03
age          2.718224e-02 0.020199295  1.34570257 1.783985e-01
coef() extracts just the estimates:

coef(model)
  (Intercept)      pregnant       glucose      pressure       triceps 
-1.053400e+01  1.005031e-01  3.709621e-02 -3.875933e-04  1.417771e-02 
      insulin          mass      pedigree           age 
 5.939876e-04  7.997447e-02  1.329149e+00  2.718224e-02 
As a reduced model, keep a subset of the predictors and refit:

model <- glm(diabetes ~ pregnant + glucose + pressure + mass + pedigree,
             data = train.data, family = binomial)

probabilities <- model %>% predict(test.data, type = "response")
head(probabilities)
       19        21        32        55        64        71 
0.1352603 0.5127526 0.6795376 0.7517408 0.2734867 0.1648174 
contrasts() shows how R dummy-codes the outcome: pos is coded 1, so the model predicts the probability of being diabetes-positive.

contrasts(test.data$diabetes)
    pos
neg   0
pos   1
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
   19    21    32    55    64    71 
"neg" "pos" "pos" "pos" "neg" "neg" 
mean(predicted.classes == test.data$diabetes)
[1] 0.7564103
library("mgcv")
Loading required package: nlme

Attaching package: 'nlme'

The following object is masked from 'package:dplyr':

    collapse

This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
# Fit the model
gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant,
                 data = train.data, family = "binomial")
# Summarize model
summary(gam.model)

Family: binomial 
Link function: logit 

Formula:
diabetes ~ s(glucose) + mass + pregnant

Parametric coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -4.59794    0.86982  -5.286 1.25e-07 ***
mass         0.09458    0.02353   4.020 5.83e-05 ***
pregnant     0.14467    0.04513   3.206  0.00135 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
           edf Ref.df Chi.sq p-value    
s(glucose)   1      1  51.71  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.339   Deviance explained = 29.8%
UBRE = -0.083171  Scale est. = 1         n = 314
# Make predictions
probabilities <- gam.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities> 0.5, "pos", "neg")
# Model Accuracy
mean(predicted.classes == test.data$diabetes)
[1] 0.7820513
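The edf of 1 for s(glucose) means the penalised smooth collapsed to an effectively linear term, which is why the parametric coefficients above match the earlier glm(diabetes ~ glucose + mass + pregnant) fit. The estimated smooth can be inspected directly (a sketch; this opens a plot device):

# fitted smooth of glucose on the log-odds scale, with a confidence band
plot(gam.model, pages = 1, shade = TRUE)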

Optional: an example implementation from scratch

library(ggplot2)
library(dplyr)


N <- 200 # number of points per class
D <- 2 # dimensionality, we use 2D data for easy visualization
K <- 2 # number of classes, binary for logistic regression
X <- data.frame() # data matrix (each row = single example, can view as xy coordinates)
y <- data.frame() # class labels

set.seed(56)

t <- seq(0, 1, length.out = N)
for (j in (1:K)){
  # t and m parameterise each class's arc; rnorm() adds scatter around it
  m <- rnorm(N, j + 0.5, 0.25)
  Xtemp <- data.frame(x1 = 3 * t, x2 = m - t)
  ytemp <- data.frame(matrix(j - 1, N, 1))
  X <- rbind(X, Xtemp)
  y <- rbind(y, ytemp)
}

data <- cbind(X,y)
colnames(data) <- c(colnames(X), 'label')

# create dir images
#dir.create(file.path('.', 'images'), showWarnings = FALSE)

# lets visualize the data:
data_plot <- ggplot(data) +
  geom_point(aes(x = x1, y = x2, color = as.character(label)), size = 2) +
  scale_colour_discrete(name = "Label") +
  ylim(0, 3) + coord_fixed(ratio = 1) +
  ggtitle('Data to be classified') +
  theme_bw(base_size = 12) +
  theme(legend.position = c(0.85, 0.87))

#png(file.path('images', 'data_plot.png'))
print(data_plot)

#dev.off()

# sigmoid function: the inverse logit, mapping log-odds to probabilities
sigmoid <- function(z){ 1 / (1 + exp(-z)) }

# cost function: mean negative log-likelihood (cross-entropy)
cost <- function(theta, X, y){
  m <- length(y) # number of training examples
  h <- sigmoid(X %*% theta)
  J <- (t(-y) %*% log(h) - t(1 - y) %*% log(1 - h)) / m
  J
}

# gradient of the cross-entropy cost: dJ/dtheta = t(X) %*% (h - y) / m
grad <- function(theta, X, y){
  m <- length(y)
  h <- sigmoid(X %*% theta)
  grad <- (t(X) %*% (h - y)) / m
  grad
}

logisticReg <- function(X, y){
  # remove NA rows
  X <- na.omit(X)
  y <- na.omit(y)
  # add bias (intercept) term and convert to matrix
  X <- mutate(X, bias = 1)
  # move the bias column to col 1
  X <- as.matrix(X[, c(ncol(X), 1:(ncol(X) - 1))])
  y <- as.matrix(y)
  # initialize theta to zeros
  theta <- matrix(rep(0, ncol(X)), nrow = ncol(X))
  # minimise the cost; BFGS actually uses the analytic gradient
  # (the default Nelder-Mead method would silently ignore grad)
  costOpti <- optim(theta, fn = cost, gr = grad, X = X, y = y, method = "BFGS")
  # return coefficients
  return(costOpti$par)
}

logisticProb <- function(theta, X){
  X <- na.omit(X)
  # add bias term and convert to matrix
  X <- mutate(X, bias = 1)
  X <- as.matrix(X[, c(ncol(X), 1:(ncol(X) - 1))])
  return(sigmoid(X %*% theta))
}

logisticPred <- function(prob){
  # classify with a 0.5 threshold (round the probability)
  return(round(prob, 0))
}

# training
theta <- logisticReg(X, y)
prob <- logisticProb(theta, X)
pred <- logisticPred(prob)
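As a sanity check, the from-scratch coefficients should be close to what glm() fits on the same simulated data (a sketch; small numerical differences from optim() are expected):

glm.check <- glm(label ~ x1 + x2, data = data, family = binomial)
cbind(glm = coef(glm.check), scratch = as.vector(theta))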

# generate a grid for decision boundary, this is the test set
grid <- expand.grid(seq(0, 3, length.out = 100), seq(0, 3, length.out = 100))
# predict the probability
probZ <- logisticProb(theta, grid)
# predict the label
Z <- logisticPred(probZ)
gridPred <- cbind(grid, Z)

# decision boundary visualization
p <- ggplot() +
  geom_point(data = data, aes(x = x1, y = x2, color = as.character(label)),
             size = 2, show.legend = FALSE) +
  geom_tile(data = gridPred, aes(x = Var1, y = Var2, fill = as.character(Z)),
            alpha = 0.3, show.legend = FALSE) +
  ylim(0, 3) +
  ggtitle('Decision Boundary for Logistic Regression') +
  coord_fixed(ratio = 1) +
  theme_bw(base_size = 12) 

#png(file.path('images', 'logistic_regression.png'))
print(p)
Warning: Removed 200 rows containing missing values (`geom_tile()`).

#dev.off()