Titanic Series (Part 1) - Logistic Regression

Peng Chen

May 8, 2021

library(titanic)
library(tidyverse)
titanic_df <- titanic_train %>% 
  as_tibble() %>% 
  janitor::clean_names() %>% 
  mutate(
    survived = case_when(
      survived == "1" ~ "survived",
      TRUE ~ "died"
    ) %>%
      as.factor() %>%
      fct_relevel(c("died", "survived"))
  ) %>%
  mutate(
    cabin = case_when(
      cabin == "" ~ NA_character_,
      TRUE ~ cabin
    ),
     embarked = case_when(
      embarked == "" ~ NA_character_,
      TRUE ~ embarked
    ),
    across(c(pclass, sex, embarked), as.factor),
    passenger_id = as.character(passenger_id)
  )
# skimr::skim(titanic_df)
glimpse(titanic_df)
## Rows: 891
## Columns: 12
## $ passenger_id <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",…
## $ survived     <fct> died, survived, survived, survived, died, died, died, di…
## $ pclass       <fct> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3,…
## $ name         <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (…
## $ sex          <fct> male, female, female, female, male, male, male, male, fe…
## $ age          <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14…
## $ sib_sp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1,…
## $ parch        <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0,…
## $ ticket       <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "…
## $ fare         <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.862…
## $ cabin        <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", …
## $ embarked     <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S,…
titanic_df %>% 
  count(survived) %>% 
  mutate(pct = n / sum(n))
## # A tibble: 2 x 3
##   survived     n   pct
## * <fct>    <int> <dbl>
## 1 died       549 0.616
## 2 survived   342 0.384
library(tidymodels)
set.seed(123)
titanic_split <- initial_split(titanic_df, 0.75, strata = survived)
titanic_training <- titanic_split %>% training()
titanic_testing <- titanic_split %>% testing()

logistic_model <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")
titanic_recipe <- recipe(
  survived ~ fare + sex + sib_sp + parch + pclass,
  data = titanic_training
) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes())

titanic_recipe_prep <- titanic_recipe %>% prep(titanic_training)
titanic_training_prep <- titanic_recipe_prep %>% bake(titanic_training)
titanic_testing_prep <- titanic_recipe_prep %>% bake(titanic_testing)

logistic_fit <- logistic_model %>% 
  fit(survived ~ ., titanic_training_prep)
levels(titanic_training_prep$survived)
## [1] "died"     "survived"
tidy(logistic_fit)
## # A tibble: 7 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   2.05       0.289     7.10  1.23e-12
## 2 fare          0.218      0.150     1.45  1.48e- 1
## 3 sib_sp       -0.295      0.130    -2.28  2.26e- 2
## 4 parch        -0.0813     0.107    -0.760 4.47e- 1
## 5 sex_male     -2.77       0.223   -12.4   1.83e-35
## 6 pclass_X2    -0.551      0.334    -1.65  9.91e- 2
## 7 pclass_X3    -1.37       0.309    -4.44  9.01e- 6
pred_class <- logistic_fit %>% 
  predict(titanic_testing_prep, type = "class")
pred_prob <- logistic_fit %>% 
  predict(titanic_testing_prep, type = "prob")
titantic_testing_results <- titanic_testing %>% 
  select(survived) %>% 
  bind_cols(pred_class, pred_prob)
titantic_testing_results %>% 
  conf_mat(survived, .pred_class)
##           Truth
## Prediction died survived
##   died      118       29
##   survived   19       56
titantic_testing_results %>% 
  conf_mat(survived, .pred_class) %>% 
  autoplot(type = "mosaic")

titantic_testing_results %>% 
  sens(survived, .pred_class, event_level = "second")
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 sens    binary         0.659
titantic_testing_results %>% 
  spec(survived, .pred_class, event_level = "second")
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 spec    binary         0.861
titantic_testing_results %>% accuracy(survived, .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.784
titantic_testing_results %>% 
  roc_curve(truth = survived, .pred_survived, event_level = "second") 
## # A tibble: 161 x 3
##    .threshold specificity sensitivity
##         <dbl>       <dbl>       <dbl>
##  1  -Inf          0                 1
##  2     0.0570     0                 1
##  3     0.0582     0.00730           1
##  4     0.0746     0.0146            1
##  5     0.0748     0.0219            1
##  6     0.0823     0.0292            1
##  7     0.0878     0.0365            1
##  8     0.0901     0.0438            1
##  9     0.0902     0.0511            1
## 10     0.0916     0.0584            1
## # … with 151 more rows
titantic_testing_results %>% 
  roc_curve(truth = survived, .pred_survived, event_level = "second") %>% 
  autoplot()

titantic_testing_results %>% 
  roc_auc(truth = survived, .pred_survived, event_level = "second")
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.847
titanic_metrics <- metric_set(roc_auc, sens, spec, accuracy)
titantic_testing_results %>% 
  titanic_metrics(
    truth = survived, estimate = .pred_class, .pred_survived,
    event_level = "second"
  )
## # A tibble: 4 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.659
## 2 spec     binary         0.861
## 3 accuracy binary         0.784
## 4 roc_auc  binary         0.847