Titanic Series (Part 5) - Logistic Regression, Cross-Validation

Peng Chen

May 22, 2021

library(titanic)
library(tidyverse)
titanic_df <- titanic_train %>% 
  as_tibble() %>% 
  janitor::clean_names() %>% 
  mutate(
    survived = case_when(
      survived == "1" ~ "survived",
      TRUE ~ "died"
    ) %>%
      as.factor() %>%
      fct_relevel(c("died", "survived"))
  ) %>%
  mutate(
    cabin = case_when(
      cabin == "" ~ NA_character_,
      TRUE ~ cabin
    ),
     embarked = case_when(
      embarked == "" ~ NA_character_,
      TRUE ~ embarked
    ),
    across(c(pclass, sex, embarked), as.factor),
    passenger_id = as.character(passenger_id)
  )
library(tidymodels)
set.seed(123)
titanic_split <- initial_split(titanic_df, 0.75, strata = survived)
titanic_training <- titanic_split %>% training()
titanic_testing <- titanic_split %>% testing()

titanic_recipe <- recipe(
  survived ~ fare + sex + sib_sp + parch + pclass,
  data = titanic_training
) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes())

logistic_model <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

titanic_wkfl <- workflow() %>% 
  add_recipe(titanic_recipe) %>% 
  add_model(logistic_model)

titanic_folds <- vfold_cv(
  data = titanic_training,
  v =10, strata = survived
)

titanic_metrics <- metric_set(sens, spec, accuracy, roc_auc)

control <- control_resamples(save_pred = TRUE)

logistic_rs_fit <- fit_resamples(
  object = titanic_wkfl,
  resamples = titanic_folds,
  metrics = titanic_metrics,
  control = control
)

logistic_rs_fit_results <- logistic_rs_fit %>% 
  collect_predictions()

logistic_rs_fit_results %>% 
  titanic_metrics(
    truth = survived, estimate = .pred_class, 
    .pred_survived, event_level = "second"
  )
## # A tibble: 4 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.689
## 2 spec     binary         0.867
## 3 accuracy binary         0.798
## 4 roc_auc  binary         0.822
logistic_last_fit <- last_fit(
  object = titanic_wkfl,
  split = titanic_split, 
  metrics = titanic_metrics
)

logistic_last_fit %>% 
  collect_predictions() %>% 
  titanic_metrics(
    truth = survived, estimate = .pred_class, 
    .pred_survived, event_level = "second"
  )
## # A tibble: 4 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.659
## 2 spec     binary         0.861
## 3 accuracy binary         0.784
## 4 roc_auc  binary         0.847
logistic_last_fit$.workflow[[1]] %>% tidy()
## # A tibble: 7 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   2.05       0.289     7.10  1.23e-12
## 2 fare          0.218      0.150     1.45  1.48e- 1
## 3 sib_sp       -0.295      0.130    -2.28  2.26e- 2
## 4 parch        -0.0813     0.107    -0.760 4.47e- 1
## 5 sex_male     -2.77       0.223   -12.4   1.83e-35
## 6 pclass_X2    -0.551      0.334    -1.65  9.91e- 2
## 7 pclass_X3    -1.37       0.309    -4.44  9.01e- 6