Titanic Series (Part 6) - Regularized Logistic Regression (Tuned)

Peng Chen

May 22, 2021

library(titanic)
library(tidyverse)
titanic_df <- titanic_train %>% 
  as_tibble() %>% 
  janitor::clean_names() %>% 
  mutate(
    survived = case_when(
      survived == "1" ~ "survived",
      TRUE ~ "died"
    ) %>%
      as.factor() %>%
      fct_relevel(c("died", "survived"))
  ) %>%
  mutate(
    cabin = case_when(
      cabin == "" ~ NA_character_,
      TRUE ~ cabin
    ),
     embarked = case_when(
      embarked == "" ~ NA_character_,
      TRUE ~ embarked
    ),
    across(c(pclass, sex, embarked), as.factor),
    passenger_id = as.character(passenger_id)
  )
library(tidymodels)
set.seed(123)
titanic_split <- initial_split(titanic_df, 0.75, strata = survived)
titanic_training <- titanic_split %>% training()
titanic_testing <- titanic_split %>% testing()

titanic_recipe <- recipe(
  survived ~ fare + sex + sib_sp + parch + pclass,
  data = titanic_training
) %>% 
  step_corr(all_numeric(), threshold = 0.8) %>% 
  step_normalize(all_numeric()) %>% 
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)

lasso_logistic_model <- logistic_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet") %>% 
  set_mode("classification")

titanic_wkfl <- workflow() %>% 
  add_recipe(titanic_recipe) %>% 
  add_model(lasso_logistic_model)

penalty_grid <- grid_regular(
  parameters(titanic_wkfl), levels = 100
)

titanic_folds <- vfold_cv(
  data = titanic_training, v = 10, strata = survived
)

titanic_metrics <- metric_set(roc_auc, sens, spec, accuracy)

lasso_logistic_tune_fit <- tune_grid(
  object = titanic_wkfl,
  resamples = titanic_folds,
  grid = penalty_grid,
  metrics = titanic_metrics
)

lasso_logistic_tune_fit %>% show_best("roc_auc")
## # A tibble: 5 x 7
##   penalty .metric .estimator  mean     n std_err .config               
##     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                 
## 1  0.0192 roc_auc binary     0.833    10  0.0172 Preprocessor1_Model083
## 2  0.0152 roc_auc binary     0.833    10  0.0172 Preprocessor1_Model082
## 3  0.0242 roc_auc binary     0.833    10  0.0171 Preprocessor1_Model084
## 4  0.0120 roc_auc binary     0.831    10  0.0170 Preprocessor1_Model081
## 5  0.0486 roc_auc binary     0.830    10  0.0168 Preprocessor1_Model087
titanic_wkfl <- titanic_wkfl %>% 
  finalize_workflow(
    lasso_logistic_tune_fit %>% select_best("roc_auc")
  )

titanic_last_fit <- titanic_wkfl %>% 
  last_fit(titanic_split)

titanic_last_fit %>% 
  collect_predictions() %>% 
  titanic_metrics(
    truth = survived, estimate = .pred_class, 
    .pred_survived, event_level = "second"
  )
## # A tibble: 4 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.659
## 2 spec     binary         0.847
## 3 accuracy binary         0.775
## 4 roc_auc  binary         0.851
titanic_last_fit$.workflow[[1]] %>% tidy()
## # A tibble: 9 x 3
##   term         estimate penalty
##   <chr>           <dbl>   <dbl>
## 1 (Intercept) -1.15e+ 0  0.0192
## 2 fare         6.53e- 2  0.0192
## 3 sib_sp      -1.16e- 1  0.0192
## 4 parch        0.        0.0192
## 5 sex_female   2.38e+ 0  0.0192
## 6 sex_male    -3.89e-14  0.0192
## 7 pclass_X1    4.95e- 1  0.0192
## 8 pclass_X2    0.        0.0192
## 9 pclass_X3   -7.10e- 1  0.0192
ridge_logistic_model <- logistic_reg(penalty = tune(), mixture = 0) %>% 
  set_engine("glmnet") %>% 
  set_mode("classification")

titanic_wkfl <- titanic_wkfl %>% update_model(ridge_logistic_model)

ridge_logistic_tune_fit <- tune_grid(
  object = titanic_wkfl,
  resamples = titanic_folds,
  grid = penalty_grid,
  metrics = titanic_metrics
)

ridge_logistic_tune_fit %>% show_best("spec")
## # A tibble: 5 x 7
##    penalty .metric .estimator  mean     n std_err .config               
##      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                 
## 1 1.00e-10 spec    binary     0.689    10  0.0211 Preprocessor1_Model001
## 2 1.26e-10 spec    binary     0.689    10  0.0211 Preprocessor1_Model002
## 3 1.59e-10 spec    binary     0.689    10  0.0211 Preprocessor1_Model003
## 4 2.01e-10 spec    binary     0.689    10  0.0211 Preprocessor1_Model004
## 5 2.54e-10 spec    binary     0.689    10  0.0211 Preprocessor1_Model005
titanic_wkfl <- titanic_wkfl %>% 
  finalize_workflow(
    ridge_logistic_tune_fit %>% select_best("spec")
  )

titanic_last_fit <- titanic_wkfl %>% 
  last_fit(titanic_split)

titanic_last_fit %>% 
  collect_predictions() %>% 
  titanic_metrics(
    truth = survived, estimate = .pred_class, 
    .pred_survived, event_level = "second"
  )
## # A tibble: 4 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.659
## 2 spec     binary         0.861
## 3 accuracy binary         0.784
## 4 roc_auc  binary         0.847
titanic_last_fit$.workflow[[1]] %>% tidy() 
## # A tibble: 9 x 3
##   term        estimate      penalty
##   <chr>          <dbl>        <dbl>
## 1 (Intercept)  -0.0966 0.0000000001
## 2 fare          0.188  0.0000000001
## 3 sib_sp       -0.219  0.0000000001
## 4 parch        -0.0566 0.0000000001
## 5 sex_female    1.25   0.0000000001
## 6 sex_male     -1.24   0.0000000001
## 7 pclass_X1     0.644  0.0000000001
## 8 pclass_X2     0.170  0.0000000001
## 9 pclass_X3    -0.582  0.0000000001