# Titanic Series (Part 3) - Ridge Logistic Regression (Untuned)

# Peng Chen

# May 22, 2021

library(titanic)
library(tidyverse)
# Build the modelling data set from `titanic_train`:
#   * column names are cleaned with janitor::clean_names()
#   * `survived` becomes a factor with levels "died", "survived" (in that order)
#   * empty strings in `cabin` / `embarked` become NA
#   * `pclass`, `sex`, `embarked` become factors; `passenger_id` a character
titanic_df <- titanic_train %>%
  as_tibble() %>%
  janitor::clean_names() %>%
  mutate(
    # Map the 0/1 code explicitly rather than comparing against the string
    # "1". Any value other than 0 or 1 (including NA) stays NA instead of
    # being silently labelled "died".
    survived = factor(
      survived,
      levels = c(0, 1),
      labels = c("died", "survived")
    ),
    # Empty strings stand in for missing values in these two columns.
    cabin = na_if(cabin, ""),
    embarked = na_if(embarked, ""),
    # `embarked` is converted after the NA recoding above, so "" never
    # becomes a factor level.
    across(c(pclass, sex, embarked), as.factor),
    passenger_id = as.character(passenger_id)
  )
# skimr::skim(titanic_df)
library(tidymodels)
# Fix the RNG state so the random split below is reproducible.
set.seed(123)
# Keep 75% of the rows for training, stratifying on the outcome; the
# remaining rows form the test set.
titanic_split <- initial_split(titanic_df, prop = 0.75, strata = survived)
titanic_training <- training(titanic_split)
titanic_testing <- testing(titanic_split)

# Model specification: logistic regression fitted by the glmnet engine with a
# fixed (untuned) penalty of 0.1. `mixture = 0` selects the ridge penalty.
logistic_model <- logistic_reg(mixture = 0, penalty = 0.1) %>%
  set_mode("classification") %>%
  set_engine("glmnet")
# Preprocessing recipe for the five predictors used in the model.
# Steps run in the order listed:
#   1. filter numeric columns using a correlation threshold of 0.8
#   2. normalize the numeric columns
#   3. dummy-encode the nominal predictors (outcome excluded); with
#      `one_hot = TRUE` every level gets its own indicator column
titanic_recipe <- recipe(
  titanic_training,
  formula = survived ~ fare + sex + sib_sp + parch + pclass
) %>%
  step_corr(all_numeric(), threshold = 0.8) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)

# Estimate the recipe on the training rows only, then apply the same
# trained recipe to both the training and the testing rows.
titanic_recipe_prep <- prep(titanic_recipe, training = titanic_training)
titanic_training_prep <- bake(titanic_recipe_prep, new_data = titanic_training)
titanic_testing_prep <- bake(titanic_recipe_prep, new_data = titanic_testing)

# Fit the model specification on the preprocessed training data, using every
# baked column other than `survived` as a predictor.
logistic_fit <- fit(
  logistic_model,
  formula = survived ~ .,
  data = titanic_training_prep
)
# Show the outcome's level order; "survived" is the second level.
levels(titanic_training_prep[["survived"]])
## [1] "died"     "survived"
# Coefficient table of the fitted model.
tidy(logistic_fit)
## # A tibble: 9 x 3
##   term        estimate penalty
##   <chr>          <dbl>   <dbl>
## 1 (Intercept)  -0.155      0.1
## 2 fare          0.154      0.1
## 3 sib_sp       -0.135      0.1
## 4 parch        -0.0233     0.1
## 5 sex_female    1.01       0.1
## 6 sex_male     -1.01       0.1
## 7 pclass_X1     0.509      0.1
## 8 pclass_X2     0.158      0.1
## 9 pclass_X3    -0.476      0.1
# Score the preprocessed test set twice: once for hard class labels and once
# for class probabilities.
pred_class <- predict(
  logistic_fit,
  new_data = titanic_testing_prep,
  type = "class"
)
pred_prob <- predict(
  logistic_fit,
  new_data = titanic_testing_prep,
  type = "prob"
)
# Place the observed outcome next to the predictions, one row per test
# passenger.
# NOTE(review): the object name is spelled "titantic"; it is kept because the
# statements below refer to it under that name.
titantic_testing_results <- bind_cols(
  select(titanic_testing, survived),
  pred_class,
  pred_prob
)

# Mosaic plot of the confusion matrix: observed outcome vs. predicted class.
autoplot(
  conf_mat(titantic_testing_results, truth = survived, estimate = .pred_class),
  type = "mosaic"
)

# Bundle the evaluation metrics into a single metric function.
titanic_metrics <- metric_set(roc_auc, sens, spec, accuracy)
# `estimate` supplies the hard class predictions and `.pred_survived` the
# class probability. `event_level = "second"` makes "survived" (the second
# factor level) the event of interest.
titanic_metrics(
  titantic_testing_results,
  truth = survived,
  estimate = .pred_class,
  .pred_survived,
  event_level = "second"
)
## # A tibble: 4 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 sens     binary         0.659
## 2 spec     binary         0.861
## 3 accuracy binary         0.784
## 4 roc_auc  binary         0.848