library(titanic)
library(tidyverse)
titanic_df <- titanic_train %>%
as_tibble() %>%
janitor::clean_names() %>%
mutate(
survived = case_when(
survived == "1" ~ "survived",
TRUE ~ "died"
) %>%
as.factor() %>%
fct_relevel(c("died", "survived"))
) %>%
mutate(
cabin = case_when(
cabin == "" ~ NA_character_,
TRUE ~ cabin
),
embarked = case_when(
embarked == "" ~ NA_character_,
TRUE ~ embarked
),
across(c(pclass, sex, embarked), as.factor),
passenger_id = as.character(passenger_id)
)
library(tidymodels)
set.seed(123)
titanic_split <- initial_split(titanic_df, 0.75, strata = survived)
titanic_training <- titanic_split %>% training()
titanic_testing <- titanic_split %>% testing()
titanic_recipe <- recipe(
survived ~ fare + sex + sib_sp + parch + pclass,
data = titanic_training
) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
lasso_logistic_model <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet") %>%
set_mode("classification")
titanic_wkfl <- workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(lasso_logistic_model)
penalty_grid <- grid_regular(
parameters(titanic_wkfl), levels = 100
)
titanic_folds <- vfold_cv(
data = titanic_training, v = 10, strata = survived
)
titanic_metrics <- metric_set(roc_auc, sens, spec, accuracy)
lasso_logistic_tune_fit <- tune_grid(
object = titanic_wkfl,
resamples = titanic_folds,
grid = penalty_grid,
metrics = titanic_metrics
)
lasso_logistic_tune_fit %>% show_best("roc_auc")
## # A tibble: 5 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0192 roc_auc binary 0.833 10 0.0172 Preprocessor1_Model083
## 2 0.0152 roc_auc binary 0.833 10 0.0172 Preprocessor1_Model082
## 3 0.0242 roc_auc binary 0.833 10 0.0171 Preprocessor1_Model084
## 4 0.0120 roc_auc binary 0.831 10 0.0170 Preprocessor1_Model081
## 5 0.0486 roc_auc binary 0.830 10 0.0168 Preprocessor1_Model087
titanic_wkfl <- titanic_wkfl %>%
finalize_workflow(
lasso_logistic_tune_fit %>% select_best("roc_auc")
)
titanic_last_fit <- titanic_wkfl %>%
last_fit(titanic_split)
titanic_last_fit %>%
collect_predictions() %>%
titanic_metrics(
truth = survived, estimate = .pred_class,
.pred_survived, event_level = "second"
)
## # A tibble: 4 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.659
## 2 spec binary 0.847
## 3 accuracy binary 0.775
## 4 roc_auc binary 0.851
titanic_last_fit$.workflow[[1]] %>% tidy()
## # A tibble: 9 x 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -1.15e+ 0 0.0192
## 2 fare 6.53e- 2 0.0192
## 3 sib_sp -1.16e- 1 0.0192
## 4 parch 0. 0.0192
## 5 sex_female 2.38e+ 0 0.0192
## 6 sex_male -3.89e-14 0.0192
## 7 pclass_X1 4.95e- 1 0.0192
## 8 pclass_X2 0. 0.0192
## 9 pclass_X3 -7.10e- 1 0.0192
ridge_logistic_model <- logistic_reg(penalty = tune(), mixture = 0) %>%
set_engine("glmnet") %>%
set_mode("classification")
titanic_wkfl <- titanic_wkfl %>% update_model(ridge_logistic_model)
ridge_logistic_tune_fit <- tune_grid(
object = titanic_wkfl,
resamples = titanic_folds,
grid = penalty_grid,
metrics = titanic_metrics
)
ridge_logistic_tune_fit %>% show_best("spec")
## # A tibble: 5 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1.00e-10 spec binary 0.689 10 0.0211 Preprocessor1_Model001
## 2 1.26e-10 spec binary 0.689 10 0.0211 Preprocessor1_Model002
## 3 1.59e-10 spec binary 0.689 10 0.0211 Preprocessor1_Model003
## 4 2.01e-10 spec binary 0.689 10 0.0211 Preprocessor1_Model004
## 5 2.54e-10 spec binary 0.689 10 0.0211 Preprocessor1_Model005
titanic_wkfl <- titanic_wkfl %>%
finalize_workflow(
ridge_logistic_tune_fit %>% select_best("spec")
)
titanic_last_fit <- titanic_wkfl %>%
last_fit(titanic_split)
titanic_last_fit %>%
collect_predictions() %>%
titanic_metrics(
truth = survived, estimate = .pred_class,
.pred_survived, event_level = "second"
)
## # A tibble: 4 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.659
## 2 spec binary 0.861
## 3 accuracy binary 0.784
## 4 roc_auc binary 0.847
titanic_last_fit$.workflow[[1]] %>% tidy()
## # A tibble: 9 x 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.0966 0.0000000001
## 2 fare 0.188 0.0000000001
## 3 sib_sp -0.219 0.0000000001
## 4 parch -0.0566 0.0000000001
## 5 sex_female 1.25 0.0000000001
## 6 sex_male -1.24 0.0000000001
## 7 pclass_X1 0.644 0.0000000001
## 8 pclass_X2 0.170 0.0000000001
## 9 pclass_X3 -0.582 0.0000000001