library(titanic)
library(tidyverse)
titanic_df <- titanic_train %>%
as_tibble() %>%
janitor::clean_names() %>%
mutate(
survived = case_when(
survived == "1" ~ "survived",
TRUE ~ "died"
) %>%
as.factor() %>%
fct_relevel(c("died", "survived"))
) %>%
mutate(
cabin = case_when(
cabin == "" ~ NA_character_,
TRUE ~ cabin
),
embarked = case_when(
embarked == "" ~ NA_character_,
TRUE ~ embarked
),
across(c(pclass, sex, embarked), as.factor),
passenger_id = as.character(passenger_id)
)
library(tidymodels)
set.seed(123)
titanic_split <- initial_split(titanic_df, 0.75, strata = survived)
titanic_training <- titanic_split %>% training()
titanic_testing <- titanic_split %>% testing()
titanic_recipe <- recipe(
survived ~ fare + sex + sib_sp + parch + pclass,
data = titanic_training
) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
logistic_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
titanic_wkfl <- workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(logistic_model)
titanic_folds <- vfold_cv(
data = titanic_training,
v =10, strata = survived
)
titanic_metrics <- metric_set(sens, spec, accuracy, roc_auc)
control <- control_resamples(save_pred = TRUE)
logistic_rs_fit <- fit_resamples(
object = titanic_wkfl,
resamples = titanic_folds,
metrics = titanic_metrics,
control = control
)
logistic_rs_fit_results <- logistic_rs_fit %>%
collect_predictions()
logistic_rs_fit_results %>%
titanic_metrics(
truth = survived, estimate = .pred_class,
.pred_survived, event_level = "second"
)
## # A tibble: 4 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.689
## 2 spec binary 0.867
## 3 accuracy binary 0.798
## 4 roc_auc binary 0.822
logistic_last_fit <- last_fit(
object = titanic_wkfl,
split = titanic_split,
metrics = titanic_metrics
)
logistic_last_fit %>%
collect_predictions() %>%
titanic_metrics(
truth = survived, estimate = .pred_class,
.pred_survived, event_level = "second"
)
## # A tibble: 4 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.659
## 2 spec binary 0.861
## 3 accuracy binary 0.784
## 4 roc_auc binary 0.847
logistic_last_fit$.workflow[[1]] %>% tidy()
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.05 0.289 7.10 1.23e-12
## 2 fare 0.218 0.150 1.45 1.48e- 1
## 3 sib_sp -0.295 0.130 -2.28 2.26e- 2
## 4 parch -0.0813 0.107 -0.760 4.47e- 1
## 5 sex_male -2.77 0.223 -12.4 1.83e-35
## 6 pclass_X2 -0.551 0.334 -1.65 9.91e- 2
## 7 pclass_X3 -1.37 0.309 -4.44 9.01e- 6