library(titanic)
library(tidyverse)
# Build the modelling table from the raw `titanic_train` data:
#   * snake_case column names (janitor::clean_names),
#   * `survived` as a two-level factor with "died" first, "survived" second,
#   * blank `cabin` / `embarked` strings recoded to NA,
#   * `pclass`, `sex`, `embarked` converted to factors,
#   * `passenger_id` converted to character.
titanic_df <- titanic_train %>%
  as_tibble() %>%
  janitor::clean_names() %>%
  mutate(
    # Compare against the numeric code instead of the string "1" so no
    # implicit type coercion is involved. Assumes the raw column is coded
    # 0/1: any other (or missing) value becomes NA rather than being
    # silently labelled "died".
    survived = case_when(
      survived == 1 ~ "survived",
      survived == 0 ~ "died"
    ) %>%
      # Fix the level order explicitly; downstream metrics pick the event
      # level by position.
      factor(levels = c("died", "survived")),
    # Empty strings stand for missing values in these two columns.
    cabin = na_if(cabin, ""),
    embarked = na_if(embarked, ""),
    # `embarked` is converted after the NA recode above, so "" never
    # becomes a factor level.
    across(c(pclass, sex, embarked), as.factor),
    passenger_id = as.character(passenger_id)
  )
# skimr::skim(titanic_df)
library(tidymodels)
# Reproducible train/test split: 75% of rows go to training, sampled within
# each level of `survived`.
set.seed(123)
titanic_split <- initial_split(titanic_df, prop = 0.75, strata = survived)
titanic_training <- training(titanic_split)
titanic_testing <- testing(titanic_split)
# Model specification: penalised logistic regression fitted by glmnet with a
# fixed penalty of 0.1 and mixture = 0 (pure ridge / L2 in glmnet terms).
logistic_model <- logistic_reg(
  mode = "classification",
  penalty = 0.1,
  mixture = 0
) %>%
  set_engine("glmnet")
# Preprocessing recipe for the five predictors used by the model.
# Every step excludes the outcome explicitly, so `survived` can never be
# filtered, rescaled or dummy-encoded, whatever its type.
titanic_recipe <- recipe(
  survived ~ fare + sex + sib_sp + parch + pclass,
  data = titanic_training
) %>%
  # Drop numeric predictors whose absolute pairwise correlation exceeds 0.8.
  step_corr(all_numeric(), -all_outcomes(), threshold = 0.8) %>%
  # Centre and scale the remaining numeric predictors.
  step_normalize(all_numeric(), -all_outcomes()) %>%
  # One-hot encode factor predictors (one indicator column per level).
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
# Estimate the recipe's parameters on the training set only, then apply the
# trained recipe to both the training and the testing data.
titanic_recipe_prep <- prep(titanic_recipe, training = titanic_training)
titanic_training_prep <- bake(titanic_recipe_prep, new_data = titanic_training)
titanic_testing_prep <- bake(titanic_recipe_prep, new_data = titanic_testing)
# Fit the specified model on the preprocessed training data, using every
# remaining column as a predictor of `survived`.
logistic_fit <- fit(
  logistic_model,
  survived ~ .,
  data = titanic_training_prep
)
# Check the outcome's level order ("survived" is the second level).
levels(titanic_training_prep[["survived"]])
## [1] "died" "survived"
# Coefficient table for the fitted model.
logistic_fit %>% tidy()
## # A tibble: 9 x 3
## term estimate penalty
## <chr> <dbl> <dbl>
## 1 (Intercept) -0.155 0.1
## 2 fare 0.154 0.1
## 3 sib_sp -0.135 0.1
## 4 parch -0.0233 0.1
## 5 sex_female 1.01 0.1
## 6 sex_male -1.01 0.1
## 7 pclass_X1 0.509 0.1
## 8 pclass_X2 0.158 0.1
## 9 pclass_X3 -0.476 0.1
# Hard class predictions and class probabilities for the held-out test set.
pred_class <- predict(
  logistic_fit,
  new_data = titanic_testing_prep,
  type = "class"
)
pred_prob <- predict(
  logistic_fit,
  new_data = titanic_testing_prep,
  type = "prob"
)
# Observed outcome side by side with the predictions.
# (Variable name spelling kept as-is; it is referenced further down.)
titantic_testing_results <- bind_cols(
  select(titanic_testing, survived),
  pred_class,
  pred_prob
)
# Mosaic plot of the confusion matrix (observed vs. predicted class).
autoplot(
  conf_mat(
    titantic_testing_results,
    truth = survived,
    estimate = .pred_class
  ),
  type = "mosaic"
)
# Bundle the evaluation metrics into a single callable and score the test-set
# predictions. `event_level = "second"` makes "survived" (the second factor
# level) the event of interest; `.pred_survived` feeds the probability-based
# metric (roc_auc), `.pred_class` the class-based ones.
titanic_metrics <- metric_set(
  roc_auc,
  sens,
  spec,
  accuracy
)
titanic_metrics(
  titantic_testing_results,
  truth = survived,
  estimate = .pred_class,
  .pred_survived,
  event_level = "second"
)
## # A tibble: 4 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.659
## 2 spec binary 0.861
## 3 accuracy binary 0.784
## 4 roc_auc binary 0.848