library(titanic)
library(tidyverse)
titanic_df <- titanic_train %>%
as_tibble() %>%
janitor::clean_names() %>%
mutate(
survived = case_when(
survived == "1" ~ "survived",
TRUE ~ "died"
) %>%
as.factor() %>%
fct_relevel(c("died", "survived"))
) %>%
mutate(
cabin = case_when(
cabin == "" ~ NA_character_,
TRUE ~ cabin
),
embarked = case_when(
embarked == "" ~ NA_character_,
TRUE ~ embarked
),
across(c(pclass, sex, embarked), as.factor),
passenger_id = as.character(passenger_id)
)
# skimr::skim(titanic_df)
glimpse(titanic_df)
## Rows: 891
## Columns: 12
## $ passenger_id <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",…
## $ survived <fct> died, survived, survived, survived, died, died, died, di…
## $ pclass <fct> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3,…
## $ name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (…
## $ sex <fct> male, female, female, female, male, male, male, male, fe…
## $ age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14…
## $ sib_sp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1,…
## $ parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0,…
## $ ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "…
## $ fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.862…
## $ cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", …
## $ embarked <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S,…
titanic_df %>%
count(survived) %>%
mutate(pct = n / sum(n))
## # A tibble: 2 x 3
## survived n pct
## * <fct> <int> <dbl>
## 1 died 549 0.616
## 2 survived 342 0.384
library(tidymodels)
set.seed(123)
titanic_split <- initial_split(titanic_df, 0.75, strata = survived)
titanic_training <- titanic_split %>% training()
titanic_testing <- titanic_split %>% testing()
logistic_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
titanic_recipe <- recipe(
survived ~ fare + sex + sib_sp + parch + pclass,
data = titanic_training
) %>%
step_corr(all_numeric(), threshold = 0.8) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal(), -all_outcomes())
titanic_recipe_prep <- titanic_recipe %>% prep(titanic_training)
titanic_training_prep <- titanic_recipe_prep %>% bake(titanic_training)
titanic_testing_prep <- titanic_recipe_prep %>% bake(titanic_testing)
logistic_fit <- logistic_model %>%
fit(survived ~ ., titanic_training_prep)
levels(titanic_training_prep$survived)
## [1] "died" "survived"
tidy(logistic_fit)
## # A tibble: 7 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.05 0.289 7.10 1.23e-12
## 2 fare 0.218 0.150 1.45 1.48e- 1
## 3 sib_sp -0.295 0.130 -2.28 2.26e- 2
## 4 parch -0.0813 0.107 -0.760 4.47e- 1
## 5 sex_male -2.77 0.223 -12.4 1.83e-35
## 6 pclass_X2 -0.551 0.334 -1.65 9.91e- 2
## 7 pclass_X3 -1.37 0.309 -4.44 9.01e- 6
pred_class <- logistic_fit %>%
predict(titanic_testing_prep, type = "class")
pred_prob <- logistic_fit %>%
predict(titanic_testing_prep, type = "prob")
titantic_testing_results <- titanic_testing %>%
select(survived) %>%
bind_cols(pred_class, pred_prob)
titantic_testing_results %>%
conf_mat(survived, .pred_class)
## Truth
## Prediction died survived
## died 118 29
## survived 19 56
titantic_testing_results %>%
conf_mat(survived, .pred_class) %>%
autoplot(type = "mosaic")
titantic_testing_results %>%
sens(survived, .pred_class, event_level = "second")
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.659
titantic_testing_results %>%
spec(survived, .pred_class, event_level = "second")
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 spec binary 0.861
titantic_testing_results %>% accuracy(survived, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.784
titantic_testing_results %>%
roc_curve(truth = survived, .pred_survived, event_level = "second")
## # A tibble: 161 x 3
## .threshold specificity sensitivity
## <dbl> <dbl> <dbl>
## 1 -Inf 0 1
## 2 0.0570 0 1
## 3 0.0582 0.00730 1
## 4 0.0746 0.0146 1
## 5 0.0748 0.0219 1
## 6 0.0823 0.0292 1
## 7 0.0878 0.0365 1
## 8 0.0901 0.0438 1
## 9 0.0902 0.0511 1
## 10 0.0916 0.0584 1
## # … with 151 more rows
titantic_testing_results %>%
roc_curve(truth = survived, .pred_survived, event_level = "second") %>%
autoplot()
titantic_testing_results %>%
roc_auc(truth = survived, .pred_survived, event_level = "second")
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.847
titanic_metrics <- metric_set(roc_auc, sens, spec, accuracy)
titantic_testing_results %>%
titanic_metrics(
truth = survived, estimate = .pred_class, .pred_survived,
event_level = "second"
)
## # A tibble: 4 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 sens binary 0.659
## 2 spec binary 0.861
## 3 accuracy binary 0.784
## 4 roc_auc binary 0.847