Section slides here. (To convert html to pdf, press E \(\to\) Print \(\to\) Destination: Save to pdf)
R code
Show the code
# -------------------------------------------# Survival Analysis: Module 5 Code# -------------------------------------------# ---------------------------# Load Packages# ---------------------------library(tidymodels) # load tidymodelslibrary(censored)# ---------------------------# Prepare GBC Data# ---------------------------gbc <-read.table("data/gbc.txt", header =TRUE) # Load GBC datasetdf <- gbc |># calculate time to first event (relapse or death)group_by(id) |># group by idarrange(time) |># sort rows by timeslice(1) |># get the first row within each idungroup() |>mutate(surv_obj =Surv(time, status), # create the Surv object as response variable.after = id, # keep id column after surv_obj.keep ="unused"# discard original time and status columns )# ---------------------------# Data Splitting# ---------------------------set.seed(123) # set seed for reproducibilitygbc_split <-initial_split(df) # split data into training and testing setsgbc_train <-training(gbc_split) # obtain training set# ---------------------------# Preprocessing Recipe# ---------------------------gbc_recipe <-recipe(surv_obj ~ ., data = gbc_train) |># specify formulastep_mutate(grade =factor(grade),age40 =as.numeric(age >=40), # create a binary variable for age >= 40prog = prog /100, # rescale progestrg = estrg /100# rescale estrg ) |>step_dummy(grade) |>step_rm(id) # remove id# ---------------------------# Regularized Cox Model# ---------------------------cox_spec <-proportional_hazards(penalty =tune()) |># tune lambdaset_engine("glmnet") |># set engine to glmnetset_mode("censored regression") # set mode to censored regressioncox_wflow <-workflow() |>add_model(cox_spec) |># add model specificationadd_recipe(gbc_recipe) # add recipe# ---------------------------# Cross-Validation Setup# ---------------------------set.seed(123) # set seed for reproducibilitygbc_folds <-vfold_cv(gbc_train, v =10) # 10-fold cross-validation# Set evaluation metricsgbc_metrics <-metric_set(brier_survival, brier_survival_integrated, roc_auc_survival, concordance_survival)# Set evaluation time pointstime_points <-seq(0, 84, by =12) # evaluation time points# ---------------------------# Cox model CV# ---------------------------set.seed(123) # set seed for reproducibility# Tune the regularized Cox model (this will take some time)cox_res <-tune_grid( cox_wflow, resamples = gbc_folds, grid =10, # number of hyperparameter combinations to trymetrics = gbc_metrics, # evaluation metricseval_time = time_points, # evaluation time pointscontrol =control_grid(save_workflow =TRUE) # save workflow)# ---------------------------# Plot Cox Model Brier Score# ---------------------------collect_metrics(cox_res) |># collect metrics from tuning resultsfilter(.metric =="brier_survival_integrated") |># filter for Brier scoreggplot(aes(log(penalty), mean)) +# plot log-lambda vs Brier scoregeom_line() +# plot linelabs(x ="Log-lambda", y ="Overall Brier Score") +# labelstheme_classic() # classic theme# ---------------------------# Best Cox Models# ---------------------------show_best(cox_res, metric ="brier_survival_integrated", n =5) # top 5 models# ---------------------------# Random Forest Model# ---------------------------rf_spec <-rand_forest(mtry =tune(), min_n =tune()) |># tune mtry and min_nset_engine("aorsf") |># set engine to aorsfset_mode("censored regression") # set mode to censored regressionrf_wflow <-workflow() |>add_model(rf_spec) |># add model specificationadd_recipe(gbc_recipe) # add recipe# ---------------------------# RF CV# ---------------------------set.seed(123) # set seed for reproducibility# Tune the random forest model (this will take some time)rf_res <-tune_grid( rf_wflow, resamples = gbc_folds, grid =10, # number of hyperparameter combinations to trymetrics = gbc_metrics, # evaluation metricseval_time = time_points # evaluation time points)# ---------------------------# View RF Validation Metrics# ---------------------------collect_metrics(rf_res) |>head() # collect metrics from tuning results# ---------------------------# Best RF Models# ---------------------------show_best(rf_res, metric ="brier_survival_integrated", n =5) # top 5 models# ---------------------------# Finalize and Fit Best RF Model# ---------------------------param_best <-select_best(rf_res, metric ="brier_survival_integrated") rf_final_wflow <-finalize_workflow(rf_wflow, param_best) # finalize workflowset.seed(123) # set seed for reproducibilityfinal_rf_fit <-last_fit( rf_final_wflow, split = gbc_split, # use the original splitmetrics = gbc_metrics, # evaluation metricseval_time = time_points # evaluation time points)# ---------------------------# Test Performance - RF Model# ---------------------------collect_metrics(final_rf_fit) |># collect overall performance metricsfilter(.metric %in%c("concordance_survival", "brier_survival_integrated")) roc_test <-collect_metrics(final_rf_fit) |>filter(.metric =="roc_auc_survival") |>rename(mean = .estimate)roc_test |>ggplot(aes(.eval_time, mean)) +geom_line() +labs(x ="Time (months)", y ="ROC AUC") +theme_classic()# ---------------------------# Predict with Final RF Model# ---------------------------gbc_rf <-extract_workflow(final_rf_fit) # extract the fitted workflowgbc_5 <-testing(gbc_split) |>slice(1:5) # take first 5 rows of test datapredict(gbc_rf, new_data = gbc_5, type ="time") # predict survival times