Chapter 15 - Machine Learning in Survival Analysis
Department of Biostatistics & Medical Informatics
University of Wisconsin-Madison
Regularized Cox regression models
Nonparametric regression by survival trees
Building prediction models for German Breast Cancer study
\[\newcommand{\d}{{\rm d}}\] \[\newcommand{\T}{{\rm T}}\] \[\newcommand{\dd}{{\rm d}}\] \[\newcommand{\cc}{{\rm c}}\] \[\newcommand{\pr}{{\rm pr}}\] \[\newcommand{\var}{{\rm var}}\] \[\newcommand{\se}{{\rm se}}\] \[\newcommand{\indep}{\perp \!\!\! \perp}\] \[\newcommand{\Pn}{n^{-1}\sum_{i=1}^n}\] \[ \newcommand\mymathop[1]{\mathop{\operatorname{#1}}} \] \[ \newcommand{\Ut}{{n \choose 2}^{-1}\sum_{i<j}\sum} \]
glmnet::glmnet() (I)
Z: covariate matrix; alpha: \(\alpha\), the elastic-net mixing parameter (\(\alpha = 1\) gives the lasso penalty)
glmnet::glmnet() (II)
CV results
# optimal lambda from the cross-validation results, on the raw and log scales
obj.cv$lambda.min
log(obj.cv$lambda.min)
#> [1] -3.886169
# coefficients evaluated at the optimal lambda
beta <- coef(obj.cv, s = "lambda.min")
# keep only the covariates whose coefficient is not exactly zero
beta.selected <- beta[beta[, 1] != 0, ]
# display the selected (non-zero) coefficients
beta.selected
#> hormone age40 size grade nodes
#> -0.371809445 0.455421368 0.003448500 0.201404194 0.040170638
#> prog
#> -0.002908887
rpart::rpart()
xval = k: \(k\)-fold cross-validation; minbucket: minimum size of terminal node; cp: minimum reduction of impurity measure for a split
# grow the tree, with cross-validation
# settings: 10-fold cross-validation (xval = 10), terminal nodes of size at
# least 2 (minbucket = 2), no minimum impurity reduction for a split (cp = 0)
obj <- rpart(
  Surv(time, status) ~ covariates,
  control = rpart.control(xval = 10, minbucket = 2, cp = 0)
)
# table of cross-validation results
cptable <- obj$cptable
# candidate complexity parameters (lambda)
CP <- cptable[, "CP"]
# optimal parameter: the one minimizing the cross-validated error (xerror)
cp.opt <- CP[which.min(cptable[, "xerror"])]
rpart::prune()
tree: rpart object for grown tree; cp: optimal \(\lambda\); test: test data frame
# prune the tree, with optimal lambda
# cut the grown tree `obj` back at the selected complexity parameter cp.opt
fit <- prune(tree = obj, cp = cp.opt)
# plot the pruned tree structure
rpart.plot(fit)
# fit$where: vector of terminal node for training data
# compute KM estimates by terminal node
# (fit$where is the only term on the right-hand side, so the Kaplan-Meier
# estimates are stratified by terminal node)
km <- survfit(Surv(time, status) ~ fit$where)
## prediction on test data ---------------------------
# terminal node for test data
# (the result is printed here, not stored)
# NOTE(review): presumably the returned node labels are on the same scale as
# fit$where, so each test subject can be matched to its node's KM curve in
# `km` -- confirm against the treeClust documentation
treeClust::rpart.predict.leaves(fit, test)
# Grow the survival tree on the training data with 10-fold cross-validation
# (xval = 10), minimum terminal node size 2, and no complexity cutoff (cp = 0)
obj <- rpart(
  Surv(time, status) ~
    hormone + meno + size + grade + nodes + prog + estrg + age,
  data = train,
  control = rpart.control(xval = 10, minbucket = 2, cp = 0)
)
# print the complexity table; the xerror column is the objective to minimize
printcp(obj)
# CP nsplit rel.error xerror xstd
# 1 0.07556835 0 1.00000 1.00411 0.046231
# 2 0.03720019 1 0.92443 0.96817 0.047281
# 3 0.02661914 2 0.88723 0.95124 0.046567
# 4 0.01716925 3 0.86061 0.92745 0.046606 # minimizer
# 5 0.01398306 4 0.84344 0.92976 0.047514
# 6 0.01394869 5 0.82946 0.93941 0.048404
# 7 0.01055028 9 0.77120 0.97722 0.052133
ipred
randomSurvivalForest
ranger
censored: a member of the tidymodels family
glmnet::glmnet(Z, Surv(time, status), family = "cox", alpha = 1)
rpart::rpart(Surv(time, status) ~ covariates)