# -------------------------------------------
# Tidy Survival Analysis: Module 2 Code
# -------------------------------------------
# ---------------------------
# 1. Load Required Packages
# ---------------------------
library(tidyverse)
# ---------------------------
# 2. Simulated Trial Dataset
# ---------------------------
# Toy trial data, one row per subject: treatment arm, age, follow-up time,
# and event indicator (status: 1 = event, 0 = censored)
df1 <- tribble(
  ~id, ~trt, ~age, ~time, ~status,
   1L,  "A",   65,     5,       1,
   2L,  "A",   70,     8,       0,
   3L,  "B",   58,    12,       1,
   4L,  "B",   60,     3,       1,
   5L,  "A",   64,     2,       0,
   6L,  "B",   59,     6,       0
)
df1
# ---------------------------
# 3. Pipe Operator and Basic Manipulation
# ---------------------------
# Chain steps with the native pipe |>: keep treatment arm A, label each
# subject's age group, then order the rows by follow-up time
df1 |>
  filter(trt == "A") |>
  mutate(age_group = if_else(age >= 65, "older", "younger")) |>
  arrange(time)
# ---------------------------
# 4. Grouped Summary by Treatment Arm
# ---------------------------
# Per-arm summary: number of subjects, number of events, and median observed
# follow-up time
df1 |>
  group_by(trt) |>
  summarise(
    n = n(),                    # subjects in the arm
    events = sum(status),       # status is coded 0/1, so the sum counts events
    median_time = median(time), # raw median of observed times; censoring is
                                # ignored, so this is not a median survival time
    .groups = "drop"            # return an ungrouped tibble
  )
# ---------------------------
# 5. Load GBC Dataset
# ---------------------------
# Read the long-format German Breast Cancer (GBC) study data from a text file
# whose first line holds the column names, then preview the first rows
gbc <- read.table(file = "data/gbc.txt", header = TRUE)
gbc |> head()
# ---------------------------
# 6. Raw Date Variables as Strings
# ---------------------------
# Small example with dates still stored as character strings: randomization
# date, end-of-follow-up date, and a text status
df2 <- tribble(
  ~id, ~rand_date,   ~end_date,    ~status,
   1L, "2022-01-01", "2022-04-01", "dead",
   2L, "2022-01-15", "2022-06-01", "censored",
   3L, "2022-01-20", "2022-03-15", "dead"
)
df2
# ---------------------------
# 7. Parse Dates and Calculate Survival Time
# ---------------------------
# Parse the character dates into Date objects, then derive the analysis
# variables: follow-up time in days and a 0/1 event indicator.
# ymd() is namespace-qualified because library(tidyverse) attaches lubridate
# only from tidyverse 2.0.0 onward; with an older tidyverse a bare ymd() call
# fails with "could not find function".
df2 |>
  mutate(
    rand_date = lubridate::ymd(rand_date),   # randomization date
    end_date = lubridate::ymd(end_date),     # event/censoring date
    time = as.numeric(end_date - rand_date), # Date subtraction yields days
    status = if_else(status == "dead", 1, 0) # 1 = dead, 0 = anything else
  )
# ---------------------------
# 8. Censored Strings Parsing
# ---------------------------
# Split censoring-coded strings such as "32+" into a numeric time
# (readr's parse_number) and an event indicator (stringr's str_detect):
# a "+" in the string marks a censored observation
MP <- c("10", "32+", "23", "25+")
df4 <- tibble(
  MP = MP,
  time = parse_number(MP),                      # numeric part of the string
  status = as.numeric(!str_detect(MP, "\\+"))   # 1 = event, 0 = censored ("+")
)
df4
# ---------------------------
# 9. Reshape Wide to Long Format
# ---------------------------
# Wide data: one row per subject, with a (time, status) pair for each of two
# event types, progression ("prog") and death
df6 <- tribble(
  ~id, ~prog_time, ~prog_status, ~death_time, ~death_status,
   1L,         10,            1,          15,             0,
   2L,         20,            0,          20,             1,
   3L,         30,            1,          35,             1
)
# Long data: one row per subject and event type. Each column name is matched
# against "<prefix>_<suffix>": the prefix fills `event`, and the suffix
# (".value") names the output column (time or status) receiving the value
df7 <- pivot_longer(
  df6,
  cols = !id,
  names_to = c("event", ".value"),
  names_pattern = "(.*)_(.*)"
)
df7
# ---------------------------
# 10. Clean Long-Format Event Data
# ---------------------------
# Tidy the long event data: discard progression rows with status 0 (no
# progression observed) and recode observed deaths from 1 to 2; all other
# status values are left as they are
df7 |>
  filter(event != "prog" | status != 0) |>
  mutate(
    status = if_else(
      condition = event == "death" & status == 1,
      true = 2,
      false = status
    )
  )
# ---------------------------
# 11. Construct a Swimmer Plot
# ---------------------------
# Subject-level data for the swimmer plot: follow-up time, event indicator,
# and group label, with a sequential subject id added as the first column
df8 <- tibble(
  time = c(101, 55, 67, 23, 45, 98, 34, 77, 91, 104, 88),
  status = c(0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1),
  group = c("A", "A", "A", "B", "B", "B", "A", "B", "B", "A", "B")
) |>
  rowid_to_column(var = "id")
# Build the swimmer plot with ggplot2: one horizontal bar per subject running
# from 0 to the follow-up time, ordered by time, ending in a symbol that shows
# whether the status code is 0 or 1
fig8 <- df8 |>
  ggplot(aes(x = time, y = reorder(id, time))) +
  geom_linerange(aes(xmin = 0, xmax = time)) +
  geom_point(aes(shape = factor(status)), size = 2.5, fill = "white") +
  geom_vline(xintercept = 0, linewidth = 1) +
  theme_minimal() +
  scale_y_discrete(name = "Rats") +
  scale_x_continuous(
    name = "Time (days)",
    breaks = seq(0, 100, 20),
    expand = expansion(c(0, 0.05))
  ) +
  # Key shapes and labels by status level so each stays tied to its code
  # rather than to the order in which the factor levels happen to appear
  scale_shape_manual(
    values = c("0" = 23, "1" = 19),
    labels = c("0" = "Censoring", "1" = "Tumor development")
  ) +
  theme(
    legend.position = "top",
    legend.title = element_blank(),
    axis.text.y = element_blank(),      # individual ids are not shown
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.text = element_text(size = 11)
  )
# Render the figure; assigning it to fig8 alone draws nothing
fig8
# ---------------------------
# 12. Create Summary Table with gtsummary
# ---------------------------
# Example data for the summary table: ten subjects with follow-up time, event
# indicator, treatment arm, sex, and age
df9 <- tribble(
  ~id, ~time, ~status, ~trt, ~sex, ~age,
   1L,   101,       0,  "A",  "M",   65,
   2L,    55,       1,  "A",  "F",   70,
   3L,    67,       1,  "B",  "M",   58,
   4L,    23,       0,  "B",  "F",   60,
   5L,    45,       1,  "A",  "M",   64,
   6L,    98,       0,  "B",  "F",   59,
   7L,    34,       1,  "A",  "M",   66,
   8L,    77,       0,  "B",  "F",   62,
   9L,    91,       1,  "A",  "M",   68,
  10L,   104,       0,  "B",  "F",   61
)
# Descriptive table with gtsummary: one column per treatment arm, summarizing
# sex, age, follow-up time, and the event indicator
library(gtsummary)
tbl_summary(
  df9,
  by = trt,
  include = c(sex, age, time, status),
  label = list(
    time ~ "Follow-up time (months)",
    status ~ "Events"
  )
)