% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_cureem.R
\name{cv_cureem}
\alias{cv_cureem}
\title{Fit penalized mixture cure model using the E-M algorithm with
cross-validation for parameter tuning}
\usage{
cv_cureem(
  formula,
  data,
  subset,
  x_latency = NULL,
  model = c("cox", "weibull", "exponential"),
  penalty = c("lasso", "MCP", "SCAD"),
  penalty_factor_inc = NULL,
  penalty_factor_lat = NULL,
  fdr_control = FALSE,
  fdr = 0.2,
  grid_tuning = FALSE,
  thresh = 0.001,
  scale = TRUE,
  maxit = NULL,
  inits = NULL,
  lambda_inc_list = NULL,
  lambda_lat_list = NULL,
  nlambda_inc = NULL,
  nlambda_lat = NULL,
  gamma_inc = 3,
  gamma_lat = 3,
  lambda_min_ratio_inc = 0.1,
  lambda_min_ratio_lat = 0.1,
  n_folds = 5,
  measure_inc = c("c", "auc"),
  one_se = FALSE,
  cure_cutoff = 5,
  parallel = FALSE,
  seed = NULL,
  verbose = TRUE,
  na.action = na.omit,
  ...
)
}
\arguments{
\item{formula}{an object of class "\code{formula}" (or one that can be
coerced to that class): a symbolic description of the model to be fitted.
The response must be a survival object as returned by the \code{Surv}
function while the variables on the right side of the formula are the
covariates that are included in the incidence portion of the model.}

\item{data}{a data.frame in which to interpret the variables named in
the \code{formula} or in the \code{subset} argument. Rows with missing data are
omitted (only \code{na.action = na.omit} is operational) therefore users may
want to impute missing data prior to calling this function.}

\item{subset}{an optional expression indicating which subset of observations
is to be used in the fitting process. Either a numeric or factor variable
should be used in subset, not a character variable. All observations are
included by default.}

\item{x_latency}{specifies the variables to be included in the latency
portion of the model and can be either a matrix of predictors, a model
formula with the right hand side specifying the latency variables, or the
same data.frame passed to the \code{data} parameter. Note that when using
the model formula syntax for \code{x_latency} it cannot handle
\code{x_latency = ~ .}.}

\item{model}{type of regression model to use for the latency portion of
mixture cure model. Can be "cox", "weibull", or "exponential" (default is
"cox").}

\item{penalty}{type of penalty function. Can be "lasso", "MCP", or "SCAD"
(default is "lasso").}

\item{penalty_factor_inc}{vector of binary indicators representing the
penalty to apply to each incidence coefficient: 0 implies no shrinkage and
1 implies shrinkage. If not supplied, 1 is applied to all incidence
variables.}

\item{penalty_factor_lat}{vector of binary indicators representing the
penalty to apply to each latency coefficient: 0 implies no shrinkage and 1
implies shrinkage. If not supplied, 1 is applied to all latency variables.}

\item{fdr_control}{logical, if TRUE, model-X knockoffs are used for
FDR-controlled variable selection and indices of selected variables are
returned (default is FALSE).}

\item{fdr}{numeric value in (0, 1) range specifying the target FDR level to
use for variable selection when \code{fdr_control = TRUE} (default is 0.2).}

\item{grid_tuning}{logical, if TRUE a 2-D grid tuning approach is used to
select the optimal pair of \eqn{\lambda_b} and \eqn{\lambda_{\beta}} penalty
parameters for the incidence and latency portions of the model, respectively.
Otherwise the \eqn{\lambda_b} and \eqn{\lambda_{\beta}} are selected from a
1-D sequence and are equal to one another (default is FALSE).}

\item{thresh}{small numeric value. The iterative process stops when the
differences between successive expected penalized complete-data
log-likelihoods for both incidence and latency components are less than this
specified level of tolerance (default is 10^-3).}

\item{scale}{logical, if TRUE the predictors are centered and scaled (default
is TRUE).}

\item{maxit}{maximum number of passes over the data for each lambda. If not
specified, 100 is applied when \code{penalty = "lasso"} and 1000 is applied
when \code{penalty = "MCP"} or \code{penalty = "SCAD"}.}

\item{inits}{an optional list specifying the initial values to be used for
model fitting as follows:
\itemize{
\item \code{itct} the incidence intercept.
\item \code{b_u} a numeric vector for the unpenalized
incidence coefficients for the incidence portion of the model.
\item \code{beta_u} a numeric vector for unpenalized
latency coefficients in the latency portion of the model.
\item \code{lambda} a numeric value for the rate parameter when fitting
either a Weibull or exponential MCM using \code{model = "weibull"} or
\code{model = "exponential"}.
\item \code{alpha} a numeric value for the shape parameter when fitting a
Weibull MCM using \code{model = "weibull"}.
\item \code{survprob} a numeric vector for the
latency survival probabilities \eqn{S_u(t_i|w_i)} for i=1,...,N when fitting
a Cox MCM \code{model = "cox"}.
}
Penalized coefficients are initialized to zero. If \code{inits} is not specified or improperly specified, initialization is
automatically provided by the function.}

\item{lambda_inc_list}{a numeric vector used to search for the optimal
\eqn{\lambda_b} tuning parameter. If not supplied, the function computes a
\eqn{\lambda_b} sequence based on \code{nlambda_inc} and
\code{lambda_min_ratio_inc}. If \code{grid_tuning = FALSE}, the same sequence
should be used for both \eqn{\lambda_b} and \eqn{\lambda_{\beta}}.}

\item{lambda_lat_list}{a numeric vector used to search for the optimal
\eqn{\lambda_{\beta}} tuning parameter. If not supplied, the function
computes a \eqn{\lambda_{\beta}} sequence based on \code{nlambda_lat} and
\code{lambda_min_ratio_lat}. If \code{grid_tuning = FALSE}, the same sequence
should be used for both \eqn{\lambda_b} and \eqn{\lambda_{\beta}}.}

\item{nlambda_inc}{an integer specifying the number of values to search for
the optimal \eqn{\lambda_b} tuning parameter; default is 10 if
\code{grid_tuning = TRUE} and 50 otherwise.}

\item{nlambda_lat}{an integer specifying the number of values to search
for the optimal \eqn{\lambda_{\beta}} tuning parameter; default is 10 if
\code{grid_tuning = TRUE} and 50 otherwise.}

\item{gamma_inc}{numeric value for the penalization parameter \eqn{\gamma}
for variables in the incidence portion of the model when
\code{penalty = "MCP"} or \code{penalty = "SCAD"} (default is 3).}

\item{gamma_lat}{numeric value for the penalization parameter \eqn{\gamma}
for variables in the latency portion of the model when \code{penalty = "MCP"}
or \code{penalty = "SCAD"} (default is 3).}

\item{lambda_min_ratio_inc}{numeric value in (0,1) representing the smallest
value for \eqn{\lambda_b} as a fraction of \code{lambda.max_inc}, the
data-derived entry value at which essentially all penalized variables in the
incidence portion of the model have a coefficient estimate of 0 (default is
0.1).}

\item{lambda_min_ratio_lat}{numeric value in (0,1) representing the smallest
value for \eqn{\lambda_{\beta}} as a fraction of \code{lambda.max_lat}, the
data-derived entry value at which essentially all penalized variables in the
latency portion of the model have a coefficient estimate of 0 (default is
0.1).}

\item{n_folds}{an integer specifying the number of folds for the k-fold
cross-validation procedure (default is 5).}

\item{measure_inc}{character string specifying the evaluation criterion used
in selecting the optimal \eqn{\lambda_b} which can be either
\itemize{
\item \code{"c"} specifying to use the C-statistic for cure status
weighting (CSW) method proposed by Asano and Hirakawa (2017) to
select both \eqn{\lambda_b} and \eqn{\lambda_{\beta}}
\item \code{"auc"} specifying to use the AUC for cure prediction using the
mean score imputation (MSI) method proposed by Asano et al. (2014) to select
\eqn{\lambda_b} while the C-statistic with CSW is used for
\eqn{\lambda_{\beta}}.
}}

\item{one_se}{logical, if TRUE then the one standard error rule is applied
for selecting the optimal parameters. The one standard error rule selects the
most parsimonious model having evaluation criterion no more than one standard
error worse than that of the best evaluation criterion (default is FALSE).}

\item{cure_cutoff}{numeric value representing the cutoff time; subjects not
experiencing the event by this time are considered cured. This
value is used to produce a proxy for the unobserved cure status when
calculating C-statistic and AUC (default is 5 representing 5 years). Users
should be careful to note the time scale of their data and adjust this
according to the time scale and clinical application.}

\item{parallel}{logical. If TRUE, parallel processing is performed for K-fold
CV using \code{foreach} and the \pkg{doParallel} package is required (default
is FALSE).}

\item{seed}{optional integer representing the random seed. Setting the random
seed fosters reproducibility of the results.}

\item{verbose}{logical, if TRUE running information is printed to the console
(default is TRUE).}

\item{na.action}{this function requires complete data so \code{"na.omit"} is
invoked. Users can impute missing data as an alternative prior to model fitting.}

\item{...}{additional arguments.}
}
\value{
\item{b0}{Estimated intercept for the incidence portion of the
model.}

\item{b}{Estimated coefficients for the incidence portion of the
model.}

\item{beta}{Estimated coefficients for the latency portion of the
model.}

\item{alpha}{Estimated shape parameter if the Weibull model is fit.}

\item{rate}{Estimated rate parameter if the Weibull or exponential
model is fit.}

\item{logLik_inc}{Expected penalized complete-data log-likelihood for
the incidence portion of the model.}

\item{logLik_lat}{Expected penalized complete-data log-likelihood for
the latency portion of the model.}

\item{selected_lambda_inc}{Value of \eqn{\lambda_b} selected using
cross-validation. NULL when fdr_control is TRUE.}

\item{selected_lambda_lat}{Value of \eqn{\lambda_{\beta}} selected
using cross-validation. NULL when fdr_control is TRUE.}

\item{max_c}{Maximum C-statistic achieved.}

\item{max_auc}{Maximum AUC for cure prediction achieved; only output
when \code{measure_inc="auc"}.}

\item{selected_index_inc}{Indices of selected variables for the
incidence portion of the model when \code{fdr_control=TRUE}. If no variables
are selected, \code{integer(0)} will be returned.}

\item{selected_index_lat}{Indices of selected variables for the
latency portion of the model when \code{fdr_control=TRUE}. If no variables
are selected, \code{integer(0)} will be returned.}

\item{call}{the matched call.}
}
\description{
Fits penalized parametric and semi-parametric mixture cure models (MCM)
using the E-M algorithm with k-fold cross-validation for parameter
tuning. The lasso (L1), MCP and SCAD penalties are supported for the Cox MCM
while only lasso is currently supported for parametric MCMs. When FDR
controlled variable selection is used, the model-X knockoffs method is
applied and indices of selected variables are returned.
}
\examples{
library(survival)
withr::local_seed(1234)
temp <- generate_cure_data(n = 200, j = 25, n_true = 5, a = 1.8)
training <- temp$training
fit.cv <- cv_cureem(Surv(Time, Censor) ~ .,
  data = training,
  x_latency = training, fdr_control = FALSE,
  grid_tuning = FALSE, nlambda_inc = 10, nlambda_lat = 10,
  n_folds = 2, seed = 23, verbose = TRUE
)
fit.cv.fdr <- cv_cureem(Surv(Time, Censor) ~ .,
  data = training,
  x_latency = training, model = "weibull", penalty = "lasso",
  fdr_control = TRUE, grid_tuning = FALSE, nlambda_inc = 10,
  nlambda_lat = 10, n_folds = 2, seed = 23, verbose = TRUE)
}
\references{
Archer, K. J., Fu, H., Mrozek, K., Nicolet, D., Mims, A. S.,
Uy, G. L., Stock, W., Byrd, J. C., Hiddemann, W., Braess, J.,
Spiekermann, K., Metzeler, K. H., Herold, T., Eisfeld, A.-K. (2024)
Identifying long-term survivors and those at higher or lower risk of relapse
among patients with cytogenetically normal acute myeloid leukemia using a
high-dimensional mixture cure model. \emph{Journal of Hematology & Oncology},
\bold{17}:28.
}
\seealso{
\code{\link{cureem}}
}
\keyword{models}
\keyword{regression}
