% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/explanatory_performance.R
\name{ExplanatoryPerformance}
\alias{ExplanatoryPerformance}
\title{Prediction performance in regression}
\usage{
ExplanatoryPerformance(
  xdata,
  ydata,
  stability = NULL,
  family = NULL,
  implementation = NULL,
  prediction = NULL,
  K = 1,
  tau = 0.8,
  seed = 1,
  n_thr = NULL,
  ij_method = FALSE,
  time = 1000
)
}
\arguments{
\item{xdata}{matrix of predictors with observations as rows and variables as
columns.}

\item{ydata}{optional vector or matrix of outcome(s). If \code{family} is set
to \code{"binomial"} or \code{"multinomial"}, \code{ydata} can be a vector
with character/numeric values or a factor.}

\item{stability}{output of \code{\link{VariableSelection}}. If
\code{stability=NULL} (the default), a model including all variables in
\code{xdata} as predictors is fitted. Argument \code{family} must be
provided in this case.}

\item{family}{type of regression model. Possible values include
\code{"gaussian"} (linear regression), \code{"binomial"} (logistic
regression), \code{"multinomial"} (multinomial regression), and
\code{"cox"} (survival analysis). If provided, this argument must be
consistent with input \code{stability}.}

\item{implementation}{optional function to recalibrate the model. If
\code{implementation=NULL} and \code{stability} is the output of
\code{\link{VariableSelection}}, \code{\link[stats]{lm}} (linear
regression), \code{\link[survival]{coxph}} (Cox regression),
\code{\link[stats]{glm}} (logistic regression), or
\code{\link[nnet]{multinom}} (multinomial regression) is used.}

\item{prediction}{optional function to compute predicted values from the
model recalibrated with \code{implementation}.}

\item{K}{number of training-test splits.}

\item{tau}{proportion of observations used in the training set.}

\item{seed}{value of the seed to ensure reproducibility of the results.}

\item{n_thr}{number of thresholds to use to construct the ROC curve. If
\code{n_thr=NULL}, all predicted probability values are iteratively used as
thresholds. For faster computations on large data, fewer thresholds can be
used. Only applicable to logistic regression.}

\item{ij_method}{logical indicating if the analysis should be done for only
one recalibration/test split, with the variance of the concordance index
computed using the infinitesimal jackknife method as implemented in
\code{\link[survival]{concordance}}. If \code{ij_method=FALSE} (the
default), the concordance indices computed for different recalibration/test
splits are reported. If \code{ij_method=TRUE}, the concordance index and
estimated confidence interval at level 0.05 are reported. Only applicable
to Cox regression.}

\item{time}{numeric indicating the time for which the survival probabilities
are computed. Only applicable to Cox regression.}
}
\value{
A list with: \item{TPR}{True Positive Rate (for logistic regression
  only).} \item{FPR}{False Positive Rate (for logistic regression only).}
  \item{AUC}{Area Under the Curve (for logistic regression only).}
  \item{concordance}{Concordance index (for Cox regression only).}
  \item{lower}{lower bound of the confidence interval at level 0.05 for the
  concordance index calculated using the infinitesimal jackknife (for Cox
  regression and with \code{ij_method=TRUE}).} \item{upper}{upper bound of
  the confidence interval at level 0.05 for the concordance index calculated
  using the infinitesimal jackknife (for Cox regression and with
  \code{ij_method=TRUE}).} \item{Beta}{matrix of estimated beta coefficients
  across the \code{K} iterations. Coefficients are extracted using the
  \code{\link[stats]{coef}} function.}
}
\description{
Calculates model performance for linear (measured by Q-squared), logistic
(AUC) or Cox (C-statistic) regression. This is done by (i) recalibrating the
model on a training set including a proportion \code{tau} of the
observations, and (ii) evaluating the performance on the remaining
observations (test set). For more reliable results, the procedure can be
repeated \code{K} times (default \code{K=1}).
}
\details{
For a fair evaluation of the prediction performance, the data is
  split into a training set (including a proportion \code{tau} of the
  observations) and a test set (remaining observations). The regression model
  is fitted on the training set and applied to the test set. Performance
  metrics are computed in the test set by comparing predicted and observed
  outcomes.

  For logistic regression, a Receiver Operating Characteristic (ROC) analysis
  is performed: the True and False Positive Rates (TPR and FPR), and Area
  Under the Curve (AUC) are computed for different thresholds in predicted
  probabilities.

  For Cox regression, the Concordance Index (as implemented in
  \code{\link[survival]{concordance}}) looking at survival probabilities up
  to a specific \code{time} is computed.

  For linear regression, the squared correlation between predicted and
  observed outcome in the test set (Q-squared) is reported.
}
\examples{
\donttest{
## Logistic regression

# Data simulation (seed fixed so that the example is reproducible)
set.seed(1)
simul <- SimulateRegression(n = 1000, pk = 10, family = "binomial")

# Balanced split: 50\% variable selection set and 50\% for evaluation of performances
# Rows indexed by ids_train form the selection set; all other rows form the
# evaluation set.
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "binomial"
)
xtrain <- simul$xdata[ids_train, ]
ytrain <- simul$ydata[ids_train, ]
xtest <- simul$xdata[-ids_train, ]
ytest <- simul$ydata[-ids_train, ]

# Stability selection (on the selection set only)
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "binomial")

# Evaluation of the performances on recalibrated models (K=1)
# With n_thr = NULL, all predicted probability values are used as thresholds
# for the ROC curve.
roc <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab, n_thr = NULL
)
PlotROC(roc)

# Using more recalibration/test splits (K=100)
roc <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab, K = 100
)
boxplot(roc$AUC, ylab = "AUC")
PlotROC(roc)

# Comparison with saturated model
# No stability object is supplied, so a model including all variables in xdata
# is fitted; the family argument must be provided in this case.
roc <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  family = "binomial", K = 100
)
PlotROC(roc, col = "blue", col_band = "blue", add = TRUE)


## Partial Least Squares (single component)
# This section reuses xtrain/ytrain/xtest/ytest from the logistic regression
# example above (binary outcome).

# Stability selection with SparsePLS passed as the underlying implementation
stab <- VariableSelection(
  xdata = xtrain, ydata = ytrain,
  implementation = SparsePLS,
  family = "binomial"
)
# Printing the output of SelectedVariables for this stability object
print(SelectedVariables(stab))

# Defining wrapping functions for PLS-DA
# Recalibration wrapper: fits a PLS-DA model with a single component (ncomp = 1)
# on xdata, with ydata converted to a factor. The family argument is accepted
# in the signature but is not used in the body.
PLSDA <- function(xdata, ydata, family = "binomial") {
  mixOmics::plsda(X = xdata, Y = as.factor(ydata), ncomp = 1)
}
# Prediction wrapper: keeps only the columns of xdata named by the row names of
# the X loadings of the model, calls predict on them, and returns the [, 2, 1]
# slice of the predict element (presumably second outcome level, first
# component; to be confirmed against the mixOmics documentation).
PredictPLSDA <- function(xdata, model) {
  kept_columns <- rownames(model$loadings$X)
  output <- predict(object = model, newdata = xdata[, kept_columns, drop = FALSE])
  output$predict[, 2, 1]
}

# Evaluation of the performances on recalibrated models (K=1)
# The model is recalibrated with the user-defined PLSDA function and predicted
# values are computed from it with PredictPLSDA.
roc <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab,
  implementation = PLSDA, prediction = PredictPLSDA
)
PlotROC(roc)


## Cox regression

# Data simulation
# A binary outcome is simulated and used as the case indicator; it is combined
# with uniformly drawn times to build a two-column (time, case) outcome.
set.seed(1)
simul <- SimulateRegression(n = 500, pk = 50, family = "binomial")
ydata <- cbind(
  time = runif(nrow(simul$ydata), min = 100, max = 2000),
  case = simul$ydata[, 1]
) # including dummy time to event

# Balanced split: 50\% variable selection set and 50\% for evaluation of performances
# The split is drawn from the simulated binary outcome (simul$ydata), and the
# resulting row indices are then applied to the two-column ydata.
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "binomial"
)
xtrain <- simul$xdata[ids_train, ]
ytrain <- ydata[ids_train, ]
xtest <- simul$xdata[-ids_train, ]
ytest <- ydata[-ids_train, ]

# Stability selection
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "cox")

# Evaluation of the performances on recalibrated models (K=1)
# With ij_method = TRUE, the concordance index and its estimated confidence
# interval (infinitesimal jackknife) are reported for a single split.
perf <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab, ij_method = TRUE
)
print(perf)

# Using more recalibration/test splits
# time = 1000 (also the default) is the time for which survival probabilities
# are computed; one concordance index is reported per split.
perf <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab, K = 10, time = 1000
)
boxplot(perf$concordance)


## Linear regression

# Data simulation (seed fixed so that the example is reproducible)
set.seed(1)
simul <- SimulateRegression(n = 1000, pk = 10, family = "gaussian")

# Balanced split: 50\% variable selection set and 50\% for evaluation of performances
ids_train <- Resample(
  data = simul$ydata,
  tau = 0.5, family = "gaussian"
)
xtrain <- simul$xdata[ids_train, ]
ytrain <- simul$ydata[ids_train, ]
xtest <- simul$xdata[-ids_train, ]
ytest <- simul$ydata[-ids_train, ]

# Stability selection
stab <- VariableSelection(xdata = xtrain, ydata = ytrain, family = "gaussian")

# Evaluation of the performances on recalibrated models (K=1)
# For linear regression, the squared correlation between predicted and observed
# outcomes in the test set (Q-squared) is reported.
perf <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab
)
print(perf)


## Partial Least Squares (single component)
# This section reuses xtrain/ytrain/xtest/ytest from the linear regression
# example above (continuous outcome).

# Stability selection with SparsePLS passed as the underlying implementation
stab <- VariableSelection(
  xdata = xtrain, ydata = ytrain,
  implementation = SparsePLS,
  family = "gaussian"
)
# Printing the output of SelectedVariables for this stability object
print(SelectedVariables(stab))

# Evaluation of the performances on recalibrated models (K=1)
# PLS and PredictPLS are not defined in this example; presumably they are
# provided by the package (to be confirmed).
perf <- ExplanatoryPerformance(
  xdata = xtest, ydata = ytest,
  stability = stab,
  implementation = PLS, prediction = PredictPLS
)
print(perf)
}

}
\seealso{
\code{\link{VariableSelection}}, \code{\link{Recalibrate}}

Other prediction performance functions: 
\code{\link{Incremental}()},
\code{\link{PlotIncremental}()},
\code{\link{PlotROC}()},
\code{\link{ROC}()}
}
\concept{prediction performance functions}
