% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/model_train.R
\name{ModelTrain}
\alias{ModelTrain}
\alias{ModelTrain.default}
\alias{ModelTrain.character}
\alias{ModelTrain.data.frame}
\title{Fit predictive models to sets of descriptors}
\usage{
ModelTrain(...)

\method{ModelTrain}{default}(
  x,
  y,
  nfolds = 10,
  nsplits = 3,
  seed.in = NA,
  des.names = NA,
  models = c("NNet", "PLS", "LAR", "Lasso", "PLSLDA", "Tree", "SVM", "KNN", "RF"),
  user.params = NULL,
  verbose = FALSE,
  ...
)

\method{ModelTrain}{character}(
  descriptors,
  y,
  mols,
  nfolds = 10,
  nsplits = 3,
  seed.in = NA,
  des.names = NA,
  models = c("NNet", "PLS", "LAR", "Lasso", "PLSLDA", "Tree", "SVM", "KNN", "RF"),
  user.params = NULL,
  verbose = FALSE,
  ...
)

\method{ModelTrain}{data.frame}(
  d,
  ids = FALSE,
  xcol.lengths = ifelse(ids, length(d) - 2, length(d) - 1),
  xcols = NA,
  nfolds = 10,
  nsplits = 3,
  seed.in = NA,
  des.names = NA,
  models = c("NNet", "PLS", "LAR", "Lasso", "PLSLDA", "Tree", "SVM", "KNN", "RF"),
  user.params = NULL,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{...}{Additional parameters.}

\item{x}{a list of numeric descriptor set matrices.  At the moment, only
binary and continuous descriptors are supported.  Binary descriptors should
be numeric (0 or 1).}

\item{y}{a numeric vector containing the binary or continuous response.}

\item{nfolds}{the number of folds to use for each cross
validation split.}

\item{nsplits}{the number of splits to use for repeated
cross validation.}

\item{seed.in}{a numeric vector with length equal to \code{nsplits}.
The seeds are used to randomly assign folds to observations for each
repeated cross-validation split. If \code{NA}, the first seed will be 
11111, the second will be 22222, and so on.}

\item{des.names}{a character vector specifying the names for each
descriptor
set.  The length of the vector must match the number of descriptor sets.
If \code{NA}, each descriptor set will be named "Descriptor Set i", where
i is the number of the descriptor set.}

\item{models}{a character vector specifying the regression or
classification models to use.  The strings must match models
implemented in \pkg{chemmodlab} (see Details).}

\item{user.params}{a list of data frames where each data frame contains
the parameter values for a model.  The list should have the format of
the list constructed by  \code{\link{MakeModelDefaults}}. One can construct
a list of parameters using  \code{\link{MakeModelDefaults}} and then
modify the parameters.}

\item{verbose}{a logical.  Should verbose mode be used?}

\item{descriptors}{descriptor sets to compute}

\item{mols}{molecule file created by rcdk}

\item{d}{a data frame containing an (optional) ID column,
a response column, and descriptor columns.  The columns should be
provided in this order.}

\item{ids}{a logical.  Is an ID column provided?}

\item{xcol.lengths}{a vector of integers.  It is assumed that the columns
in \code{d} are grouped by descriptor set.  The integers specify the
number of descriptors in each descriptor set.  They should be ordered as
the descriptor sets are ordered in \code{d}.
Users can specify multiple descriptor sets. By default there is one
descriptor set, namely all columns in \code{d} except the response
column and
the optional ID column.  Specify \code{xcol.lengths} or \code{xcols},
but not both.}

\item{xcols}{a list of integer vectors.  Each vector contains
the column indices
of \code{d} where a set of descriptor variables is located.
Users can specify multiple descriptor sets.  Specify \code{xcol.lengths} or \code{xcols},
but not both.}
}
\value{
A list is returned of class \code{\link{chemmodlab}} containing:
 \item{all.preds}{a list of lists of data frames.  The elements of the outer
  list correspond to each CV split performed by \code{\link{ModelTrain}}. The
  elements of the inner list correspond to each descriptor set.  For each
  descriptor set and CV split combination, the output is a dataframe
  containing all model predictions.  The first column of each data frame
  contains the true value of the response.  The remaining columns contain
  the predictions for each model.}
\item{all.probs}{a list of lists of data frames. Constructed only if there is
  a binary response.  The structure is the same as \code{all.preds}, except
  that predictions are replaced by "predicted probabilities" (i.e. estimated
  probabilities of a response
  value of one).  Predicted
  probabilities are only reported for classification models.}
\item{model.acc}{a list of lists of model accuracy measures.  The elements of
  the outer list correspond to each CV split performed by \code{ModelTrain}.
  The elements of the inner list correspond to each descriptor set.  For each
  descriptor set and CV split combination, a limited collection of 
  performance measures is given for each model fit
  to the data.  Regression models are assessed with Pearson's \eqn{r} and
  \eqn{RMSE}. Classification models are assessed with contingency tables.
  For additional model performance measures, see \code{\link{Performance}}.}
\item{classify}{a logical.  Were classification models used for binary
  response?}
\item{responses}{a numeric vector.  The observed value of the response.}
\item{data}{a list of numeric matrices.  Each matrix is a descriptor set used
  as model input.}
\item{params}{a list of data frames as made by
  \code{\link{MakeModelDefaults}}.  Each data frame contains the parameters to
  be set for a particular model.}
\item{des.names}{a character vector specifying the descriptor set names.
  \code{NA} if unspecified.}
\item{models}{a character vector specifying the models fit to the data.}
\item{nsplits}{number of CV splits performed.}
}
\description{
\code{ModelTrain} is a generic S3 function that fits a series of 
classification or regression
models to sets of descriptors and computes cross-validated measures
of model performance.
}
\details{
Multiple descriptor sets can be specified
by the user. For each descriptor set, repeated k-fold cross validation
is performed for the specified regression and/or classification
models.

Not all modeling strategies will be appropriate for all response
types. For example, partial least squares linear discriminant analysis
("PLSLDA")
is not directly appropriate for continuous response assays such as
percent inhibition, but it can be applied once a threshold value for
percent inhibition is used to create a binary (active/inactive) response.

See \url{https://jrash.github.io/chemmodlab/} for more 
information about the
models available (including model default parameters).
The default value for the argument \code{models} includes only some of 
the possible values.

Sensible default values are selected for each
tunable model parameter; however, users may set any parameter
manually using \code{\link{MakeModelDefaults}} and \code{user.params}.

\code{\link{ModelTrain}} predictions are based on k-fold cross-validation,
where the dataset is randomly divided into k parts, each containing
approximately equal numbers of compounds. Treating one of these parts
as a "test set", the remaining
k-1 parts are combined together as a "training set"
and used to build a model from the desired modeling technique and
descriptor set. This model is then applied to the "test set" to obtain
predictions. The process is repeated, holding out each of the k parts
in turn. One advantage of k-fold cross-validation is reduction in bias
from using the same data to both build and assess a model. Another
advantage is the increased precision of error estimation offered by
k-fold cross validation over a one-time split.

Recognizing that the definition of folds in k-fold cross validation
may have an impact on the observed performance measures, all models
are built using the same definition of folds. This process is repeated
to obtain multiple separate k-fold cross validation runs resulting in
multiple separate definitions of folds.  The number of these "splits"
is specified by \code{nsplits}.

Observed performance measures are
assessed across all splits using \code{\link{CombineSplits}}.  This
function assesses how sensitive performance measures are to fold
assignments, or changes to the training and test sets. 
Statistical tests are used to determine the best performing model and
descriptor set combination.
}
\section{Methods (by class)}{
\itemize{
\item \code{default}: Default S3 method

\item \code{character}: S3 method for class 'character'

\item \code{data.frame}: S3 method for class 'data.frame'
}}

\examples{

\dontrun{
# A data set with a binary response and multiple descriptor sets
data(aid364)

cml <- ModelTrain(aid364, ids = TRUE, xcol.lengths = c(24, 147),
                  des.names = c("BurdenNumbers", "Pharmacophores"))
cml
}

# A continuous response
cml <- ModelTrain(USArrests, nsplits = 2, nfolds = 2,
                  models = c("KNN", "Lasso", "Tree"))
cml

}
\seealso{
\code{\link{chemmodlab}}, \code{\link{plot.chemmodlab}},
  \code{\link{CombineSplits}}
}
\author{
Jacqueline Hughes-Oliver, Jeremy Ash, Atina Brooks
}
