% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/BIOMOD_CrossValidation.R
\name{BIOMOD_CrossValidation}
\alias{BIOMOD_CrossValidation}
\title{Custom models cross-validation procedure}
\usage{
BIOMOD_CrossValidation(
  bm.format,
  k = 5,
  nb.rep = 5,
  do.stratification = FALSE,
  method = "both",
  balance = "presences",
  do.full.models = TRUE
)
}
\arguments{
\item{bm.format}{a \code{\link{BIOMOD.formated.data-class}} or \code{\link{BIOMOD.formated.data.PA-class}} 
object returned by the \code{\link{BIOMOD_FormatingData}} function}

\item{k}{an \code{integer} corresponding to the number of bins/partitions for k-fold CV}

\item{nb.rep}{an \code{integer} corresponding to the number of repetitions of k-fold CV 
(\emph{set to \code{1} if \code{do.stratification = TRUE}})}

\item{do.stratification}{a \code{logical} defining whether stratified CV should be run}

\item{method}{a \code{character} corresponding to the CV stratification method (\emph{if 
\code{do.stratification = TRUE}}), must be \code{x}, \code{y}, \code{both}, \code{block} 
or the name of a predictor for environmental stratified CV}

\item{balance}{a \code{character} defining whether partitions should be balanced for 
\code{presences} or \code{absences} (resp. pseudo-absences or background)}

\item{do.full.models}{(\emph{optional, default} \code{TRUE}) \cr
A \code{logical} value defining whether models should also be calibrated and validated over 
the whole dataset or not}
}
\value{
A \code{matrix} or \code{data.frame} with \code{k * nb.rep} (\emph{+ 1 if 
\code{do.full.models = TRUE}}) columns that can be given to \code{data.split.table} 
parameter of \code{\link{BIOMOD_Modeling}} function.
}
\description{
This function creates a \code{matrix} or \code{data.frame} that can be given to 
\code{data.split.table} parameter of \code{\link{BIOMOD_Modeling}} function to evaluate 
models with repeated k-fold or stratified cross-validation (CV) instead of repeated split samples.
}
\details{
\bold{Stratified cross-validation} may be used to test for model overfitting and to assess 
transferability in geographic and environmental space:
\itemize{
  \item \code{x} and \code{y} stratification was described in \emph{Wenger and Olden 2012} 
  (see  \href{https://biomodhub.github.io/biomod2/reference/BIOMOD_CrossValidation.html#References}{References}). While \code{y} 
  stratification uses \code{k} partitions along the y-gradient, \code{x} stratification does 
  the same for the x-gradient, and \code{both} combines them.
  \item \code{block} stratification was described in \emph{Muscarella et al. 2014} (see 
  \href{https://biomodhub.github.io/biomod2/reference/BIOMOD_CrossValidation.html#References}{References}). Four bins of equal size are 
  partitioned (bottom-left, bottom-right, top-left and top-right).
}

If \code{balance = 'presences'}, presences are divided (balanced) equally over the 
partitions (e.g. \emph{Fig. 1b in Muscarella et al. 2014}). Pseudo-absences will however be 
unbalanced over the partitions especially if the presences are clumped on an edge of the 
study area.

If \code{balance = 'absences'}, absences (resp. pseudo-absences or background) are divided 
(balanced) as equally as possible between the partitions (geographical balanced bins given 
that absences are spread over the study area equally, approach similar to \emph{Fig. 1 in 
Wenger and Olden 2012}). Presences will however be unbalanced over the partitions especially 
if the presences are clumped on an edge of the study area.
}
\examples{

library(terra)
# Load species occurrences (6 species available)
data(DataSpecies)
head(DataSpecies)

# Select the name of the studied species
myRespName <- 'GuloGulo'

# Get corresponding presence/absence data
myResp <- as.numeric(DataSpecies[, myRespName])

# Get corresponding XY coordinates
myRespXY <- DataSpecies[, c('X_WGS84', 'Y_WGS84')]

# Load environmental variables extracted from BIOCLIM (bio_3, bio_4, bio_7, bio_11 & bio_12)
data(bioclim_current)
myExpl <- terra::rast(bioclim_current)

\dontshow{
myExtent <- terra::ext(0,30,45,70)
myExpl <- terra::crop(myExpl, myExtent)
}

# ---------------------------------------------------------------
# Format Data with true absences
myBiomodData <- BIOMOD_FormatingData(resp.var = myResp,
                                     expl.var = myExpl,
                                     resp.xy = myRespXY,
                                     resp.name = myRespName)

# Create default modeling options
myBiomodOptions <- BIOMOD_ModelingOptions()


# ---------------------------------------------------------------
# Create the different validation datasets (default settings)
myBiomodCV <- BIOMOD_CrossValidation(bm.format = myBiomodData)
head(myBiomodCV)

# Several validation strategies can be combined
# 1. repeated random k-fold CV : 5 folds x 2 repetitions, without the full-dataset column
DataSplitTable.b <- BIOMOD_CrossValidation(bm.format = myBiomodData,
                                           k = 5,
                                           nb.rep = 2,
                                           do.full.models = FALSE)
# 2. stratified CV : 2 partitions along the y-gradient
#    (do.full.models is left to its default, TRUE)
DataSplitTable.y <- BIOMOD_CrossValidation(bm.format = myBiomodData,
                                           k = 2,
                                           do.stratification = TRUE,
                                           method = "y")
# Rename the two stratified partitions so that they do not clash with the random ones
colnames(DataSplitTable.y)[1:2] <- c("RUN11", "RUN12")
myBiomodCV <- cbind(DataSplitTable.b, DataSplitTable.y)
head(myBiomodCV)

# Model single models
myBiomodModelOut <- BIOMOD_Modeling(bm.format = myBiomodData,
                                    modeling.id = 'mod.CV',
                                    models = c('RF'),
                                    bm.options = myBiomodOptions,
                                    nb.rep = 2,
                                    data.split.table = myBiomodCV,
                                    metric.eval = c('TSS','ROC'),
                                    var.import = 0,
                                    do.full.models = FALSE,
                                    seed.val = 42)

# Get evaluation scores
myEval <- get_evaluations(myBiomodModelOut)
# Tag each model with the CV strategy of its run, based on the run number found in full.name
# (13 is presumably the full-dataset column of the combined table -- to be confirmed)
myEval$CV.strategy <- "Random"
myEval$CV.strategy[grepl("13", myEval$full.name)] <- "Full"
myEval$CV.strategy[grepl("11|12", myEval$full.name)] <- "Stratified"
head(myEval)

# Both TSS and ROC were computed : keep only the ROC scores so that the plotted values
# match the y-axis label (assumes the metric name is stored in the metric.eval column)
myEvalROC <- myEval[which(myEval$metric.eval == "ROC"), ]

boxplot(myEvalROC$calibration ~ interaction(myEvalROC$algo, myEvalROC$CV.strategy),
        xlab = "", ylab = "ROC AUC", col = rep(c("brown", "cadetblue"), 3))
boxplot(myEvalROC$validation ~ interaction(myEvalROC$algo, myEvalROC$CV.strategy),
        xlab = "", ylab = "ROC AUC", col = rep(c("brown", "cadetblue"), 3))



}
\references{
\itemize{
  \item Muscarella, R., Galante, P.J., Soley-Guardia, M., Boria, R.A., Kass, J.M., Uriarte, M. 
  & Anderson, R.P. (2014). ENMeval: An R package for conducting spatially independent 
  evaluations and estimating optimal model complexity for Maxent ecological niche models. 
  \emph{Methods in Ecology and Evolution}, \bold{5}, 1198-1205.
  \item Wenger, S.J. & Olden, J.D. (2012). Assessing transferability of ecological models: an 
  underappreciated aspect of statistical validation. \emph{Methods in Ecology and Evolution}, 
  \bold{3}, 260-267.
}
}
\seealso{
\code{\link[ENMeval]{get.block}}, \code{\link[dismo]{kfold}}, 
\code{\link{BIOMOD_FormatingData}}, \code{\link{BIOMOD_Modeling}}

Other Main functions: 
\code{\link{BIOMOD_EnsembleForecasting}()},
\code{\link{BIOMOD_EnsembleModeling}()},
\code{\link{BIOMOD_FormatingData}()},
\code{\link{BIOMOD_LoadModels}()},
\code{\link{BIOMOD_ModelingOptions}()},
\code{\link{BIOMOD_Modeling}()},
\code{\link{BIOMOD_PresenceOnly}()},
\code{\link{BIOMOD_Projection}()},
\code{\link{BIOMOD_RangeSize}()},
\code{\link{BIOMOD_Tuning}()}
}
\author{
Frank Breiner
}
\concept{Main functions}
