% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/searchOptimalConfiguration.R
\name{searchOptimalConfiguration}
\alias{searchOptimalConfiguration}
\title{Search for an Optimal Multi-Regime (Shift) Configuration on a Phylogeny}
\usage{
searchOptimalConfiguration(
  baseline_tree,
  trait_data,
  formula = "trait_data ~ 1",
  min_descendant_tips,
  num_cores = 2,
  ic_uncertainty_threshold = 1,
  shift_acceptance_threshold = 1,
  uncertaintyweights = FALSE,
  uncertaintyweights_par = FALSE,
  plot = FALSE,
  IC = "GIC",
  store_model_fit_history = TRUE,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{baseline_tree}{A rooted SIMMAP/\code{phylo} object representing the baseline
(single-regime) tree. If not SIMMAP-initialized, it should already be painted to a
single baseline state and have tip order matching \code{trait_data}.}

\item{trait_data}{A \code{matrix} or \code{data.frame} of continuous trait values with row
names matching \code{baseline_tree$tip.label} (same order). For the default
\code{formula = "trait_data ~ 1"}, \code{trait_data} is typically supplied as a numeric
matrix so that the multivariate response is interpreted correctly by \code{mvgls()}.
When using more general formulas (e.g., pGLS-style models), a \code{data.frame} with
named columns can be used instead.}

\item{formula}{Character formula passed to \code{mvgls}. Defaults to
\code{"trait_data ~ 1"}, which fits an intercept-only model treating the supplied
multivariate trait matrix as the response. This is the appropriate choice for most
morphometric data where there are no predictor variables. For more general models,
\code{formula} can reference subsets of \code{trait_data} explicitly, for example
\code{"trait_data[, 1:5] ~ 1"} to treat columns 1–5 as a multivariate response, or
\code{"trait_data[, 1:5] ~ trait_data[, 6]"} to fit a multivariate pGLS with column 6
as a predictor.}

\item{min_descendant_tips}{Integer (\eqn{\ge}1). Minimum number of tips required for an internal node
to be considered as a candidate shift (forwarded to \code{generatePaintedTrees}). Larger values
reduce the number of candidate shifts by excluding very small clades. For empirical datasets,
values around \code{10} are a reasonable starting choice and can be tuned in sensitivity analyses.}

\item{num_cores}{Integer. Number of workers for parallel candidate scoring. Uses
\code{future::plan(multicore)} on Unix outside \code{RStudio}; otherwise uses
\code{future::plan(multisession)}. During the parallel candidate-scoring blocks, BLAS/OpenMP
threads are capped to 1 (per worker) to avoid CPU oversubscription.}

\item{ic_uncertainty_threshold}{Numeric (\eqn{\ge}0). Reserved for future development
in post-search pruning and uncertainty analysis; currently not used by
\code{searchOptimalConfiguration()}.}

\item{shift_acceptance_threshold}{Numeric (\eqn{\ge}0). Minimum IC improvement
(baseline - new) required to accept a candidate shift during the forward search.
Larger values yield more conservative models. For analyses based on the Generalized
Information Criterion (\code{"GIC"}), a threshold on the order of \code{20} units is a
conservative choice that tends to admit only strongly supported shifts. Simulation
studies (Berv et al., in preparation) suggest that this choice yields good balanced
accuracy between detecting true shifts and avoiding false positives, but users should
explore alternative thresholds in sensitivity analyses for their own datasets.}

\item{uncertaintyweights}{Logical. If \code{TRUE}, compute per-shift IC weights serially by
refitting the optimized model with each shift removed in turn. Exactly one of
\code{uncertaintyweights} or \code{uncertaintyweights_par} must be \code{TRUE} to trigger
IC-weight calculations; setting both to \code{TRUE} will result in an error. When enabled,
the per-shift weights are returned in the \code{$ic_weights} component of the result.}

\item{uncertaintyweights_par}{Logical. As above, but compute per-shift IC weights in parallel
using \pkg{future.apply}. Exactly one of \code{uncertaintyweights} or
\code{uncertaintyweights_par} must be \code{TRUE} to trigger IC-weight calculations.}

\item{plot}{Logical. If \code{TRUE}, draw/update a SIMMAP plot as the search proceeds
(requires \pkg{phytools}).}

\item{IC}{Character. Which information criterion to use, one of \code{"GIC"} or \code{"BIC"}
(case-sensitive).}

\item{store_model_fit_history}{Logical. If \code{TRUE}, store a per-iteration record of fitted
models, acceptance decisions, and IC values. To keep memory usage low during the search,
per-iteration results are written to a temporary directory (\code{tempdir()}) and read back
into memory at the end of the run.}

\item{verbose}{Logical. If \code{TRUE}, report progress during candidate generation and model
fitting. By default, progress is emitted via \code{message()}. When \code{plot = TRUE} in an
interactive \code{RStudio} session, progress is written via \code{cat()} so it remains visible
while plots are updating. Set to \code{FALSE} to run quietly (default). Use
\code{suppressMessages()} (and \code{capture.output()} if needed) to silence or capture output.}

\item{...}{Additional arguments passed to \code{\link[mvMORPH]{mvgls}} (e.g., \code{method},
\code{penalty}, \code{target}, \code{error}, etc.).}
}
\value{
A named \code{list} with (at minimum):
\itemize{
\item \code{user_input}: captured call (as a list) for reproducibility.
\item \code{tree_no_uncertainty_transformed}: SIMMAP tree from the optimal (no-uncertainty) model
on the transformed scale used internally by \code{mvgls}.
\item \code{tree_no_uncertainty_untransformed}: same topology with original edge lengths restored.
\item \code{model_no_uncertainty}: the final \code{mvgls} model object.
\item \code{shift_nodes_no_uncertainty}: integer vector of accepted shift nodes.
\item \code{optimal_ic}: final IC value; \code{baseline_ic}: baseline IC.
\item \code{IC_used}: \code{"GIC"} or \code{"BIC"}; \code{num_candidates}: count of candidate one-shift models evaluated.
\item \code{model_fit_history}: if \code{store_model_fit_history = TRUE}, a list of per-iteration fits
(loaded from temporary files written during the run) and an \code{ic_acceptance_matrix}
(IC value and acceptance flag per step).
\item \code{VCVs}: named list of regime-specific VCV matrices extracted from the final model
(penalized-likelihood estimates if PL was used).
}
Additional components appear conditionally:
\itemize{
\item \code{ic_weights}: a \code{data.frame} of per-shift IC weights and evidence ratios when
\code{uncertaintyweights} or \code{uncertaintyweights_par} is \code{TRUE}.
\item \code{warnings}: character vector of warnings/errors encountered during fitting (if any).
}
}
\description{
Greedy, stepwise search for evolutionary regime shifts on a SIMMAP-style phylogeny
using multivariate \code{mvgls} fits from \pkg{mvMORPH}. The routine:
\enumerate{
\item builds one-shift candidate trees for all internal nodes meeting a tip-size threshold
(via \code{generatePaintedTrees}),
\item fits each candidate in parallel and ranks them by improvement in the chosen
information criterion (IC; \code{GIC} or \code{BIC}),
\item iteratively adds shifts that pass a user-defined acceptance threshold,
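\item (planned; not performed by the current implementation) revisits accepted shifts to prune overfitting using a small IC tolerance window (cf. \code{ic_uncertainty_threshold}, which is currently unused),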
\item optionally computes per-shift IC weights by refitting the model with each shift removed.
}

Models are fitted directly in multivariate trait space (no PCA), assuming a multi-rate
Brownian Motion with proportional VCV scaling across regimes. Extra arguments in \code{...}
are forwarded to \code{\link[mvMORPH]{mvgls}} (e.g., \code{method = "LL"} or
\code{method = "PL-LOOCV"}, \code{penalty}, \code{error = TRUE}, etc.).
}
\details{
\strong{Input requirements.}
\itemize{
\item \emph{Tree:} \code{baseline_tree} should be a rooted \code{phylo} (or SIMMAP-style) tree
with branch lengths interpreted in units of time. An ultrametric tree is not required.
\item \emph{Trait data alignment:} \code{rownames(trait_data)} must match
\code{baseline_tree$tip.label} in both names and order; any tips without data should be
pruned beforehand.
\item \emph{Data type:} \code{trait_data} is typically a numeric matrix of continuous traits;
high-dimensional settings (p \eqn{\ge} n) are supported via penalized-likelihood
\code{mvgls()} fits.
}

\strong{Search outline.}
\enumerate{
\item \emph{Baseline:} Fit \code{mvgls} on the baseline tree (single regime) to obtain the baseline IC.
\item \emph{Candidates:} Build one-shift trees for eligible internal nodes
(\code{generatePaintedTrees}); fit each with
\code{fitMvglsAndExtractGIC.formula} or \code{fitMvglsAndExtractBIC.formula}
(internal helpers; not exported) and rank by \eqn{\Delta}IC.
\item \emph{Greedy add:} Add the top candidate, refit, and accept if
\eqn{\Delta}IC \eqn{\ge} \code{shift_acceptance_threshold}; continue down the ranked list.
\item \emph{Optional IC weights:} If \code{uncertaintyweights} (or \code{uncertaintyweights_par})
is \code{TRUE}, compute an IC weight for each accepted shift by refitting the final model with that
shift removed and comparing the two ICs via \code{\link[mvMORPH]{aicw}}.
}

\strong{Parallelization.} Candidate sub-model fits are distributed with \pkg{future} + \pkg{future.apply}.
On Unix outside \code{RStudio}, \code{multicore} is used; otherwise (e.g., on Windows or inside \code{RStudio}), \code{multisession}. A sequential plan is restored afterward.

\strong{Plotting.} If \code{plot = TRUE}, trees are rendered with
\code{\link[phytools]{plotSimmap}()}; shift IDs are labeled with \code{\link[ape]{nodelabels}()}.

\strong{Regime VCVs.} The returned \code{$VCVs} are extracted from the fitted multi-regime model via
\code{extractRegimeVCVs} and reflect regime-specific covariance
estimates (when \code{mvgls} is fitted under a PL/ML method).

For high-dimensional trait datasets (p \eqn{\ge} n), penalized-likelihood settings in
\code{mvgls()} are often required for stable estimation. In practice, penalized methods such as
\code{method = "PL-LOOCV"} or \code{method = "H&L"} combined with appropriate penalties (e.g.,
ridge-type penalties) have proven effective for intercept-only multivariate Brownian
motion models, as illustrated in the package vignettes; note that \code{method = "LL"} is the
conventional (unpenalized) log-likelihood in \code{mvgls()}. Users should consult the
\pkg{mvMORPH} documentation for details on available methods and penalties and
tune these choices to the structure of their data.
}
\note{
Internally, this routine coordinates multiple unexported helper functions:
\code{generatePaintedTrees}, \code{fitMvglsAndExtractGIC.formula},
\code{fitMvglsAndExtractBIC.formula}, \code{addShiftToModel},
\code{removeShiftFromTree}, and \code{extractRegimeVCVs}. Through these,
it may also invoke lower-level utilities such as \code{paintSubTree_mod}
and \code{paintSubTree_removeShift}. These helpers are internal
implementation details and are not part of the public API.
}
\section{Convergence and robustness}{

The search is greedy and may converge to a local optimum. Use a stricter
\code{shift_acceptance_threshold} to reduce overfitting, and re-run the search
with different \code{min_descendant_tips} and IC choices (\code{"GIC"} vs \code{"BIC"})
to assess stability of the inferred shifts. For a given run, the optional IC-weight
calculations (\code{uncertaintyweights} or \code{uncertaintyweights_par}) can be used
to quantify support for individual shifts. It is often helpful to repeat the analysis
under slightly different settings (e.g., thresholds or candidate-size constraints) and
compare the resulting sets of inferred shifts.
}

\examples{
library(ape)
library(phytools)
library(mvMORPH)
set.seed(1)

# Simulate a tree
tr <- pbtree(n = 50, scale = 1)

# Define two regimes: "0" (baseline) and "1" (high-rate) on a subset of tips
states <- setNames(rep("0", Ntip(tr)), tr$tip.label)
high_clade_tips <- tr$tip.label[1:20]
states[high_clade_tips] <- "1"

# Make a SIMMAP tree for the BMM simulation
simmap <- phytools::make.simmap(tr, states, model = "ER", nsim = 1)

# Simulate traits under a BMM model with ~10x higher rate in regime "1"
sigma <- list(
  "0" = diag(0.1, 2),
  "1" = diag(1.0, 2)
)
theta <- c(0, 0)

sim <- mvMORPH::mvSIM(
  tree  = simmap,
  nsim  = 1,
  model = "BMM",
  param = list(
    ntraits = 2,
    sigma   = sigma,
    theta   = theta
  )
)

# mvSIM returns either a matrix or a list of matrices depending on mvMORPH version
X <- if (is.list(sim)) sim[[1]] else sim
rownames(X) <- simmap$tip.label

# Run the search on the unpainted tree (single baseline regime)
res <- searchOptimalConfiguration(
  baseline_tree              = as.phylo(simmap),
  trait_data                 = X,
  formula                    = "trait_data ~ 1",
  min_descendant_tips        = 10,
  num_cores                  = 1,   # keep it simple / CRAN-safe
  shift_acceptance_threshold = 20,  # conservative GIC threshold
  IC                         = "GIC",
  plot                       = FALSE,
  store_model_fit_history    = FALSE,
  verbose                    = FALSE
)

res$shift_nodes_no_uncertainty
res$optimal_ic - res$baseline_ic
str(res$VCVs)
}
\seealso{
\code{\link[mvMORPH]{mvgls}}, \code{\link[mvMORPH]{GIC}}, \code{\link[stats]{BIC}},
\code{\link{plot_ic_acceptance_matrix}} for visualizing IC trajectories and shift
acceptance decisions, and \code{\link{generateViridisColorScale}} for mapping
regime-specific rates or parameters to a viridis color scale when plotting trees;
packages: \pkg{mvMORPH}, \pkg{future}, \pkg{future.apply}, \pkg{phytools}, \pkg{ape}.
}
