% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/orsf.R
\name{orsf}
\alias{orsf}
\alias{orsf_train}
\title{Oblique Random Forests}
\usage{
orsf(
  data,
  formula,
  control = NULL,
  weights = NULL,
  n_tree = 500,
  n_split = 5,
  n_retry = 3,
  n_thread = 0,
  mtry = NULL,
  sample_with_replacement = TRUE,
  sample_fraction = 0.632,
  leaf_min_events = 1,
  leaf_min_obs = 5,
  split_rule = NULL,
  split_min_events = 5,
  split_min_obs = 10,
  split_min_stat = NULL,
  oobag_pred_type = NULL,
  oobag_pred_horizon = NULL,
  oobag_eval_every = NULL,
  oobag_fun = NULL,
  importance = "anova",
  importance_max_pvalue = 0.01,
  group_factors = TRUE,
  tree_seeds = NULL,
  attach_data = TRUE,
  no_fit = FALSE,
  na_action = "fail",
  verbose_progress = FALSE,
  ...
)

orsf_train(object, attach_data = TRUE)
}
\arguments{
\item{data}{a \link{data.frame}, \link[tibble:tibble-package]{tibble}, or \link[data.table:data.table]{data.table} that contains the
relevant variables.}

\item{formula}{(\emph{formula}) Two sided formula with a single outcome.
The terms on the right are names of predictor variables, and the
symbol '.' may be used to indicate all variables in the data
except the response. The symbol '-' may also be used to indicate
removal of a predictor. Details on the response vary depending
on forest type:
\itemize{
\item \emph{Classification}: The response should be a single variable,
and that variable should have type \code{factor} in \code{data}.
\item \emph{Regression}: The response should be a single variable, and
that variable should have type \code{double} or \code{integer} with at
least 10 unique numeric values in \code{data}.
\item \emph{Survival}: The response should include a time variable,
followed by a status variable, and may be written inside a
call to \link[survival:Surv]{Surv} (see examples).
}}

\item{control}{(\emph{orsf_control}) An object returned from one of the
\code{orsf_control} functions: \link{orsf_control_survival},
\link{orsf_control_classification}, and \link{orsf_control_regression}. If
\code{NULL} (the default), an accelerated control is used, which is the
fastest available option. For survival and classification, this is
Cox and Logistic regression with 1 iteration, and for regression
it is ordinary least squares.}

\item{weights}{(\emph{numeric vector}) Optional. If given, this input should
have length equal to \code{nrow(data)} for complete or imputed data and should
have length equal to \code{nrow(na.omit(data))} if \code{na_action} is \code{"omit"}.
As the weights vector is used to count observations and events prior to
growing a node for a tree, \code{orsf()} scales \code{weights} so that
\code{sum(weights) == nrow(data)}. This helps to make tree depth consistent
between weighted and un-weighted fits.}

\item{n_tree}{(\emph{integer}) the number of trees to grow.
Default is \code{n_tree = 500}.}

\item{n_split}{(\emph{integer}) the number of cut-points assessed when splitting
a node in decision trees. Default is \code{n_split = 5}.}

\item{n_retry}{(\emph{integer}) when a node is splittable, but the current
linear combination of inputs is unable to provide a valid split, \code{orsf}
will try again with a new linear combination based on a different set
of randomly selected predictors, up to \code{n_retry} times. Default is
\code{n_retry = 3}. Set \code{n_retry = 0} to prevent any retries.}

\item{n_thread}{(\emph{integer}) number of threads to use while growing trees, computing predictions, and computing importance. Default is 0, which allows a suitable number of threads to be used based on availability.}

\item{mtry}{(\emph{integer}) Number of predictors randomly included as candidates
for splitting a node. The default is the smallest integer greater than
or equal to the square root of the number of total predictors, i.e.,
\verb{mtry = ceiling(sqrt(number of predictors))}}

\item{sample_with_replacement}{(\emph{logical}) If \code{TRUE} (the default),
observations are sampled with replacement when an in-bag sample
is created for a decision tree. If \code{FALSE}, observations are
sampled without replacement and each tree will have an in-bag sample
containing a proportion of the original sample equal to \code{sample_fraction}.}

\item{sample_fraction}{(\emph{double}) the proportion of observations that
each tree's in-bag sample will contain, relative to the number of
rows in \code{data}. Only used if \code{sample_with_replacement} is \code{FALSE}.
Default value is 0.632.}

\item{leaf_min_events}{(\emph{integer}) This input is only relevant for
survival analysis, and specifies the minimum number of events in a
leaf node. Default is \code{leaf_min_events = 1}.}

\item{leaf_min_obs}{(\emph{integer}) minimum number of observations in a
leaf node. Default is \code{leaf_min_obs = 5}.}

\item{split_rule}{(\emph{character}) how to assess the quality of a potential
splitting rule for a node. Valid options for survival are:
\itemize{
\item 'logrank' : a log-rank test statistic (default).
\item 'cstat'   : Harrell's concordance statistic.
}

For classification, valid options are:
\itemize{
\item 'gini'  : gini impurity (default)
\item 'cstat' : area underneath the ROC curve (AUC-ROC)
}

For regression, valid options are:
\itemize{
\item 'variance' : variance reduction (default)
}}

\item{split_min_events}{(\emph{integer}) minimum number of events required
in a node to consider splitting it. Default is \code{split_min_events = 5}.
This input is only relevant for survival trees.}

\item{split_min_obs}{(\emph{integer}) minimum number of observations required
in a node to consider splitting it. Default is \code{split_min_obs = 10}.}

\item{split_min_stat}{(\emph{double}) minimum test statistic required to split
a node. If no splits are found with a statistic exceeding \code{split_min_stat},
the given node either becomes a leaf or a retry occurs (up to \code{n_retry}
retries). Defaults are
\itemize{
\item 3.84 if \code{split_rule = 'logrank'}
\item 0.55 if \code{split_rule = 'cstat'} (see first note below)
\item 0.00 if \code{split_rule = 'gini'} (see second note below)
\item 0.00 if \code{split_rule = 'variance'}
}

\strong{Note 1} For C-statistic splitting, if C is < 0.50, we consider the statistic
value to be 1 - C to allow for good 'anti-predictive' splits. So,
if a C-statistic is initially computed as 0.1, it will be considered
as 1 - 0.10 = 0.90.

\strong{Note 2} For Gini impurity, values of 0 and 1 usually indicate the best and
worst possible scores, respectively. To make things simple and to avoid
introducing a \code{split_max_stat} input, we flip the values of Gini
impurity so that 1 and 0 indicate the best and worst possible scores,
respectively.}

\item{oobag_pred_type}{(\emph{character}) The type of out-of-bag predictions
to compute while fitting the ensemble. Valid options for any tree type:
\itemize{
\item 'none' : don't compute out-of-bag predictions
\item 'leaf' : the ID of the predicted leaf is returned for each tree
}

Valid options for survival:
\itemize{
\item 'risk' : probability of event occurring at or before
\code{oobag_pred_horizon} (default).
\item 'surv' : 1 - risk.
\item 'chf'  : cumulative hazard function at \code{oobag_pred_horizon}.
\item 'mort' : mortality, i.e., the number of events expected if all
observations in the training data were identical to a
given observation.
}

Valid options for classification:
\itemize{
\item 'prob'  : probability of each class (default)
\item 'class' : class (i.e., which.max(prob))
}

Valid options for regression:
\itemize{
\item 'mean' : mean value (default)
}}

\item{oobag_pred_horizon}{(\emph{numeric}) A numeric value indicating what time
should be used for out-of-bag predictions. Default is the median
of the observed times, i.e., \code{oobag_pred_horizon = median(time)}.
This input is only relevant for survival trees that have prediction
type of 'risk', 'surv', or 'chf'.}

\item{oobag_eval_every}{(\emph{integer}) The out-of-bag performance of the
ensemble will be checked every \code{oobag_eval_every} trees. So, if
\code{oobag_eval_every = 10}, then out-of-bag performance is checked
after growing the 10th tree, the 20th tree, and so on. Default
is \code{oobag_eval_every = n_tree}.}

\item{oobag_fun}{(\emph{function}) to be used for evaluating out-of-bag prediction accuracy every \code{oobag_eval_every}
trees. When \code{oobag_fun = NULL} (the default), the evaluation statistic is selected based on tree type
\itemize{
\item survival: Harrell's C-statistic (1982)
\item classification: Area underneath the ROC curve (AUC-ROC)
\item regression: Traditional prediction R-squared
}

If you use your own \code{oobag_fun}, note the following:
\itemize{
\item \code{oobag_fun} should have three inputs: \code{y_mat}, \code{w_vec}, and \code{s_vec}
\item For survival trees, \code{y_mat} should be a two column matrix with first column named 'time' and second named 'status'. For classification trees, \code{y_mat} should be a matrix with number of columns = number of distinct classes in the outcome. For regression, \code{y_mat} should be a matrix with one column.
\item \code{s_vec} is a numeric vector containing predictions
\item \code{oobag_fun} should return a numeric output of length 1
}

For more details, see the out-of-bag \href{https://docs.ropensci.org/aorsf/articles/oobag.html#user-supplied-out-of-bag-evaluation-functions}{vignette}.}

\item{importance}{(\emph{character}) Indicate method for variable importance:
\itemize{
\item 'none': no variable importance is computed.
\item 'anova': compute analysis of variance (ANOVA) importance
\item 'negate': compute negation importance
\item 'permute': compute permutation importance
}

For details on these methods, see \link{orsf_vi}.}

\item{importance_max_pvalue}{(\emph{double}) Only relevant if \code{importance}
is \code{"anova"}. The maximum p-value that will register as a positive
case when counting the number of times a variable was found to be
'significant' during tree growth. Default is 0.01, as recommended
by Menze et al.}

\item{group_factors}{(\emph{logical}) Only relevant if variable importance is
being estimated. If \code{TRUE}, the importance of factor variables will be reported overall by aggregating the importance of individual levels of the factor. If \code{FALSE}, the importance of individual factor levels will be returned.}

\item{tree_seeds}{(\emph{integer vector}) Optional. If specified, random seeds
will be set using the values in \code{tree_seeds[i]} before growing tree \code{i}.
Two forests grown with the same number of trees and the same seeds will
have the exact same out-of-bag samples, making out-of-bag error
estimates of the forests more comparable. If \code{NULL} (the default),
seeds are picked at random.}

\item{attach_data}{(\emph{logical}) if \code{TRUE}, a copy of the training
data will be attached to the output. This is required if you
plan on using functions like \link{orsf_pd_oob} or \link{orsf_summarize_uni}
to interpret the forest using its training data. Default is \code{TRUE}.}

\item{no_fit}{(\emph{logical}) if \code{TRUE}, model fitting steps are defined and
saved, but training is not initiated. The object returned can be
directly submitted to \code{orsf_train()} so long as \code{attach_data} is \code{TRUE}.}

\item{na_action}{(\emph{character}) what should happen when \code{data} contains missing values (i.e., \code{NA} values). Valid options are:
\itemize{
\item 'fail' : an error is thrown if \code{data} contains \code{NA} values
\item 'omit' : rows in \code{data} with incomplete data will be dropped
\item 'impute_meanmode' : missing values for continuous and categorical variables in \code{data} will be imputed using the mean and mode, respectively.
}}

\item{verbose_progress}{(\emph{logical}) if \code{TRUE}, progress messages are
printed in the console. If \code{FALSE} (the default), nothing is printed.}

\item{...}{Further arguments passed to or from other methods (not currently used).}

\item{object}{an untrained 'aorsf' object, created by setting
\code{no_fit = TRUE} in \code{orsf()}.}
}
\value{
an \emph{obliqueForest} object
}
\description{
Grow or specify an oblique random forest. While the name \code{orsf()}
implies that this function only works for survival forests,
it can be used for classification, regression, or survival
forests.
}
\details{
Why isn't this function called \code{orf()}? In its earlier versions, the
\code{aorsf} package was exclusively for \emph{o}blique \emph{r}andom \emph{s}urvival \emph{f}orests.

\strong{formula for survival oblique RFs}:
\itemize{
\item The response in \code{formula} can be a survival
object as returned by the \link[survival:Surv]{Surv} function,
but can also just be the time and status variables. I.e.,
\code{Surv(time, status) ~ .} works and \code{time + status ~ .} works.
\item The response can also be a survival object stored in \code{data}.
For example, \code{y ~ .} is a valid formula if \code{data$y} inherits
from the \code{Surv} class.
}

\strong{mtry}:

The \code{mtry} parameter may be temporarily reduced to ensure that linear
models used to find combinations of predictors remain stable. This occurs
because coefficients in linear model fitting algorithms may become infinite
if the number of predictors exceeds the number of observations.

\strong{oobag_fun}:

If \code{oobag_fun} is specified, it will be used to compute negation
importance or permutation importance, but it will not have any role
for ANOVA importance.

\strong{n_thread}:

If an R function is to be called from C++ (i.e., user-supplied function to
compute out-of-bag error or identify linear combinations of variables),
\code{n_thread} will automatically be set to 1 because attempting to run R
functions in multiple threads will cause the R session to crash.
}
\section{What is an oblique decision tree?}{


Decision trees are developed by splitting a set of training data into two
new subsets, with the goal of having more similarity within the new subsets
than between them. This splitting process is repeated on the resulting
subsets of data until a stopping criterion is met. When the new subsets of
data are formed based on a single predictor, the decision tree is said to
be axis-based because the splits of the data appear perpendicular to the
axis of the predictor. When linear combinations of variables are used
instead of a single variable, the tree is oblique because the splits of
the data are neither parallel nor at a right angle to the axis.

\emph{Figure} : Decision trees for classification with axis-based splitting
(left) and oblique splitting (right). Cases are orange squares; controls
are purple circles. Both trees partition the predictor space defined by
variables X1 and X2, but the oblique splits do a better job of separating
the two classes.

\if{html}{\figure{tree_axis_v_oblique.png}{options: width=95\%}}
}

\section{What is a random forest?}{


Random forests are collections of de-correlated decision trees.
Predictions from each tree are aggregated to make an ensemble
prediction for the forest. For more details, see Breiman (2001).
}

\section{Training, out-of-bag error, and testing}{


In random forests, each tree is grown with a bootstrapped version of
the training set. Because bootstrap samples are selected with replacement,
each bootstrapped training set contains about two-thirds of instances in
the original training set. The 'out-of-bag' data are instances that are
\emph{not} in the bootstrapped training set. Each tree in the random forest
can make predictions for its out-of-bag data, and the out-of-bag
predictions can be aggregated to make an ensemble out-of-bag prediction.
Since the out-of-bag data are not used to grow the tree, the accuracy of
the ensemble out-of-bag predictions approximates the generalization error
of the random forest. Generalization error refers to the error of a
random forest's predictions when it is applied to predict outcomes for
data that were not used to train it, i.e., testing data.
}

\section{Examples}{
\if{html}{\out{<div class="sourceCode r">}}\preformatted{library(aorsf)
library(magrittr) # for \%>\%
}\if{html}{\out{</div>}}

\code{orsf()} is the entry-point of the \code{aorsf} package. It can be used to
fit classification, regression, and survival forests.

For classification, we fit an oblique RF to predict penguin species
using \code{penguin} data from the magnificent \code{palmerpenguins} \href{https://allisonhorst.github.io/palmerpenguins/}{R package}:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{# An oblique classification RF
penguin_fit <- orsf(data = penguins_orsf,
                    n_tree = 5, 
                    formula = species ~ .)

penguin_fit
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## ---------- Oblique random classification forest
## 
##      Linear combinations: Accelerated Logistic regression
##           N observations: 333
##                N classes: 3
##                  N trees: 5
##       N predictors total: 7
##    N predictors per node: 3
##  Average leaves per tree: 6
## Min observations in leaf: 5
##           OOB stat value: 0.98
##            OOB stat type: AUC-ROC
##      Variable importance: anova
## 
## -----------------------------------------
}\if{html}{\out{</div>}}

For regression, we use the same data but predict bill length of
penguins:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{# An oblique regression RF
bill_fit <- orsf(data = penguins_orsf, 
                 n_tree = 5, 
                 formula = bill_length_mm ~ .)

bill_fit
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## ---------- Oblique random regression forest
## 
##      Linear combinations: Accelerated Linear regression
##           N observations: 333
##                  N trees: 5
##       N predictors total: 7
##    N predictors per node: 3
##  Average leaves per tree: 48.6
## Min observations in leaf: 5
##           OOB stat value: 0.73
##            OOB stat type: RSQ
##      Variable importance: anova
## 
## -----------------------------------------
}\if{html}{\out{</div>}}

My personal favorite is the oblique survival RF with accelerated Cox
regression because it was the first type of oblique RF that \code{aorsf}
provided (see \href{https://arxiv.org/abs/2208.01129}{arXiv paper}; the paper
is also published in \emph{Journal of Computational and Graphical Statistics}
but is not publicly available there). Here, we use it to predict
mortality risk following diagnosis of primary biliary cirrhosis:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{# An oblique survival RF
pbc_fit <- orsf(data = pbc_orsf, 
                n_tree = 5,
                formula = Surv(time, status) ~ . - id)

pbc_fit
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## ---------- Oblique random survival forest
## 
##      Linear combinations: Accelerated Cox regression
##           N observations: 276
##                 N events: 111
##                  N trees: 5
##       N predictors total: 17
##    N predictors per node: 5
##  Average leaves per tree: 21.2
## Min observations in leaf: 5
##       Min events in leaf: 1
##           OOB stat value: 0.79
##            OOB stat type: Harrell's C-index
##      Variable importance: anova
## 
## -----------------------------------------
}\if{html}{\out{</div>}}
\subsection{More than one way to grow a forest}{

You can use \code{orsf(no_fit = TRUE)} to make a \emph{specification} to grow a
forest instead of a fitted forest.

\if{html}{\out{<div class="sourceCode r">}}\preformatted{orsf_spec <- orsf(pbc_orsf, 
                  formula = time + status ~ . - id,
                  no_fit = TRUE)

orsf_spec
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## Untrained oblique random survival forest
## 
##      Linear combinations: Accelerated Cox regression
##           N observations: 276
##                 N events: 111
##                  N trees: 500
##       N predictors total: 17
##    N predictors per node: 5
##  Average leaves per tree: 0
## Min observations in leaf: 5
##       Min events in leaf: 1
##           OOB stat value: none
##            OOB stat type: Harrell's C-index
##      Variable importance: anova
## 
## -----------------------------------------
}\if{html}{\out{</div>}}

Why would you do this? Two reasons:
\enumerate{
\item For very computationally intensive tasks, you may want to check how long it will
take to fit the forest before you commit to it:
}

\if{html}{\out{<div class="sourceCode r">}}\preformatted{orsf_spec \%>\% 
 orsf_update(n_tree = 10000) \%>\%
 orsf_time_to_train()
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## Time difference of 2.529039 secs
}\if{html}{\out{</div>}}
\enumerate{
\item If fitting multiple forests, use the blueprint along with
\code{orsf_train()} and \code{orsf_update()} to simplify your code:
}

\if{html}{\out{<div class="sourceCode r">}}\preformatted{orsf_fit <- orsf_train(orsf_spec)
orsf_fit_10 <- orsf_update(orsf_fit, leaf_min_obs = 10)
orsf_fit_20 <- orsf_update(orsf_fit, leaf_min_obs = 20)

orsf_fit$leaf_min_obs
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## [1] 5
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode r">}}\preformatted{orsf_fit_10$leaf_min_obs
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## [1] 10
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode r">}}\preformatted{orsf_fit_20$leaf_min_obs
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## [1] 20
}\if{html}{\out{</div>}}
}

\subsection{tidymodels}{

\code{tidymodels} includes support for \code{aorsf} as a computational engine:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{library(tidymodels)
library(censored)
library(yardstick)

pbc_tidy <- pbc_orsf \%>\% 
 mutate(event_time = Surv(time, status), .before = 1) \%>\% 
 select(-c(id, time, status)) \%>\% 
 as_tibble()

split  <- initial_split(pbc_tidy)

orsf_spec <- rand_forest() \%>\% 
 set_engine("aorsf") \%>\% 
 set_mode("censored regression")

orsf_fit <- fit(orsf_spec, 
                formula = event_time ~ ., 
                data = training(split))
}\if{html}{\out{</div>}}

Prediction with \code{aorsf} models at different times is also supported:

\if{html}{\out{<div class="sourceCode r">}}\preformatted{time_points <- seq(500, 3000, by = 500)

test_pred <- augment(orsf_fit, 
                     new_data = testing(split), 
                     eval_time = time_points)

brier_scores <- test_pred \%>\% 
  brier_survival(truth = event_time, .pred)

brier_scores
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## # A tibble: 6 x 4
##   .metric        .estimator .eval_time .estimate
##   <chr>          <chr>           <dbl>     <dbl>
## 1 brier_survival standard          500    0.0396
## 2 brier_survival standard         1000    0.0685
## 3 brier_survival standard         1500    0.0893
## 4 brier_survival standard         2000    0.105 
## 5 brier_survival standard         2500    0.117 
## 6 brier_survival standard         3000    0.132
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode r">}}\preformatted{roc_scores <- test_pred \%>\% 
  roc_auc_survival(truth = event_time, .pred)

roc_scores
}\if{html}{\out{</div>}}

\if{html}{\out{<div class="sourceCode">}}\preformatted{## # A tibble: 6 x 4
##   .metric          .estimator .eval_time .estimate
##   <chr>            <chr>           <dbl>     <dbl>
## 1 roc_auc_survival standard          500     0.966
## 2 roc_auc_survival standard         1000     0.950
## 3 roc_auc_survival standard         1500     0.942
## 4 roc_auc_survival standard         2000     0.944
## 5 roc_auc_survival standard         2500     0.947
## 6 roc_auc_survival standard         3000     0.953
}\if{html}{\out{</div>}}
}
}

\references{
\enumerate{
\item Harrell FE, Califf RM, Pryor DB, Lee KL, Rosati RA (1982).
"Evaluating the yield of medical tests." \emph{JAMA}, \emph{247}(18), 2543-2546.
\item Breiman, Leo (2001). "Random Forests." \emph{Machine Learning}, \emph{45}(1),
5-32. ISSN 1573-0565.
\item Ishwaran H, Kogalur UB, Blackstone EH, Lauer MS (2008). "Random
survival forests." \emph{The Annals of Applied Statistics}, \emph{2}(3).
\item Menze BH, Kelm BM, Splitthoff DN, Koethe U,
Hamprecht FA (2011). "On oblique random forests." In \emph{Machine
Learning and Knowledge Discovery in Databases: European Conference,
ECML PKDD 2011, Athens, Greece, September 5-9, 2011, Proceedings, Part
II 22}, 453-469. Springer.
\item Jaeger BC, Long DL, Long DM, Sims M, Szychowski JM, Min Y, Mcclure LA,
Howard G, Simon N (2019). "Oblique random survival forests." \emph{The
Annals of Applied Statistics}, \emph{13}(3).
\item Jaeger BC, Welden S, Lenoir K, Speiser JL, Segar MW, Pandey A, Pajewski
NM (2023). "Accelerated and interpretable oblique random survival
forests." \emph{Journal of Computational and Graphical Statistics}, 1-16.
}
}
