% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ml_classification_gbt_classifier.R,
%   R/ml_model_gradient_boosted_trees.R, R/ml_regression_gbt_regressor.R
\name{ml_gbt_classifier}
\alias{ml_gbt_classifier}
\alias{ml_gradient_boosted_trees}
\alias{ml_gbt_regressor}
\title{Spark ML -- Gradient Boosted Trees}
\usage{
ml_gbt_classifier(x, formula = NULL, max_iter = 20, max_depth = 5,
  step_size = 0.1, subsampling_rate = 1,
  feature_subset_strategy = "auto", min_instances_per_node = 1L,
  max_bins = 32, min_info_gain = 0, loss_type = "logistic",
  seed = NULL, thresholds = NULL, checkpoint_interval = 10,
  cache_node_ids = FALSE, max_memory_in_mb = 256,
  features_col = "features", label_col = "label",
  prediction_col = "prediction", probability_col = "probability",
  raw_prediction_col = "rawPrediction",
  uid = random_string("gbt_classifier_"), ...)

ml_gradient_boosted_trees(x, formula = NULL, type = c("auto",
  "regression", "classification"), features_col = "features",
  label_col = "label", prediction_col = "prediction",
  probability_col = "probability",
  raw_prediction_col = "rawPrediction", checkpoint_interval = 10,
  loss_type = c("auto", "logistic", "squared", "absolute"),
  max_bins = 32, max_depth = 5, max_iter = 20L, min_info_gain = 0,
  min_instances_per_node = 1, step_size = 0.1, subsampling_rate = 1,
  feature_subset_strategy = "auto", seed = NULL, thresholds = NULL,
  cache_node_ids = FALSE, max_memory_in_mb = 256,
  uid = random_string("gradient_boosted_trees_"), response = NULL,
  features = NULL, ...)

ml_gbt_regressor(x, formula = NULL, max_iter = 20, max_depth = 5,
  step_size = 0.1, subsampling_rate = 1,
  feature_subset_strategy = "auto", min_instances_per_node = 1,
  max_bins = 32, min_info_gain = 0, loss_type = "squared",
  seed = NULL, checkpoint_interval = 10, cache_node_ids = FALSE,
  max_memory_in_mb = 256, features_col = "features",
  label_col = "label", prediction_col = "prediction",
  uid = random_string("gbt_regressor_"), ...)
}
\arguments{
\item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.}

\item{formula}{Used when \code{x} is a \code{tbl_spark}. R formula as a character string or a formula. This is used to transform the input dataframe before fitting, see \link{ft_r_formula} for details.}

\item{max_iter}{Maximum number of iterations.}

\item{max_depth}{Maximum depth of the tree (>= 0); that is, the maximum
number of nodes separating any leaves from the root of the tree.}

\item{step_size}{Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator. (default = 0.1)}

\item{subsampling_rate}{Fraction of the training data used for learning each decision tree, in range (0, 1]. (default = 1.0)}

\item{feature_subset_strategy}{The number of features to consider for splits at each tree node. See details for options.}

\item{min_instances_per_node}{Minimum number of instances each child must
have after split.}

\item{max_bins}{The maximum number of bins used for discretizing
continuous features and for choosing how to split on features at
each node. More bins give higher granularity.}

\item{min_info_gain}{Minimum information gain for a split to be considered
at a tree node. Should be >= 0, defaults to 0.}

\item{loss_type}{Loss function which GBT tries to minimize. Supported: \code{"squared"} (L2) and \code{"absolute"} (L1) (default = squared) for regression and \code{"logistic"} (default) for classification. For \code{ml_gradient_boosted_trees}, setting \code{"auto"}
will default to the appropriate loss type based on model type.}

\item{seed}{Seed for random numbers.}

\item{thresholds}{Thresholds in multi-class classification to adjust the probability of predicting each class. Array must have length equal to the number of classes, with values > 0 excepting that at most one value may be 0. The class with largest value \code{p/t} is predicted, where \code{p} is the original probability of that class and \code{t} is the class's threshold.}

\item{checkpoint_interval}{Set checkpoint interval (>= 1) or disable checkpoint (-1).
E.g. 10 means that the cache will get checkpointed every 10 iterations, defaults to 10.}

\item{cache_node_ids}{If \code{FALSE}, the algorithm will pass trees to executors to match instances with nodes.
If \code{TRUE}, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
Defaults to \code{FALSE}.}

\item{max_memory_in_mb}{Maximum memory in MB allocated to histogram aggregation.
If too small, then 1 node will be split per iteration,
and its aggregates may exceed this size. Defaults to 256.}

\item{features_col}{Features column name, as a length-one character vector. The column should be a single vector column of numeric values. Usually this column is output by \code{\link{ft_r_formula}}.}

\item{label_col}{Label column name. The column should be a numeric column. Usually this column is output by \code{\link{ft_r_formula}}.}

\item{prediction_col}{Prediction column name.}

\item{probability_col}{Column name for predicted class conditional probabilities.}

\item{raw_prediction_col}{Raw prediction (a.k.a. confidence) column name.}

\item{uid}{A character string used to uniquely identify the ML estimator.}

\item{...}{Optional arguments; see Details.}

\item{type}{The type of model to fit. \code{"regression"} treats the response
as a continuous variable, while \code{"classification"} treats the response
as a categorical variable. When \code{"auto"} is used, the model type is
inferred based on the response variable type -- if it is a numeric type,
then regression is used; classification otherwise.}

\item{response}{(Deprecated) The name of the response column (as a length-one character vector.)}

\item{features}{(Deprecated) The name of features (terms) to use for the model fit.}
}
\value{
The object returned depends on the class of \code{x}.

\itemize{
  \item \code{spark_connection}: When \code{x} is a \code{spark_connection}, the function returns an instance of a \code{ml_predictor} object. The object contains a pointer to
  a Spark \code{Predictor} object and can be used to compose
  \code{Pipeline} objects.

  \item \code{ml_pipeline}: When \code{x} is a \code{ml_pipeline}, the function returns a \code{ml_pipeline} with
  the predictor appended to the pipeline.

  \item \code{tbl_spark}: When \code{x} is a \code{tbl_spark}, a predictor is constructed then
  immediately fit with the input \code{tbl_spark}, returning a prediction model.

  \item \code{tbl_spark}, with \code{formula} specified: When \code{formula}
    is specified, the input \code{tbl_spark} is first transformed using a
    \code{RFormula} transformer before being fit by
    the predictor. The object returned in this case is a \code{ml_model} which is a
    wrapper of a \code{ml_pipeline_model}.
}
}
\description{
Perform binary classification and regression using gradient boosted trees. Multiclass classification is not supported yet.
}
\details{
When \code{x} is a \code{tbl_spark} and \code{formula} (alternatively, \code{response} and \code{features}) is specified, the function returns a \code{ml_model} object wrapping a \code{ml_pipeline_model} which contains data pre-processing transformers, the ML predictor, and, for classification models, a post-processing transformer that converts predictions into class labels. For classification, an optional argument \code{predicted_label_col} (defaults to \code{"predicted_label"}) can be used to specify the name of the predicted label column. In addition to the fitted \code{ml_pipeline_model}, \code{ml_model} objects also contain a \code{ml_pipeline} object where the ML predictor stage is an estimator ready to be fit against data. This is utilized by \code{\link{ml_save}} with \code{type = "pipeline"} to facilitate model refresh workflows.

The supported options for \code{feature_subset_strategy} are
  \itemize{
    \item \code{"auto"}: Choose automatically for task: If \code{num_trees == 1}, set to \code{"all"}. If \code{num_trees > 1} (forest), set to \code{"sqrt"} for classification and to \code{"onethird"} for regression.
    \item \code{"all"}: use all features
    \item \code{"onethird"}: use 1/3 of the features
    \item \code{"sqrt"}: use sqrt(number of features)
    \item \code{"log2"}: use log2(number of features)
    \item \code{"n"}: when \code{n} is in the range (0, 1.0], use n * number of features. When \code{n} is in the range (1, number of features), use \code{n} features. (default = \code{"auto"})
    }

\code{ml_gradient_boosted_trees} is a wrapper around \code{ml_gbt_regressor.tbl_spark} and \code{ml_gbt_classifier.tbl_spark} and calls the appropriate method based on model type.
}
\examples{
\dontrun{
sc <- spark_connect(master = "local")
iris_tbl <- sdf_copy_to(sc, iris, name = "iris_tbl", overwrite = TRUE)

partitions <- iris_tbl \%>\%
  sdf_partition(training = 0.7, test = 0.3, seed = 1111)

iris_training <- partitions$training
iris_test <- partitions$test

gbt_model <- iris_training \%>\%
  ml_gradient_boosted_trees(Sepal_Length ~ Petal_Length + Petal_Width)

pred <- sdf_predict(iris_test, gbt_model)

ml_regression_evaluator(pred, label_col = "Sepal_Length")
}

}
\seealso{
See \url{http://spark.apache.org/docs/latest/ml-classification-regression.html} for
  more information on the set of supervised learning algorithms.

Other ml algorithms: \code{\link{ml_aft_survival_regression}},
  \code{\link{ml_decision_tree_classifier}},
  \code{\link{ml_generalized_linear_regression}},
  \code{\link{ml_isotonic_regression}},
  \code{\link{ml_linear_regression}},
  \code{\link{ml_linear_svc}},
  \code{\link{ml_logistic_regression}},
  \code{\link{ml_multilayer_perceptron_classifier}},
  \code{\link{ml_naive_bayes}},
  \code{\link{ml_one_vs_rest}},
  \code{\link{ml_random_forest_classifier}}
}
\concept{ml algorithms}
