% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/task.R
\name{Task}
\alias{Task}
\title{Creating and evaluating tasks}
\description{
Evaluation \code{Task}s provide a flexible data structure for evaluating LLM-based
tools.
\enumerate{
\item \strong{Datasets} contain a set of labelled samples. Datasets are just a
tibble with columns \code{input} and \code{target}, where \code{input} is a prompt
and \code{target} is either literal value(s) or grading guidance.
\item \strong{Solvers} evaluate the \code{input} in the dataset and produce a final result.
\item \strong{Scorers} evaluate the final output of solvers. They may use text
comparisons (like \code{\link[=detect_match]{detect_match()}}), model grading (like
\code{\link[=model_graded_qa]{model_graded_qa()}}), or other custom schemes.
}

\strong{The usual flow of LLM evaluation with Tasks calls \verb{$new()} and then \verb{$eval()}.}
\verb{$eval()} just calls \verb{$solve()}, \verb{$score()}, \verb{$measure()}, \verb{$log()},
and \verb{$view()} in order. The remaining methods are generally only
recommended for expert use.
}
\examples{
if (!identical(Sys.getenv("ANTHROPIC_API_KEY"), "")) {
  # set the log directory to a temporary directory
  withr::local_envvar(VITALS_LOG_DIR = withr::local_tempdir())

  library(ellmer)
  library(tibble)

  simple_addition <- tibble(
    input = c("What's 2+2?", "What's 2+3?"),
    target = c("4", "5")
  )

  # create a new Task
  tsk <- Task$new(
    dataset = simple_addition,
    solver = generate(chat_anthropic(model = "claude-3-7-sonnet-latest")),
    scorer = model_graded_qa()
  )

  # evaluate the task (runs the solver and scorer) and open
  # the results in the Inspect log viewer (if interactive)
  tsk$eval()

  # $eval() is shorthand for:
  tsk$solve()
  tsk$score()
  tsk$measure()
  tsk$log()
  tsk$view()

  # get the evaluation results as a data frame
  tsk$get_samples()

  # view the task directory with $view() or vitals_view()
  vitals_view()
}

}
\seealso{
\code{\link[=generate]{generate()}} for the simplest possible solver, and
\link{scorer_model} and \link{scorer_detect} for two built-in approaches to
scoring.
}
\section{Public fields}{
\if{html}{\out{<div class="r6-fields">}}
\describe{
\item{\code{dir}}{The directory where evaluation logs will be written to. Defaults
to \code{vitals_log_dir()}.}

\item{\code{metrics}}{A named vector of metric values resulting from \verb{$measure()}
(called inside of \verb{$eval()}). Will be \code{NULL} if metrics have yet to
be applied.}
}
\if{html}{\out{</div>}}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-Task-new}{\code{Task$new()}}
\item \href{#method-Task-eval}{\code{Task$eval()}}
\item \href{#method-Task-get_samples}{\code{Task$get_samples()}}
\item \href{#method-Task-solve}{\code{Task$solve()}}
\item \href{#method-Task-score}{\code{Task$score()}}
\item \href{#method-Task-measure}{\code{Task$measure()}}
\item \href{#method-Task-log}{\code{Task$log()}}
\item \href{#method-Task-view}{\code{Task$view()}}
\item \href{#method-Task-set_solver}{\code{Task$set_solver()}}
\item \href{#method-Task-set_scorer}{\code{Task$set_scorer()}}
\item \href{#method-Task-set_metrics}{\code{Task$set_metrics()}}
\item \href{#method-Task-get_cost}{\code{Task$get_cost()}}
\item \href{#method-Task-clone}{\code{Task$clone()}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-new"></a>}}
\if{latex}{\out{\hypertarget{method-Task-new}{}}}
\subsection{Method \code{new()}}{
The typical flow of LLM evaluation with vitals tends to involve first
calling this method and then \verb{$eval()} on the resulting object.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$new(
  dataset,
  solver,
  scorer,
  metrics = NULL,
  epochs = NULL,
  name = deparse(substitute(dataset)),
  dir = vitals_log_dir()
)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{dataset}}{A tibble with, minimally, columns \code{input} and \code{target}.}

\item{\code{solver}}{A function that takes a vector of inputs from the
dataset's \code{input} column as its first argument and determines values
approximating \code{dataset$target}. Its return value must be a list with
the following elements:
\itemize{
\item \code{result} - A character vector of the final responses, with the same length
as \code{dataset$input}.
\item \code{solver_chat} - A list of ellmer Chat objects that were used to solve
each input, also with the same length as \code{dataset$input}.
}

Additional output elements can be included in a slot \code{solver_metadata} that
has the same length as \code{dataset$input}, which will be logged in
\code{solver_metadata}.

Additional arguments can be passed to the solver via \verb{$solve(...)}
or \verb{$eval(...)}. See the definition of \code{\link[=generate]{generate()}} for a function that
outputs a valid solver that just passes inputs to ellmer Chat objects'
\verb{$chat()} method in parallel.}

\item{\code{scorer}}{A function that evaluates how well the solver's return value
approximates the corresponding elements of \code{dataset$target}. The function
should take in the \verb{$get_samples()} slot of a Task object and return a list with
the following elements:
\itemize{
\item \code{score} - A vector of scores with length equal to \code{nrow(samples)}.
Built-in scorers return ordered factors with
levels \code{I} < \code{P} (optionally) < \code{C} (standing for "Incorrect", "Partially
Correct", and "Correct"). If your scorer returns this output type, the
package will automatically calculate metrics.
}

Optionally:
\itemize{
\item \code{scorer_chat} - If your scorer makes use of ellmer, also include a list of
ellmer Chat objects that were used to score each result, also with
length \code{nrow(samples)}.
\item \code{scorer_metadata} - Any intermediate results or other values that you'd
like to be stored in the persistent log. This should also have length
equal to \code{nrow(samples)}.
}

Scorers will probably make use of \code{samples$input}, \code{samples$target}, and
\code{samples$result} specifically. See \link[=scorer_model]{model-based scoring}
for examples.}

\item{\code{metrics}}{A named list of functions that take in a vector of scores
(as in \code{task$get_samples()$score}) and output a single numeric value.}

\item{\code{epochs}}{The number of times to repeat each sample. Evaluate each sample
multiple times to better quantify variation. Optional, defaults to \code{1L}.
The value of \code{epochs} supplied to \verb{$eval()} or \verb{$solve()} will take
precedence over the value in \verb{$new()}.}

\item{\code{name}}{A name for the evaluation task. Defaults to
\code{deparse(substitute(dataset))}.}

\item{\code{dir}}{Directory where logs should be stored.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
A new Task object.
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-eval"></a>}}
\if{latex}{\out{\hypertarget{method-Task-eval}{}}}
\subsection{Method \code{eval()}}{
Evaluates the task by running the solver and scorer, applying metrics,
logging results, and viewing (if interactive). This method works by calling
\verb{$solve()}, \verb{$score()}, \verb{$measure()}, \verb{$log()}, and \verb{$view()} in sequence.

The typical flow of LLM evaluation with vitals tends to involve first
calling \verb{$new()} and then this method on the resulting object.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$eval(..., epochs = NULL, view = interactive())}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{...}}{Additional arguments passed to the solver and scorer functions.}

\item{\code{epochs}}{The number of times to repeat each sample. Evaluate each sample
multiple times to better quantify variation. Optional, defaults to \code{1L}.
The value of \code{epochs} supplied to \verb{$eval()} or \verb{$solve()} will take
precedence over the value in \verb{$new()}.}

\item{\code{view}}{Automatically open the viewer after evaluation (defaults to
TRUE if interactive, FALSE otherwise).}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-get_samples"></a>}}
\if{latex}{\out{\hypertarget{method-Task-get_samples}{}}}
\subsection{Method \code{get_samples()}}{
The task's samples represent the evaluation in a data frame format.

\code{\link[=vitals_bind]{vitals_bind()}} row-binds the output of this
function called across several tasks.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$get_samples()}\if{html}{\out{</div>}}
}

\subsection{Returns}{
A tibble representing the evaluation. Based on the \code{dataset},
\code{epochs} may duplicate rows, and the solver and scorer will append
columns to this data.
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-solve"></a>}}
\if{latex}{\out{\hypertarget{method-Task-solve}{}}}
\subsection{Method \code{solve()}}{
Solve the task by running the solver
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$solve(..., epochs = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{...}}{Additional arguments passed to the solver function.}

\item{\code{epochs}}{The number of times to repeat each sample. Evaluate each sample
multiple times to better quantify variation. Optional, defaults to \code{1L}.
The value of \code{epochs} supplied to \verb{$eval()} or \verb{$solve()} will take
precedence over the value in \verb{$new()}.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-score"></a>}}
\if{latex}{\out{\hypertarget{method-Task-score}{}}}
\subsection{Method \code{score()}}{
Score the task by running the scorer and then applying metrics to
its results.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$score(...)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{...}}{Additional arguments passed to the scorer function.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-measure"></a>}}
\if{latex}{\out{\hypertarget{method-Task-measure}{}}}
\subsection{Method \code{measure()}}{
Applies metrics to a scored Task.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$measure()}\if{html}{\out{</div>}}
}

\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-log"></a>}}
\if{latex}{\out{\hypertarget{method-Task-log}{}}}
\subsection{Method \code{log()}}{
Log the task to a directory.

Note that, if a \code{VITALS_LOG_DIR} envvar is set, this will happen
automatically in \verb{$eval()}.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$log(dir = vitals_log_dir())}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{dir}}{The directory to write the log to.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The path to the logged file, invisibly.
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-view"></a>}}
\if{latex}{\out{\hypertarget{method-Task-view}{}}}
\subsection{Method \code{view()}}{
View the task results in the Inspect log viewer
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$view()}\if{html}{\out{</div>}}
}

\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-set_solver"></a>}}
\if{latex}{\out{\hypertarget{method-Task-set_solver}{}}}
\subsection{Method \code{set_solver()}}{
Set the solver function
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$set_solver(solver)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{solver}}{A function that takes a vector of inputs from the
dataset's \code{input} column as its first argument and determines values
approximating \code{dataset$target}. Its return value must be a list with
the following elements:
\itemize{
\item \code{result} - A character vector of the final responses, with the same length
as \code{dataset$input}.
\item \code{solver_chat} - A list of ellmer Chat objects that were used to solve
each input, also with the same length as \code{dataset$input}.
}

Additional output elements can be included in a slot \code{solver_metadata} that
has the same length as \code{dataset$input}, which will be logged in
\code{solver_metadata}.

Additional arguments can be passed to the solver via \verb{$solve(...)}
or \verb{$eval(...)}. See the definition of \code{\link[=generate]{generate()}} for a function that
outputs a valid solver that just passes inputs to ellmer Chat objects'
\verb{$chat()} method in parallel.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-set_scorer"></a>}}
\if{latex}{\out{\hypertarget{method-Task-set_scorer}{}}}
\subsection{Method \code{set_scorer()}}{
Set the scorer function
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$set_scorer(scorer)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{scorer}}{A function that evaluates how well the solver's return value
approximates the corresponding elements of \code{dataset$target}. The function
should take in the \verb{$get_samples()} slot of a Task object and return a list with
the following elements:
\itemize{
\item \code{score} - A vector of scores with length equal to \code{nrow(samples)}.
Built-in scorers return ordered factors with
levels \code{I} < \code{P} (optionally) < \code{C} (standing for "Incorrect", "Partially
Correct", and "Correct"). If your scorer returns this output type, the
package will automatically calculate metrics.
}

Optionally:
\itemize{
\item \code{scorer_chat} - If your scorer makes use of ellmer, also include a list of
ellmer Chat objects that were used to score each result, also with
length \code{nrow(samples)}.
\item \code{scorer_metadata} - Any intermediate results or other values that you'd
like to be stored in the persistent log. This should also have length
equal to \code{nrow(samples)}.
}

Scorers will probably make use of \code{samples$input}, \code{samples$target}, and
\code{samples$result} specifically. See \link[=scorer_model]{model-based scoring}
for examples.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-set_metrics"></a>}}
\if{latex}{\out{\hypertarget{method-Task-set_metrics}{}}}
\subsection{Method \code{set_metrics()}}{
Set the metrics that will be applied in \verb{$measure()} (and thus \verb{$eval()}).
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$set_metrics(metrics)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{metrics}}{A named list of functions that take in a vector of scores
(as in \code{task$get_samples()$score}) and output a single numeric value.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
The Task object (invisibly)
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-get_cost"></a>}}
\if{latex}{\out{\hypertarget{method-Task-get_cost}{}}}
\subsection{Method \code{get_cost()}}{
The cost of this eval.

This is a wrapper around ellmer's \verb{$token_usage()} function.
That function is called at the beginning and end of each call to
\verb{$solve()} and \verb{$score()}; this function returns the cost inferred
by taking the differences in values of \verb{$token_usage()} over time.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$get_cost()}\if{html}{\out{</div>}}
}

\subsection{Returns}{
A tibble displaying the cost of solving and scoring the
evaluation by model, separately for the solver and scorer.
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-Task-clone"></a>}}
\if{latex}{\out{\hypertarget{method-Task-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{Task$clone(deep = FALSE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}
}
}
}
