% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/FuzzyTokenSet.R
\name{FuzzyTokenSet}
\alias{FuzzyTokenSet}
\title{Fuzzy Token Set Comparator}
\usage{
FuzzyTokenSet(
  inner_comparator = Levenshtein(normalize = TRUE),
  agg_function = base::mean,
  deletion = 1,
  insertion = 1,
  substitution = 1
)
}
\arguments{
\item{inner_comparator}{inner string distance comparator of class
\code{\linkS4class{StringComparator}}. Defaults to normalized \code{\link{Levenshtein}}
distance.}

\item{agg_function}{function used to aggregate the costs of the optimal
operations. Defaults to \code{\link[base:mean]{base::mean}}.}

\item{deletion}{non-negative weight associated with deletion of a token.
Defaults to 1.}

\item{insertion}{non-negative weight associated with insertion of a token.
Defaults to 1.}

\item{substitution}{non-negative weight associated with substitution of a
token. Defaults to 1.}
}
\description{
Compares a pair of token sets \eqn{x} and \eqn{y} by computing the
optimal cost of transforming \eqn{x} into \eqn{y} using single-token
operations (insertions, deletions and substitutions). The cost of
single-token operations is determined at the character level using an
internal string comparator.
}
\details{
A token set is an unordered enumeration of tokens, which may include
duplicates. Given two token sets \eqn{x} and \eqn{y}, this comparator
computes the optimal cost of transforming \eqn{x} into \eqn{y} using the
following single-token operations:
\itemize{
\item deleting a token \eqn{a} from \eqn{x} at cost \eqn{w_d \times \mathrm{inner}(a, "")}{w_d * inner(a, "")}
\item inserting a token \eqn{b} in \eqn{y} at cost \eqn{w_i \times \mathrm{inner}("", b)}{w_i * inner("", b)}
\item substituting a token \eqn{a} in \eqn{x} for a token \eqn{b}
in \eqn{y} at cost \eqn{w_s \times \mathrm{inner}(a, b)}{w_s * inner(a, b)}
}

where \eqn{\mathrm{inner}}{inner} is an internal string comparator and
\eqn{w_d, w_i, w_s} are non-negative weights, referred to as \code{deletion},
\code{insertion} and \code{substitution} in the parameter list. By default, the
\emph{mean} cost of the optimal set of operations is returned. Other methods of
aggregating the costs are supported by specifying a non-default
\code{agg_function}.

If the internal string comparator is a \emph{distance} function, then the optimal
set of operations \emph{minimizes} the cost. Otherwise, the optimal set of
operations \emph{maximizes} the cost. The optimization problem is solved exactly
using a linear sum assignment solver.
}
\note{
This comparator is qualitatively similar to the \code{\link{MongeElkan}}
comparator; however, it is arguably more principled, since it is formulated
as a cost optimization problem. It also offers more control over the costs
of missing tokens (by varying the \code{deletion} and \code{insertion} weights).
This is useful for comparing full names, when dropping a name (e.g.
middle name) shouldn't be severely penalized.
}
\examples{
## Compare names with heterogeneous representations
x <- "The University of California - San Diego"
y <- "Univ. Calif. San Diego"
# Tokenize strings on white space
x <- strsplit(x, '\\\\s+')
y <- strsplit(y, '\\\\s+')
FuzzyTokenSet()(x, y)
# Reduce the cost associated with missing words
FuzzyTokenSet(deletion = 0.5, insertion = 0.5)(x, y)

## Compare full name with abbreviated name, reducing the penalty 
## for dropping parts of the name
fullname <- "JOSE ELIAS TEJADA BASQUES"
name <- "JOSE BASQUES"
# Tokenize strings on white space
fullname <- strsplit(fullname, '\\\\s+')
name <- strsplit(name, '\\\\s+')
comparator <- FuzzyTokenSet(deletion = 0.5)
comparator(fullname, name) < comparator(name, fullname) # TRUE

}
