% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textstat_dist.R, R/textstat_simil.R
\name{textstat_dist}
\alias{textstat_dist}
\alias{textstat_simil}
\title{Similarity and distance computation between documents or features}
\usage{
textstat_dist(x, selection = NULL, margin = c("documents", "features"),
  method = "euclidean", upper = FALSE, diag = FALSE, p = 2)

textstat_simil(x, selection = NULL, margin = c("documents", "features"),
  method = "correlation", upper = FALSE, diag = FALSE)
}
\arguments{
\item{x}{a \link{dfm} object}

\item{selection}{character vector of document names or feature labels from
\code{x}.  A \code{"dist"} object is returned if \code{selection} is
\code{NULL}; otherwise, a matrix is returned.}

\item{margin}{identifies the margin of the dfm on which similarity or 
distance will be computed:  \code{documents} for documents or 
\code{features} for word/term features.}

\item{method}{the similarity or distance measure to be used; see
Details}

\item{upper}{whether the upper triangle of the symmetric \eqn{V \times V} 
matrix is recorded}

\item{diag}{whether the diagonal of the distance matrix should be recorded}

\item{p}{The power of the Minkowski distance.}
}
\value{
\code{textstat_simil} and \code{textstat_dist} return \code{dist} class
objects if \code{selection} is \code{NULL}, and a matrix otherwise.
}
\description{
These functions compute matrices of distances and similarities between 
documents or features from a \code{\link{dfm}} and return a 
\code{\link[stats]{dist}} object (or a matrix if specific targets are
selected).  They are fast and robust because they operate directly on the sparse
\link{dfm} objects.
}
\details{
\code{textstat_dist} options are: \code{"euclidean"} (default), 
  \code{"Chisquared"}, \code{"Chisquared2"}, \code{"hamming"}, 
  \code{"kullback"}, \code{"manhattan"}, \code{"maximum"}, \code{"canberra"},
  and \code{"minkowski"}.

\code{textstat_simil} options are: \code{"correlation"} (default), 
  \code{"cosine"}, \code{"jaccard"}, \code{"eJaccard"}, \code{"dice"},
  \code{"eDice"}, \code{"simple matching"}, \code{"hamann"}, and 
  \code{"faith"}.
}
\note{
If you want to compute similarity on a "normalized" dfm object 
  (controlling for variable document lengths, for methods such as correlation
  for which different document lengths matter), then wrap the input dfm in 
  \code{\link{dfm_weight}(x, "relfreq")}.
}
\examples{
# create a dfm from inaugural addresses delivered after 1990
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1990), 
               remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
               
# distances for documents 
(d1 <- textstat_dist(presDfm, margin = "documents"))
as.matrix(d1)

# distances for specific documents
textstat_dist(presDfm, "2017-Trump", margin = "documents")
textstat_dist(presDfm, "2005-Bush", margin = "documents", method = "eJaccard")
(d2 <- textstat_dist(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents"))
as.list(d1)

# similarities for documents
(s1 <- textstat_simil(presDfm, method = "cosine", margin = "documents"))
as.matrix(s1)
as.list(s1)

# similarities for specific documents
textstat_simil(presDfm, "2017-Trump", margin = "documents")
textstat_simil(presDfm, "2017-Trump", method = "cosine", margin = "documents")
textstat_simil(presDfm, c("2009-Obama" , "2013-Obama"), margin = "documents")

# compute some term similarities
s2 <- textstat_simil(presDfm, c("fair", "health", "terror"), method = "cosine", 
                      margin = "features")
head(as.matrix(s2), 10)
as.list(s2, n = 8)

}
\references{
The \code{"Chisquared"} metric is from Legendre, P., & Gallagher,
  E. D. (2001).
  "\href{http://adn.biol.umontreal.ca/~numericalecology/Reprints/Legendre_&_Gallagher.pdf}{Ecologically
   meaningful transformations for ordination of species data}".
  \emph{Oecologia}, 129(2), 271–280. \url{https://doi.org/10.1007/s004420100716}
  
  The \code{"Chisquared2"} metric is the "Quadratic-Chi" measure from Pele,
  O., & Werman, M. (2010). 
  "\href{http://www.ariel.ac.il/sites/ofirpele/publications/ECCV2010.pdf}{The
  Quadratic-Chi Histogram Distance Family}". In \emph{Computer Vision – ECCV
  2010} (Vol. 6312, pp. 749–762). Berlin, Heidelberg: Springer.
  \url{https://doi.org/10.1007/978-3-642-15552-9_54}.
  
  \code{"hamming"} is \eqn{\sum{(x \neq y)}}.

  \code{"kullback"} is the Kullback-Leibler distance, which assumes that
  \eqn{P(x_i) = 0} implies \eqn{P(y_i) = 0}, and in case both \eqn{P(x_i)} and
  \eqn{P(y_i)} equal zero, then \eqn{P(x_i) \log(P(x_i)/P(y_i))} is
  assumed to be zero as the limit value.  The formula is:
   \deqn{\sum{P(x) \log(P(x)/P(y))}}
   
  All other measures are described in the \pkg{proxy} package.
}
\seealso{
\code{\link{textstat_dist}}, \code{\link{as.list.dist}},
  \code{\link{dist}}
}
\author{
Kenneth Benoit, Haiyan Wang
}
