% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/collocations2.R
\name{collocations2}
\alias{collocations2}
\title{Detect collocations from text}
\usage{
collocations2(x, method = c("lr", "chi2", "pmi", "dice"), features = "*",
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  min_count = 1, size = 2, ...)
}
\arguments{
\item{x}{a character, \link{corpus}, or \link{tokens} object}

\item{method}{association measure for detecting collocations.  Let \eqn{i} 
index documents and \eqn{j} index features; \eqn{n_{ij}} refers to the 
observed counts and \eqn{m_{ij}} to the expected counts in a collocations 
frequency table of dimensions \eqn{(J - size + 1)^2}. Available measures 
are computed as: \describe{ \item{\code{"lr"}}{The likelihood ratio 
statistic \eqn{G^2}, computed as: \deqn{2 \sum_i \sum_j ( n_{ij} \log 
\frac{n_{ij}}{m_{ij}} )} } \item{\code{"chi2"}}{Pearson's \eqn{\chi^2} 
statistic, computed as: \deqn{\sum_i \sum_j \frac{(n_{ij} - 
m_{ij})^2}{m_{ij}}} } \item{\code{"pmi"}}{point-wise mutual information 
score, computed as \eqn{\log \frac{n_{11}}{m_{11}}}} \item{\code{"dice"}}{the Dice 
coefficient, computed as \eqn{n_{11}/(n_{1.} + n_{.1})}} 
\item{\code{"all"}}{returns all of the above} }}

\item{features}{features to be selected for collocations}

\item{valuetype}{how to interpret keyword expressions: \code{"glob"} for 
"glob"-style wildcard expressions; \code{"regex"} for regular expressions;
or \code{"fixed"} for exact matching. See \link{valuetype} for details.}

\item{case_insensitive}{ignore the case when matching features if \code{TRUE}}

\item{min_count}{exclude collocations below this count}

\item{size}{length of the collocation.  Only bigram (\code{size = 2}) and trigram 
(\code{size = 3}) collocations are currently implemented.  Can be \code{c(2,3)}
(or \code{2:3}) to return both bi- and tri-gram collocations.}

\item{...}{additional parameters passed to \code{\link{tokens}}}
}
\value{
a collocations class object: a specially classed data.table consisting 
  of collocations, their frequencies, and the computed association measure(s).
}
\description{
Detects collocations from texts or a corpus, returning a data.frame of
collocations and their scores, sorted in descending order of the association
measure.  By default (\code{spanPunct = FALSE}), words separated by
punctuation delimiters are not counted as adjacent and hence are not eligible
to be collocations.
}
\references{
McInnes, B. T. 2004. "Extending the Log Likelihood Measure to 
  Improve Collocation Identification."  M.Sc. Thesis, University of 
  Minnesota.
}
\seealso{
\link{tokens_ngrams}
}
\author{
Kenneth Benoit
}
\keyword{collocations}
\keyword{internal}
