% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/dfm-main.R, R/dfm-methods.R
\name{dfm}
\alias{as.dfm}
\alias{dfm}
\alias{dfm.character}
\alias{dfm.corpus}
\alias{dfm.tokenizedTexts}
\alias{is.dfm}
\title{create a document-feature matrix}
\usage{
dfm(x, ...)

\method{dfm}{character}(x, verbose = TRUE, toLower = TRUE,
  removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE,
  removeTwitter = FALSE, stem = FALSE, ignoredFeatures = NULL,
  keptFeatures = NULL, matrixType = c("sparse", "dense"),
  language = "english", thesaurus = NULL, dictionary = NULL,
  valuetype = c("glob", "regex", "fixed"), dictionary_regex = FALSE, ...)

\method{dfm}{tokenizedTexts}(x, verbose = TRUE, toLower = TRUE,
  stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,
  matrixType = c("sparse", "dense"), language = "english",
  thesaurus = NULL, dictionary = NULL, valuetype = c("glob", "regex",
  "fixed"), dictionary_regex = FALSE, ...)

\method{dfm}{corpus}(x, verbose = TRUE, groups = NULL, ...)

is.dfm(x)

as.dfm(x)
}
\arguments{
\item{x}{corpus, character vector, or tokenizedTexts object from which to
generate the document-feature matrix}

\item{...}{additional arguments passed to \link{tokenize}, which can include for
instance \code{ngrams} and \code{concatenator} for tokenizing multi-token
sequences}

\item{verbose}{display messages if \code{TRUE}}

\item{toLower}{convert texts to lowercase}

\item{removeNumbers}{remove numbers, see \link{tokenize}}

\item{removePunct}{remove punctuation, see \link{tokenize}}

\item{removeSeparators}{remove separators (whitespace), see \link{tokenize}}

\item{removeTwitter}{if \code{FALSE}, preserve \code{#} and \code{@}
characters, see \link{tokenize}}

\item{stem}{if \code{TRUE}, stem words}

\item{ignoredFeatures}{a character vector of user-supplied features to
ignore, such as "stop words".  You may supply any list you wish; to access one
possible list, use \code{\link{stopwords}()}.  The
pattern matching type will be set by \code{valuetype}.  For behaviour of
\code{ignoredFeatures} with \code{ngrams > 1}, see Details.}

\item{keptFeatures}{a user-supplied regular expression defining which features
to keep, while excluding all others.  This can be used in lieu of a
dictionary if there are only specific features that a user wishes to keep.
To extract only Twitter usernames, for example, set \code{keptFeatures =
"@*"} and make sure that \code{removeTwitter = FALSE} as an additional
argument passed to \link{tokenize}.  Note: \code{keptFeatures =
"^@\\\w+\\\b"} would be the regular expression version of this matching
pattern.  The pattern matching type will be set by \code{valuetype}.}

\item{matrixType}{deprecated.  Formerly produced a dense matrix if set to
\code{"dense"}, but this was removed in 0.8.2.  All dfm objects are now created as a sparse
matrix of class \code{dgCMatrix} from the \pkg{\link{Matrix}} package.}

\item{language}{Language for stemming.  Choices are
\code{danish}, \code{dutch}, \code{english}, \code{finnish}, \code{french},
\code{german}, \code{hungarian}, \code{italian}, \code{norwegian},
\code{porter}, \code{portuguese}, \code{romanian}, \code{russian},
\code{spanish}, \code{swedish}, \code{turkish}.}

\item{thesaurus}{A list of character vector "thesaurus" entries, in a
dictionary list format, which can also include regular expressions if
\code{dictionary_regex} is \code{TRUE} (see examples).  Note that unlike
dictionaries, each entry in a thesaurus key must be unique, otherwise only
the first match in the list will be used.  Thesaurus keys are converted to
upper case to create a feature label in the dfm, as a reminder that this
was not a type found in the text, but rather the label of a thesaurus key.}

\item{dictionary}{A list of character vector dictionary entries, including
regular expressions (see examples)}

\item{valuetype}{\code{"fixed"} for words as is; \code{"regex"} for regular
expressions; or \code{"glob"} for "glob"-style wildcards.  Glob format is
the default.  See \code{\link{selectFeatures}}.}

\item{dictionary_regex}{\code{TRUE} means the dictionary is already in
regular expression format, otherwise it will be converted from the "glob"
format.  This is a legacy argument that will soon be phased out in favour
of \code{valuetype}.}

\item{groups}{character vector containing the names of document variables for
aggregating documents}
}
\value{
A \link{dfm-class} object containing a sparse matrix representation
  of the counts of features by document, along with associated settings and
  metadata.
}
\description{
Create a sparse document-feature matrix from a corpus or a vector of
texts.  The sparse matrix construction uses the \pkg{Matrix} package, and is
both much faster and much more memory efficient than the corresponding dense
(regular \code{matrix}) representation.  For details on the structure of the
dfm class, see \link{dfm-class}.
}
\details{
The default behavior for \code{ignoredFeatures} when constructing
  ngrams using \code{dfm(x, ngrams = n)} with \code{n > 1} is to remove any ngram that
  contains any item in \code{ignoredFeatures}.  If you wish to remove these before
  constructing ngrams, you will need to first tokenize the texts, then remove the
  features to be ignored, then form the ngrams, and finally construct the dfm using
  this modified tokenization object.  See the code examples for an illustration.

\code{is.dfm} returns \code{TRUE} if and only if its argument is a \link{dfm}.

\code{as.dfm} coerces a matrix or data.frame to a dfm.
}
\examples{
# Why dense matrix dfm objects were phased out: compare the size of the
# sparse dfm (size1) with the size of its dense as.matrix() form (size2)
(size1 <- object.size(dfm(inaugTexts, verbose = FALSE)))
(size2 <- object.size(as.matrix(dfm(inaugTexts, verbose = FALSE))))
cat("Compacted by ", round(as.numeric((1-size1/size2)*100), 1), "\%.\\n", sep="")

# For a corpus: documents of inaugCorpus with Year > 1980.
# The second call sets toLower = FALSE, so texts are not converted to lowercase.
mydfm <- dfm(subset(inaugCorpus, Year>1980))
mydfm <- dfm(subset(inaugCorpus, Year>1980), toLower=FALSE)

# Grouping documents by a document variable of the corpus (here "President")
mydfmGrouped <- dfm(subset(inaugCorpus, Year>1980), groups = "President")

# Ignoring English stopwords and applying stemming
dfmsInaug2 <- dfm(subset(inaugCorpus, Year>1980),
                  ignoredFeatures=stopwords("english"), stem=TRUE)
# Stemming also works for both words in ngrams (here ngrams = 2)
dfm("Banking industry", stem = TRUE, ngrams = 2, verbose = FALSE)

# With a dictionary, supplied as a named list of character vectors of entries
mycorpus <- subset(inaugCorpus, Year>1900)
mydict <- list(christmas=c("Christmas", "Santa", "holiday"),
               opposition=c("Opposition", "reject", "notincorpus"),
               taxing="taxing",
               taxation="taxation",
               taxregex="tax*",
               country="united states")
dictDfm <- dfm(mycorpus, dictionary=mydict)
dictDfm

# With the thesaurus feature.  The texts are passed through phrasetotoken(),
# and the thesaurus entries have their whitespace replaced by "_" via gsub()
# (presumably so that they match the tokens phrasetotoken() produces).
mytexts <- c("The new law included a capital gains tax, and an inheritance tax.",
             "New York City has raised a taxes: an income tax and a sales tax.")
mydict <- dictionary(list(tax=c("tax", "income tax", "capital gains tax", "inheritance tax")))
dfm(phrasetotoken(mytexts, mydict), thesaurus = lapply(mydict, function(x) gsub("\\\\s", "_", x)))
# pick up "taxes" with "tax" as a regex
dfm(phrasetotoken(mytexts, mydict), thesaurus = list(anytax = "tax"), valuetype = "regex")

# Removing stopwords
testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
             the newspaper from a boy named Seamus, in his mouth."
testCorpus <- corpus(testText)
# note: "also" is not in the default stopwords("english")
features(dfm(testCorpus, ignoredFeatures = stopwords("english")))
# For ngrams: by default any ngram containing an ignored feature is removed
# (see Details)
features(dfm(testCorpus, ngrams = 2, ignoredFeatures = stopwords("english")))
features(dfm(testCorpus, ngrams = 1:2, ignoredFeatures = stopwords("english")))

## Removing stopwords before constructing ngrams:
## tokenize, remove the stopwords, form the ngrams, then build the dfm
tokensAll <- tokenize(toLower(testText), removePunct = TRUE)
tokensNoStopwords <- removeFeatures(tokensAll, stopwords("english"))
tokensNgramsNoStopwords <- ngrams(tokensNoStopwords, 2)
features(dfm(tokensNgramsNoStopwords, ngrams = 1:2))

# Keep only certain words, first with the default glob pattern type and then
# with valuetype = "regex"
dfm(testCorpus, keptFeatures = "*s", verbose = FALSE)  # keep only words ending in "s"
dfm(testCorpus, keptFeatures = "s$", valuetype = "regex", verbose = FALSE)

# Testing Twitter functions: removeTwitter = FALSE preserves the # and @ characters
testTweets <- c("My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
                "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
                "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber")
dfm(testTweets, keptFeatures = "#*", removeTwitter = FALSE)  # keep only hashtags
# the same selection written with valuetype = "regex"
dfm(testTweets, keptFeatures = "^#.*$", valuetype = "regex", removeTwitter = FALSE)
}
\author{
Kenneth Benoit
}

