% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textcleaner.R
\name{textcleaner}
\alias{textcleaner}
\title{Text Cleaner}
\usage{
textcleaner(
  data = NULL,
  miss = 99,
  partBY = c("row", "col"),
  dictionary = NULL,
  spelling = c("UK", "US"),
  add.path = NULL,
  keepStrings = FALSE,
  allowPunctuations = c("-", "all"),
  allowNumbers = FALSE,
  lowercase = TRUE,
  continue = NULL
)
}
\arguments{
\item{data}{Matrix or data frame.
A dataset of text data.
Participant IDs will be automatically identified if they are included.
If no IDs are provided, then their order in the corresponding
row (or column) is used. A message will notify the user how IDs were assigned}

\item{miss}{Numeric or character.
Value for missing data.
Defaults to \code{99}}

\item{partBY}{Character.
Are participants by row or column?
Set to \code{"row"} for by row.
Set to \code{"col"} for by column}

\item{dictionary}{Character vector.
Can be a vector of a corpus or any text for comparison.
Dictionary to be used for more efficient text cleaning.
Defaults to \code{NULL}, which will use \code{\link[SemNetDictionaries]{general.dictionary}}

Use \code{dictionaries()} or \code{find.dictionaries()} for more options
(See \code{\link{SemNetDictionaries}} for more details)}

\item{spelling}{Character vector.
English spelling to be used.
\itemize{

\item{\code{"UK"}}
{For British spelling (e.g., colour, grey, programme, theatre)}

\item{\code{"US"}}
{For American spelling (e.g., color, gray, program, theater)}

}}

\item{add.path}{Character.
Path to additional dictionaries to be found.
DOES NOT search recursively (through all folders in path)
to avoid time intensive search.
Set to \code{"choose"} to open an interactive directory explorer}

\item{keepStrings}{Boolean.
Should strings be retained or separated?
Defaults to \code{FALSE}.
Set to \code{TRUE} to retain strings as strings}

\item{allowPunctuations}{Character vector.
Allows punctuation characters to be included in responses.
Defaults to \code{"-"}.
Set to \code{"all"} to keep all punctuation characters}

\item{allowNumbers}{Boolean.
Defaults to \code{FALSE}.
Set to \code{TRUE} to keep numbers in text}

\item{lowercase}{Boolean.
Should words be converted to lowercase?
Defaults to \code{TRUE}.
Set to \code{FALSE} to keep words as they are}

\item{continue}{List.
A result previously unfinished that still needs to be completed.
Allows you to continue to manually spell-check your data
after you've closed or errored out.
Defaults to \code{NULL}}
}
\value{
This function returns a list containing the following objects:

\item{binary}{A matrix of responses where each row represents a participant
and each column represents a unique response. A response that a participant has provided is a '\code{1}'
and a response that a participant has not provided is a '\code{0}'}

\item{responses}{A list containing two objects:

\itemize{

\item{\code{clean}}
{A response matrix that has been spell-checked and de-pluralized with duplicates removed.
This can be used as a final dataset for analyses (e.g., fluency of responses)}

\item{\code{original}}
{The original response matrix with white spaces before and
after each response removed. All upper-case letters are also converted to lower case}

}

}

\item{spellcheck}{A list containing two objects:

\itemize{

\item{\code{full}}
{All responses regardless of spell-checking changes}

\item{\code{auto}}
{Only the incorrect responses that were changed during spell-check}

}

}

\item{removed}{A list containing two objects: 

\itemize{

\item{\code{rows}}
{Identifies removed participants by their row (or column) location in the original data file}

\item{\code{ids}}
{Identifies removed participants by their ID (see argument \code{data})}

}

}

\item{partChanges}{A list where each participant is a list index with each
response that has been changed. Participants are identified by their ID (see argument \code{data}).
This can be used to replicate the cleaning process and to keep track of changes more generally.
Participants with \code{NA} did not have any changes from their original data
and participants with missing data are removed (see \code{removed$ids})}
}
\description{
An automated cleaning function for spell-checking, de-pluralizing,
removing duplicates, and binarizing text data
}
\examples{
# Toy example
raw <- open.animals[c(1:10),-c(1:3)]

if(interactive())
{
    #Full test
    clean <- textcleaner(open.animals[,-c(1,2)], partBY = "row", dictionary = "animals")
}

}
\references{
Hornik, K., & Murdoch, D. (2011).
Watch your spelling!
\emph{The R Journal}, \emph{3}, 22-28.
}
\author{
Alexander Christensen <alexpaulchristensen@gmail.com>
}
