% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utils-perm.R
\name{rancors_builder}
\alias{rancors_builder}
\title{Build Multiple Random Corpora}
\usage{
rancors_builder(
  data,
  vocab,
  probs,
  n_cors,
  n_docs,
  len_mean,
  len_var,
  len_min,
  len_max,
  seed = NULL
)
}
\arguments{
\item{data}{Data.frame containing vocabulary and probabilities}

\item{vocab}{Name of the column containing vocabulary}

\item{probs}{Name of the column containing probabilities}

\item{n_cors}{Integer indicating the number of corpora to build}

\item{n_docs}{Integer(s) indicating the number of documents to be returned.
If two numbers are provided, a number will be randomly sampled
within the range for each corpus.}

\item{len_mean}{Integer(s) indicating the mean of the document lengths.
If two numbers are provided, a number will be randomly sampled
within the range for each corpus.}

\item{len_var}{Integer(s) indicating the standard deviation
of the document lengths. If two numbers are provided,
a number will be randomly sampled
within the range for each corpus.}

\item{len_min}{Integer(s) indicating the minimum of the document lengths.
If two numbers are provided, a number will be randomly sampled
within the range for each corpus.}

\item{len_max}{Integer(s) indicating the maximum of the document lengths.
If two numbers are provided, a number will be randomly sampled
within the range for each corpus.}

\item{seed}{Optional seed for reproducibility}
}
\description{
\code{rancors_builder()} generates multiple random corpora (rancors) based on
user-defined term probabilities and vocabulary. Users can set the number of
documents, as well as the mean, standard deviation, minimum, and maximum
document lengths (i.e. number of tokens). The output is a list of
document-term matrices. To produce a \emph{single} random corpus, use
\code{rancor_builder()} (note the singular).
}
\examples{
# create corpus and DTM
my_corpus <- data.frame(
  text = c(
    "I hear babies crying I watch them grow",
    "They'll learn much more than I'll ever know",
    "And I think to myself",
    "What a wonderful world",
    "Yes I think to myself",
    "What a wonderful world"
  ),
  line_id = paste0("line", seq_len(6))
)
## some text preprocessing
my_corpus$clean_text <- tolower(gsub("'", "", my_corpus$text))

dtm <- dtm_builder(
  data = my_corpus,
  text = clean_text,
  doc_id = line_id
)

# use colSums to get term frequencies
df <- data.frame(
  vocab = colnames(dtm),
  freqs = colSums(dtm)
)
# convert to probabilities
df$probs <- df$freqs / sum(df$freqs)

# create a list of random DTMs
ls_dtms <- df |> 
rancors_builder(vocab,
   probs,
   n_cors = 20,
   n_docs = 100,
   len_mean = c(50, 200),
   len_var = 5,
   len_min = 20,
   len_max = 1000,
   seed = 59801
)
length(ls_dtms)

}
\author{
Dustin Stoltz and Marshall Taylor
}
