% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus.R
\name{corpus_install}
\alias{corpus_install}
\alias{corpus_packages}
\alias{corpus_rename}
\alias{corpus_remove}
\alias{corpus_as_tarball}
\alias{corpus_copy}
\alias{corpus_recode}
\title{Install and manage corpora.}
\usage{
corpus_install(pkg = NULL,
  repo = "http://polmine.sowi.uni-due.de/packages", tarball = NULL,
  lib = .libPaths()[1], verbose = TRUE, user = NULL,
  password = NULL, ...)

corpus_packages()

corpus_rename(old, new, registry_dir = Sys.getenv("CORPUS_REGISTRY"),
  verbose = TRUE)

corpus_remove(corpus, registry_dir = Sys.getenv("CORPUS_REGISTRY"))

corpus_as_tarball(corpus, registry_dir, tarfile, verbose = TRUE)

corpus_copy(corpus, registry_dir, data_dir = NULL,
  registry_dir_new = file.path(normalizePath(tempdir(), winslash = "/"),
  "cwb", "registry", fsep = "/"),
  data_dir_new = file.path(normalizePath(tempdir(), winslash = "/"),
  "cwb", "indexed_corpora", tolower(corpus), fsep = "/"),
  verbose = interactive(), progress = TRUE)

corpus_recode(corpus, registry_dir = Sys.getenv("CORPUS_REGISTRY"),
  data_dir = registry_file_parse(corpus, registry_dir)[["home"]],
  skip = character(), to = c("latin1", "UTF-8"), verbose = TRUE)
}
\arguments{
\item{pkg}{Name of the data package.}

\item{repo}{URL of the repository.}

\item{tarball}{The URL or local path to a tarball with a CWB indexed corpus.}

\item{lib}{Directory for R packages, defaults to \code{.libPaths()[1]}.}

\item{verbose}{Logical, whether to be verbose.}

\item{user}{A user name that can be specified to download a corpus from a password protected site.}

\item{password}{A password that can be specified to download a corpus from a password protected site.}

\item{...}{Further parameters that will be passed into
\code{install.packages}, if argument \code{tarball} is \code{NULL}, or into
\code{download.file}, if \code{tarball} is specified.}

\item{old}{Name of the (old) corpus.}

\item{new}{Name of the (new) corpus.}

\item{registry_dir}{Directory of registry.}

\item{corpus}{A CWB corpus.}

\item{tarfile}{Filename of tarball.}

\item{data_dir}{The data directory where the files of the CWB corpus live.}

\item{registry_dir_new}{Target directory for (new) registry files.}

\item{data_dir_new}{Target directory for corpus files.}

\item{progress}{Logical, whether to show a progress bar.}

\item{skip}{A character vector with s_attributes to skip.}

\item{to}{Character string describing the target encoding of the corpus.}
}
\description{
Utility functions to keep the installation of indexed CWB corpora wrapped
into R data packages simple.
}
\details{
A data package with a CWB corpus is assumed to include a directory
  \code{/extdata/cwb/registry} for registry files and a directory
  \code{/extdata/cwb/indexed_corpora} for the indexed corpus files. The
  \code{corpus_install} function combines two steps necessary to install a
  CWB corpus. First, it calls \code{install.packages}, then it resets the
  path pointing to the directory with the indexed corpus files in the
  registry file. The package will be installed to the standard library
  directory for installing R packages (\code{.libPaths()[1]}). Another
  location can be used by stating the param 'lib' explicitly (see
  documentation for \code{\link{install.packages}}).
  The function can also be used to install a corpus from a password protected
  repository. Further parameters are handed over to \code{install.packages}, so you
  might add \code{method = "wget", extra = "--user donald --password duck"}.
  See examples how to check whether the directory has been set correctly.

\code{corpus_packages} will detect the packages that include CWB
  corpora. Note that the directory structure of all installed packages is
  evaluated which may be slow on network-mounted file systems.

\code{corpus_rename} will rename a corpus, affecting the name of the
  registry file, the corpus id, and the name of the directory where data
  files reside.

\code{corpus_remove} can be used to drop a corpus.

\code{corpus_as_tarball} will create a tarball (.tar.gz-file) with
  two subdirectories. The 'registry' subdirectory will host the registry file
  for the tarred corpus. The data files will be put in a subdirectory with
  the corpus name in the 'indexed_corpora' subdirectory.

\code{corpus_copy} will create a copy of a corpus (useful for
  experimental modifications, for instance).
}
\examples{
# ---- Example 1: corpus_copy() with default target directories ----
# Path of the registry file for REUTERS below the cwb/registry directory
# in the session temp dir, which is the default value of registry_dir_new.
registry_file_new <- file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", "registry", "reuters", fsep = "/"
  )
# Remove a registry file left over from an earlier run, if there is one.
if (file.exists(registry_file_new)) file.remove(registry_file_new)
# Copy the REUTERS corpus shipped with the RcppCWB package. The arguments
# registry_dir_new and data_dir_new are not set, so the defaults apply.
corpus_copy(
  corpus = "REUTERS",
  registry_dir = system.file(package = "RcppCWB", "extdata", "cwb", "registry"),
  data_dir = system.file(
    package = "RcppCWB",
    "extdata", "cwb", "indexed_corpora", "reuters"
  )
)
# Clean up: remove the cwb directory in the session temp dir.
unlink(file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", fsep = "/"),
  recursive = TRUE)

# ---- Example 2: copy a corpus, recode it to UTF-8 and query the copy ----
# Settings used throughout the rest of the example.
corpus <- "REUTERS"
pkg <- "RcppCWB"
s_attr <- "places"
Q <- '"oil"'

# Source directories: registry and indexed corpus files inside the package.
registry_dir_src <- system.file(package = pkg, "extdata", "cwb", "registry")
data_dir_src <- system.file(package = pkg, "extdata", "cwb", "indexed_corpora", tolower(corpus))

# Target directories and target registry file in the session temp dir.
registry_dir_tmp <- file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", "registry", fsep = "/"
)
registry_file_tmp <- file.path(registry_dir_tmp, tolower(corpus), fsep = "/")
data_dir_tmp <- file.path(
  normalizePath(tempdir(), winslash = "/"),
  "cwb", "indexed_corpora", tolower(corpus), fsep = "/"
)

# Start from a clean slate: drop a stale registry file, then either create
# the target data directory or remove the files it already contains.
if (file.exists(registry_file_tmp)) file.remove(registry_file_tmp)
if (!dir.exists(data_dir_tmp)){
   dir.create(data_dir_tmp, recursive = TRUE)
} else {
  if (length(list.files(data_dir_tmp)) > 0L)
    file.remove(list.files(data_dir_tmp, full.names = TRUE))
}

# Copy the corpus, this time stating the target directories explicitly.
corpus_copy(
  corpus = corpus,
  registry_dir = registry_dir_src,
  data_dir = data_dir_src,
  registry_dir_new = registry_dir_tmp,
  data_dir_new = data_dir_tmp
)

# Charset name reported for the copy before recoding.
RcppCWB::cl_charset_name(corpus = corpus, registry = registry_dir_tmp)

# Recode the copied corpus, with UTF-8 as the target encoding.
corpus_recode(
  corpus = corpus,
  registry_dir = registry_dir_tmp,
  data_dir = data_dir_tmp,
  to = "UTF-8"
)

# Delete the corpus from RcppCWB and initialize CQP with the temporary
# registry, then ask for the charset name again. Presumably the delete and
# re-initialize steps make RcppCWB pick up the recoded corpus - confirm
# against the RcppCWB documentation.
RcppCWB::cl_delete_corpus(corpus = corpus, registry = registry_dir_tmp)
RcppCWB::cqp_initialize(registry_dir_tmp)
RcppCWB::cl_charset_name(corpus = corpus, registry = registry_dir_tmp)

# Decode the values of the structural attribute s_attr: get the attribute
# size, build the struc indices 0 to n_strucs - 1 and turn them into strings.
n_strucs <- RcppCWB::cl_attribute_size(
  corpus = corpus, attribute = s_attr, attribute_type = "s", registry = registry_dir_tmp
)
strucs <- 0L:(n_strucs - 1L)
struc_values <- RcppCWB::cl_struc2str(
  corpus = corpus, s_attribute = s_attr, struc = strucs, registry = registry_dir_tmp
)
# NOTE(review): despite its name, this variable holds the unique values of
# s_attr, which is set to places above.
speakers <- unique(struc_values)

# Point the CORPUS_REGISTRY environment variable to the temporary registry.
# If CQP is initialized already, reset its registry, otherwise initialize it.
Sys.setenv("CORPUS_REGISTRY" = registry_dir_tmp)
if (RcppCWB::cqp_is_initialized()) RcppCWB::cqp_reset_registry() else RcppCWB::cqp_initialize()
# Run the CQP query Q on the copy and dump the resulting subcorpus.
RcppCWB::cqp_query(corpus = corpus, query = Q)
cpos <- RcppCWB::cqp_dump_subcorpus(corpus = corpus)
# Map the dumped corpus positions to ids of the p-attribute word and then
# to strings, and show the distinct strings.
ids <- RcppCWB::cl_cpos2id(
  corpus = corpus, p_attribute = "word", registry = registry_dir_tmp, cpos = cpos
)
str <- RcppCWB::cl_id2str(
  corpus = corpus, p_attribute = "word", registry = registry_dir_tmp, id = ids
)
unique(str)

# Clean up: remove the cwb directory in the session temp dir.
unlink(file.path(normalizePath(tempdir(), winslash = "/"), "cwb", fsep = "/"), recursive = TRUE)
}
\seealso{
For managing registry files, see \code{\link{registry_file_parse}}
for switching to a packaged corpus.
}
