% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/utilities.R
\name{utils_stats}
\alias{utils_stats}
\alias{av_dev}
\alias{ci_mean_t}
\alias{ci_mean_z}
\alias{cv}
\alias{freq_table}
\alias{freq_hist}
\alias{hmean}
\alias{gmean}
\alias{kurt}
\alias{n_missing}
\alias{n_unique}
\alias{n_valid}
\alias{pseudo_sigma}
\alias{range_data}
\alias{row_col_mean}
\alias{row_col_sum}
\alias{sd_amo}
\alias{sd_pop}
\alias{sem}
\alias{skew}
\alias{sum_dev}
\alias{ave_dev}
\alias{sum_sq_dev}
\alias{sum_sq}
\alias{var_pop}
\alias{var_amo}
\alias{cv_by}
\alias{max_by}
\alias{min_by}
\alias{means_by}
\alias{mean_by}
\alias{n_by}
\alias{sd_by}
\alias{var_by}
\alias{sem_by}
\alias{sum_by}
\title{Useful functions for computing descriptive statistics}
\usage{
av_dev(.data, ..., na.rm = FALSE)

ci_mean_t(.data, ..., na.rm = FALSE, level = 0.95)

ci_mean_z(.data, ..., na.rm = FALSE, level = 0.95)

cv(.data, ..., na.rm = FALSE)

freq_table(.data, var, k = NULL, digits = 3)

freq_hist(
  table,
  xlab = NULL,
  ylab = NULL,
  fill = "gray",
  color = "black",
  ygrid = TRUE
)

hmean(.data, ..., na.rm = FALSE)

gmean(.data, ..., na.rm = FALSE)

kurt(.data, ..., na.rm = FALSE)

n_missing(.data, ..., na.rm = FALSE)

n_unique(.data, ..., na.rm = FALSE)

n_valid(.data, ..., na.rm = FALSE)

pseudo_sigma(.data, ..., na.rm = FALSE)

range_data(.data, ..., na.rm = FALSE)

row_col_mean(.data, na.rm = FALSE)

row_col_sum(.data, na.rm = FALSE)

sd_amo(.data, ..., na.rm = FALSE)

sd_pop(.data, ..., na.rm = FALSE)

sem(.data, ..., na.rm = FALSE)

skew(.data, ..., na.rm = FALSE)

sum_dev(.data, ..., na.rm = FALSE)

ave_dev(.data, ..., na.rm = FALSE)

sum_sq_dev(.data, ..., na.rm = FALSE)

sum_sq(.data, ..., na.rm = FALSE)

var_pop(.data, ..., na.rm = FALSE)

var_amo(.data, ..., na.rm = FALSE)

cv_by(.data, ..., .vars = NULL, na.rm = FALSE)

max_by(.data, ..., .vars = NULL, na.rm = FALSE)

min_by(.data, ..., .vars = NULL, na.rm = FALSE)

means_by(.data, ..., .vars = NULL, na.rm = FALSE)

mean_by(.data, ..., .vars = NULL, na.rm = FALSE)

n_by(.data, ..., .vars = NULL, na.rm = FALSE)

sd_by(.data, ..., .vars = NULL, na.rm = FALSE)

var_by(.data, ..., .vars = NULL, na.rm = FALSE)

sem_by(.data, ..., .vars = NULL, na.rm = FALSE)

sum_by(.data, ..., .vars = NULL, na.rm = FALSE)
}
\arguments{
\item{.data}{A data frame or a numeric vector.}

\item{...}{The argument depends on the function used.
\itemize{
\item For \verb{*_by} functions, \code{...} is one or more categorical variables
for grouping the data. Then the statistic required will be computed for all
numeric variables in the data. If no variables are supplied in \code{...},
the statistic will be computed ignoring all non-numeric variables in
\code{.data}.
\item For the other statistics, \code{...} is a comma-separated list of unquoted
variable names to compute the statistics. If no variables are supplied in
\code{...}, the statistic will be computed for all numeric variables in
\code{.data}.
}}

\item{na.rm}{If \code{FALSE}, the default, missing values are removed with a
warning. If \code{TRUE}, missing values are silently removed.}

\item{level}{The confidence level for the confidence interval of the mean.
Defaults to 0.95.}

\item{var}{The variable to compute the frequency table. See \code{Details} for
more details.}

\item{k}{The number of classes to be created. See \code{Details} for
more details.}

\item{digits}{The number of significant figures to show. Defaults to 3.}

\item{table}{A frequency table computed with \code{\link[=freq_table]{freq_table()}}.}

\item{xlab, ylab}{The \code{x} and \code{y} labels.}

\item{fill, color}{The color to fill the bars and color the border of the bar,
respectively.}

\item{ygrid}{Shows a grid line on the \code{y} axis? Defaults to \code{TRUE}.}

\item{.vars}{Used to select variables in the \verb{*_by()} functions. One or more
unquoted expressions separated by commas. Variable names can be used as if
they were positions in the data frame, so expressions like \code{x:y} can be
used to select a range of variables. Defaults to \code{NULL} (all numeric
variables are analyzed).}
}
\value{
\itemize{
\item Functions \verb{*_by()} return a \code{tbl_df} with the computed statistics by
each level of the factor(s) declared in \code{...}.
\item All other functions return a named integer if the input is a data frame
or a numeric value if the input is a numeric vector.
\item \code{freq_table()} returns a list with the frequency table and the breaks used
for class definition. These breaks can be used to construct a histogram of
the variable.
}
}
\description{
\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#stable}{\figure{lifecycle-stable.svg}{options: alt='[Stable]'}}}{\strong{[Stable]}}
\itemize{
\item \strong{The following functions compute descriptive statistics by levels of
a factor or combination of factors quickly.}
\itemize{
\item \code{cv_by()} For computing coefficient of variation.
\item \code{max_by()} For computing maximum values.
\item \code{mean_by()} For computing arithmetic means.
\item \code{min_by()} For computing minimum values.
\item \code{n_by()} For getting the length.
\item \code{sd_by()} For computing sample standard deviation.
\item \code{var_by()} For computing sample variance.
\item \code{sem_by()} For computing standard error of the mean.
}
\item \strong{Useful functions for descriptive statistics. All of them work
naturally with \verb{\%>\%}, handle grouped data and multiple variables (all
numeric variables from \code{.data} by default).}
\itemize{
\item \code{av_dev()} computes the average absolute deviation.
\item \code{ci_mean_t()} computes the t-interval for the mean.
\item \code{ci_mean_z()} computes the z-interval for the mean.
\item \code{cv()} computes the coefficient of variation.
\item \code{freq_table()} Computes a frequency table for either numeric or
categorical/discrete data. For numeric data, it is possible to define the
number of classes to be generated.
\item \verb{hmean(), gmean()} computes the harmonic and geometric means,
respectively. The harmonic mean is the reciprocal of the arithmetic mean of
the reciprocals. The geometric mean is the \emph{n}th root of \emph{n}
products.
\item \code{kurt()} computes the kurtosis as used in SAS and SPSS.
\item \code{range_data()} Computes the range of the values.
\item \code{n_valid()} The number of valid (not \code{NA}) values in the data.
\item \code{n_unique()} Number of unique values.
\item \code{n_missing()} Number of missing values.
\item \verb{row_col_mean(), row_col_sum()} Adds a row with the mean/sum of
each variable and a column with the mean/sum for each row of the data.
\item \verb{sd_amo(), sd_pop()} Computes sample and populational standard
deviation, respectively.
\item \code{sem()} computes the standard error of the mean.
\item \code{skew()} computes the skewness as used in SAS and SPSS.
\item \code{ave_dev()} computes the average of the absolute deviations.
\item \code{sum_dev()} computes the sum of the absolute deviations.
\item \code{sum_sq()} computes the sum of the squared values.
\item \code{sum_sq_dev()} computes the sum of the squared deviations.
\item \verb{var_amo(), var_pop()} computes sample and populational variance.
}
}

\code{\link[=desc_stat]{desc_stat()}} is a wrapper function around the above ones and can be
used to quickly compute all these statistics at once.
}
\details{
The function \code{freq_table()} computes a frequency table for either
numerical or categorical variables. If a variable is categorical or
discrete (integer values), the number of classes will be the number of
levels that the variable contains.

If a variable (say, data) is continuous, the number of classes (k) is given by
the square root of the number of samples (n) if \verb{n <= 100} or \code{5 * log10(n)}
if \code{n > 100}.

The amplitude (\mjseqn{A}) of the data is used to define the size of the class (\mjseqn{c}),
given by

\loadmathjax
\mjsdeqn{c = \frac{A}{k - 1}}

The lower limit of the first class (LL1) is given by min(data) - c / 2. The
upper limit is given by LL1 + c. The limits of the other classes are given in
the same way. After the creation of the classes, the absolute and relative
frequencies within each class are computed.
}
\examples{
\donttest{
library(metan)
# means of all numeric variables by ENV
mean_by(data_ge2, GEN, ENV)

# Coefficient of variation for all numeric variables
# by GEN and ENV
cv_by(data_ge2, GEN, ENV)

# Skewness of a numeric vector
set.seed(1)
nvec <- rnorm(200, 10, 1)
skew(nvec)

# Confidence interval 0.95 for the mean
# All numeric variables
# Grouped by levels of ENV
data_ge2 \%>\%
  group_by(ENV) \%>\%
  ci_mean_t()

# standard error of the mean
# Variable PH and EH
sem(data_ge2, PH, EH)

# Frequency table for variable NR
data_ge2 \%>\%
  freq_table(NR)
}

}
\references{
Ferreira, Daniel Furtado. 2009. Estatistica Basica. 2 ed. Vicosa,
MG: UFLA.
}
\author{
Tiago Olivoto \email{tiagoolivoto@gmail.com}
}
