% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/numbers.R
\name{prep_outliers}
\alias{prep_outliers}
\title{Outliers Data Preparation}
\usage{
prep_outliers(data, str_input, type = c("stop", "set_na"), top_percent,
  bottom_percent)
}
\arguments{
\item{data}{data frame}

\item{str_input}{string input variable (if empty, it runs for all numeric variables).}

\item{type}{can be 'stop' or 'set_na'. In the first case the original variable is stopped at the desired percentile; 'set_na' sets those same values to NA.}

\item{top_percent}{value from 0 to 1, represents the highest X percentage of values to treat}

\item{bottom_percent}{value from 0 to 1, represents the lowest X percentage of values to treat}
}
\value{
A data frame with the desired outlier transformation
}
\description{
Deal with outliers by setting them to NA or by 'stopping' them at a certain value. The parameters 'top_percent' and 'bottom_percent' determine which values are considered outliers.

Setting NA is recommended when doing statistical analysis, parameter: type='set_na'.
Stopping is recommended when creating a predictive model without biasing the result due to outliers, parameter: type='stop'.

Automation: `prep_outliers` skips all factor/char columns, so it can receive a whole data frame, treat the outliers and finally return the cleaned data.
}
\examples{
# Build a data frame that contains outliers
set.seed(10)
df <- data.frame(var1 = rchisq(1000, df = 1), var2 = rnorm(1000))
df <- rbind(df, 1135, 2432) # append two extreme rows to force outliers
df$id <- as.character(seq_len(1002))

# For var1 the mean is ~ 4.56 and the max is 2432
summary(df)

########################################################
### PREPARING OUTLIERS FOR DESCRIPTIVE STATISTICS
########################################################

#### EXAMPLE 1: Removing the top 1\% for a single variable
# Check the cut-off for the top 1\% of highest values (percentile 0.99),
# which is ~ 7.05
quantile(df$var1, 0.99)

# With type='set_na' the highest values are set to NA
var1_treated <- prep_outliers(
  data = df,
  str_input = 'var1',
  type = 'set_na',
  top_percent = 0.01
)

# The mean (~ 0.94) is now more accurate; note that the 1st quartile, median
# and 3rd quartile remain very similar to those of the original variable.
summary(var1_treated)

#### EXAMPLE 2: When 'str_input' is missing, it runs for all numeric variables
#  (those which have 3 or more distinct values).
df_treated2 <- prep_outliers(data = df, type = 'set_na', top_percent = 0.01)
summary(df_treated2)

#### EXAMPLE 3: Removing the top 1\% (and bottom 1\%) for 'N' specific variables.
vars_to_process <- c('var1', 'var2')
df_treated3 <- prep_outliers(
  data = df,
  str_input = vars_to_process,
  type = 'set_na',
  bottom_percent = 0.01,
  top_percent = 0.01
)
summary(df_treated3)

########################################################
### PREPARING OUTLIERS FOR PREDICTIVE MODELING
########################################################

#### EXAMPLE 4: Stopping outliers at the top 1\% value for all variables.
#   For example, if the top 1\% has a value of 7, then all values above it
#   will be set to 7. Useful when modeling because outlier cases can be used.
df_treated4 <- prep_outliers(data = df, top_percent = 0.01, type = 'stop')
}

