% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/growth.R
\name{cleangrowth}
\alias{cleangrowth}
\title{Clean growth measurements}
\usage{
cleangrowth(
  subjid,
  param,
  agedays,
  sex,
  measurement,
  recover.unit.error = FALSE,
  sd.extreme = 25,
  z.extreme = 25,
  lt3.exclude.mode = "default",
  height.tolerance.cm = 2.5,
  error.load.mincount = 2,
  error.load.threshold = 0.5,
  sd.recenter = NA,
  sdmedian.filename = "",
  sdrecentered.filename = "",
  include.carryforward = FALSE,
  ewma.exp = -1.5,
  ref.data.path = "",
  log.path = NA,
  parallel = FALSE,
  num.batches = NA,
  quietly = TRUE,
  adult_cutpoint = 20,
  weight_cap = Inf,
  adult_columns_filename = "",
  prelim_infants = FALSE
)
}
\arguments{
\item{subjid}{Vector of unique identifiers for each subject in the database.}

\item{param}{Vector identifying each measurement, may be 'WEIGHTKG', 'WEIGHTLBS', 'HEIGHTCM', 'HEIGHTIN', 'LENGTHCM', or 'HEADCM'.
'HEIGHTCM'/'HEIGHTIN' vs. 'LENGTHCM' only affects z-score calculations between ages 24 and 35 months (730 to 1095 days).
All linear measurements below 731 days of life (age 0-23 months) are interpreted as supine length, and
all linear measurements above 1095 days of life (age 36+ months) are interpreted as standing height.
Note: at the moment, all LENGTHCM will be converted to HEIGHTCM. In the future, the algorithm will be updated to consider this difference.
Additionally, imperial 'HEIGHTIN' and 'WEIGHTLBS' measurements are converted to
metric during algorithm calculations.}

\item{agedays}{Numeric vector containing the age in days at each measurement.}

\item{sex}{Vector identifying the gender of the subject, may be 'M', 'm', or 0 for males, vs. 'F', 'f' or 1 for females.}

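\item{measurement}{Numeric vector containing the actual measurement data. Units must match those
indicated by \code{param}: weight in kilograms (kg) for 'WEIGHTKG' or pounds (lbs) for 'WEIGHTLBS', and
linear measurements (height vs. length) in centimeters (cm) for 'HEIGHTCM'/'LENGTHCM' or inches (in) for 'HEIGHTIN'.}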

\item{recover.unit.error}{Indicates whether the cleaning algorithm should
attempt to identify unit errors (i.e., inches vs. cm, lbs vs. kg). If unit
errors are identified, the value will be corrected and retained within the
cleaning algorithm as a valid measurement.  Defaults to FALSE.}

\item{sd.extreme}{Measurements more than sd.extreme standard deviations from
the mean (either above or below) will be flagged as invalid. Defaults to 25.}

\item{z.extreme}{Measurements with an absolute z-score greater than
z.extreme will be flagged as invalid. Defaults to 25.}

\item{lt3.exclude.mode}{Determines type of exclusion procedure to use for 1 or 2 measurements of one type without
matching same ageday measurements for the other parameter. Options include "default" (standard growthcleanr approach),
and "flag.both" (in case of two measurements of one type without matching values for the other parameter, flag both
for exclusion if beyond threshold). Defaults to "default".}

\item{height.tolerance.cm}{Maximum decrease in height (in cm) tolerated for sequential measurements. Defaults to 2.5.}

\item{error.load.mincount}{Minimum count of exclusions on a parameter before
considering excluding all measurements. Defaults to 2.}

\item{error.load.threshold}{Threshold for the ratio of excluded measurement count to included measurement
count that must be exceeded before excluding all measurements of either parameter. Defaults to 0.5.}

\item{sd.recenter}{Specifies how to recenter medians. May be a data frame or
table with median SD-scores per day of life by gender and parameter, or "NHANES"
or "derive" as a character vector.
\itemize{
\item If \code{sd.recenter} is specified as a data set, use the data set
\item If \code{sd.recenter} is specified as "\code{nhanes}", use NHANES reference medians
\item If \code{sd.recenter} is specified as "\code{derive}", derive from input
\item If \code{sd.recenter} is not specified or \code{NA}:
\itemize{
\item If the input set has at least 5,000 observations, derive medians from input
\item If the input set has fewer than 5,000 observations, use NHANES
}
}

If specifying a data set, columns must include param, sex, agedays, and sd.median
(referred to elsewhere as "modified Z-score"), and those medians will be used
for recentering. A summary of how the NHANES reference medians were derived is
available in README.md. Defaults to NA.}

\item{sdmedian.filename}{Name of the CSV file in which to save the sd.median data calculated on the input dataset.
Defaults to "", for which this data will not be saved. Use for extracting medians for parallel processing
scenarios other than the built-in parallel option.}

\item{sdrecentered.filename}{Name of the CSV file in which to save the re-centered data. Defaults to "", for which this
data will not be saved. Useful for post-processing and debugging.}

\item{include.carryforward}{Determines whether Carry-Forward values are kept in the output. Defaults to FALSE.}

\item{ewma.exp}{Exponent to use for weighting measurements in the
exponentially weighted moving average calculations. Defaults to -1.5.
This exponent should be negative in order to weight growth measurements
closer to the measurement being evaluated more strongly. Exponents that are
further from zero (e.g. -3) will increase the relative influence of
measurements close in time to the measurement being evaluated compared to
using the default exponent.}

\item{ref.data.path}{Path to reference data. If not supplied, the year 2000
Centers for Disease Control (CDC) reference data will be used.}

\item{log.path}{Path to log file output when running in parallel (non-quiet mode). Default is NA. A new
directory will be created if necessary. Set to NA to disable log files.}

\item{parallel}{Determines if function runs in parallel.  Defaults to FALSE.}

\item{num.batches}{Specify the number of batches to run in parallel. Only
applies if parallel is set to TRUE. Defaults to the number of workers
returned by the getDoParWorkers function in the foreach package.}

\item{quietly}{Determines if function messages are to be displayed and if log files (parallel only) are to be generated.
Defaults to TRUE.}

\item{adult_cutpoint}{Number between 18 and 20, describing the age cutoff between
the pediatric algorithm (applied to ages < adult_cutpoint) and the adult
algorithm (applied to ages >= adult_cutpoint). Numbers outside this range will be
changed to the closest number within the range. Defaults to 20.}

\item{weight_cap}{Positive number, describing a weight cap in kg (rounded to the
nearest .1, +/- .1) within the adult dataset. If there is no weight cap, set
to Inf. Defaults to Inf.}

\item{adult_columns_filename}{Name of the CSV file in which to save the original adult data with additional
output columns. Defaults to "", for which this data will not be saved. Useful
for post-analysis. For more information on this output, please see README.}

\item{prelim_infants}{TRUE/FALSE. Run the in-development release of the infants algorithm (expands pediatric algorithm to improve performance for children 0-2 years). Not recommended for use in research. For more information regarding the logic of the algorithm, see the vignette 'Preliminary Infants Algorithm'. Defaults to FALSE.}
}
\value{
Vector of exclusion codes for each of the input measurements.

Possible values for each code are:
\itemize{
\item 'Include', 'Unit-Error-High', 'Unit-Error-Low', 'Swapped-Measurements', 'Missing',
\item 'Exclude-Carried-Forward', 'Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', 'Exclude-EWMA-Extreme-Pair',
\item 'Exclude-Extraneous-Same-Day',
\item 'Exclude-EWMA-8', 'Exclude-EWMA-9', 'Exclude-EWMA-10', 'Exclude-EWMA-11', 'Exclude-EWMA-12', 'Exclude-EWMA-13', 'Exclude-EWMA-14',
\item 'Exclude-Min-Height-Change', 'Exclude-Max-Height-Change',
\item 'Exclude-Pair-Delta-17', 'Exclude-Pair-Delta-18', 'Exclude-Pair-Delta-19',
\item 'Exclude-Single-Outlier', 'Exclude-Too-Many-Errors', 'Exclude-Too-Many-Errors-Other-Parameter'
}
}
\description{
Clean growth measurements
}
\examples{
\donttest{
# Run calculation using a small subset of given data
df_stats <- as.data.frame(syngrowth)
df_stats <- df_stats[df_stats$subjid \%in\% unique(df_stats[, "subjid"])[1:5], ]

clean_stats <-cleangrowth(subjid = df_stats$subjid,
                         param = df_stats$param,
                         agedays = df_stats$agedays,
                         sex = df_stats$sex,
                         measurement = df_stats$measurement)

# Once processed you can filter data based on result value
df_stats <- cbind(df_stats, "clean_result" = clean_stats)
clean_df_stats <- df_stats[df_stats$clean_result == "Include",]

# Parallel processing: run using 2 cores and batches
clean_stats <- cleangrowth(subjid = df_stats$subjid,
                           param = df_stats$param,
                           agedays = df_stats$agedays,
                           sex = df_stats$sex,
                           measurement = df_stats$measurement,
                           parallel = TRUE,
                           num.batches = 2)
}
}
