% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/applyQC.R
\name{cleanData}
\alias{cleanData}
\title{Create plink dataset with individuals and markers passing quality control}
\usage{
cleanData(
  indir,
  name,
  qcdir = indir,
  filterSex = TRUE,
  filterHeterozygosity = TRUE,
  filterSampleMissingness = TRUE,
  filterRelated = TRUE,
  filterAncestry = TRUE,
  filterSNPMissingness = TRUE,
  lmissTh = 0.01,
  filterHWE = TRUE,
  hweTh = 1e-05,
  filterMAF = TRUE,
  macTh = 20,
  mafTh = NULL,
  path2plink = NULL,
  verbose = FALSE,
  keep_individuals = NULL,
  remove_individuals = NULL,
  exclude_markers = NULL,
  extract_markers = NULL,
  showPlinkOutput = TRUE
)
}
\arguments{
\item{indir}{[character] /path/to/directory containing the basic PLINK data
files name.bim, name.bed and name.fam.}

\item{name}{[character] Prefix of PLINK files, i.e. name.bed, name.bim,
name.fam.}

\item{qcdir}{[character] /path/to/directory where results will be written to.
If \code{\link{perIndividualQC}} was conducted, this directory should be the
same as qcdir specified in \code{\link{perIndividualQC}}, i.e. it contains
name.fail.IDs with IIDs of individuals that failed QC. User needs writing
permission to qcdir. Per default, qcdir=indir.}

\item{filterSex}{[logical] Set to exclude samples that failed the sex
check (via \code{\link{check_sex}} or \code{\link{perIndividualQC}}).
Requires file qcdir/name.fail-sexcheck.IDs (automatically created by
\code{\link{perIndividualQC}} if do.evaluate_check_sex set to TRUE).}

\item{filterHeterozygosity}{[logical] Set to exclude samples that failed
check for outlying heterozygosity rates (via
\code{\link{check_het_and_miss}} or
\code{\link{perIndividualQC}}). Requires file qcdir/name.fail-het.IDs
(automatically created by \code{\link{perIndividualQC}} if
do.evaluate_check_het_and_miss set to TRUE).}

\item{filterSampleMissingness}{[logical] Set to exclude samples that failed
check for excessive missing genotype rates (via
\code{\link{check_het_and_miss}} or
\code{\link{perIndividualQC}}). Requires file qcdir/name.fail-imiss.IDs
(automatically created by \code{\link{perIndividualQC}} if
do.evaluate_check_het_and_miss set to TRUE).}

\item{filterRelated}{[logical] Set to exclude samples that failed relatedness
check (via \code{\link{check_relatedness}} or \code{\link{perIndividualQC}}).
Requires file qcdir/name.fail-IBD.IDs (automatically created by
\code{\link{perIndividualQC}} if do.evaluate_check_relatedness set to TRUE).}

\item{filterAncestry}{[logical] Set to exclude samples that are
excluded for ancestry (via \code{\link{ancestry_prediction}} or 
\code{\link{perIndividualQC}}).
Requires file qcdir/name.exclude-ancestry.IDs (automatically created by
\code{\link{perIndividualQC}} if do.evaluate_check_ancestry set to TRUE).}

\item{filterSNPMissingness}{[logical] Set to exclude markers that have
excessive missing rates across samples (via
\code{\link{check_snp_missingness}} or \code{\link{perMarkerQC}}). Requires
lmissTh to be set.}

\item{lmissTh}{[double] Threshold for acceptable variant missing rate across
samples.}

\item{filterHWE}{[logical] Set to exclude markers that fail HWE exact test
(via \code{\link{check_hwe}} or \code{\link{perMarkerQC}}). Requires hweTh to
be set.}

\item{hweTh}{[double] Significance threshold for deviation from HWE.}

\item{filterMAF}{[logical] Set to exclude markers that fail minor allele
frequency or minor allele count threshold (via \code{\link{check_maf}} or
\code{\link{perMarkerQC}}). Requires mafTh or macTh to be set.}

\item{macTh}{[double] Threshold for minor allele count cut-off, if both mafTh
and macTh are specified, macTh is used (macTh = mafTh*2*NrSamples).}

\item{mafTh}{[double] Threshold for minor allele frequency cut-off.}

\item{path2plink}{[character] Absolute path to PLINK executable
(\url{https://www.cog-genomics.org/plink/1.9/}) i.e.
plink should be accessible as path2plink -h. The full name of the executable
should be specified: for windows OS, this means path/plink.exe, for unix
platforms this is path/plink. If not provided, assumed that PATH set-up works
and PLINK will be found by \code{\link[sys]{exec}}('plink').}

\item{verbose}{[logical] If TRUE, progress info is printed to standard out.}

\item{keep_individuals}{[character] Path to file with individuals to be
retained in the analysis. The file has to be a space/tab-delimited text file
with family IDs in the first column and within-family IDs in the second
column. All samples not listed in this file will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
Default: NULL, i.e. no filtering on individuals.}

\item{remove_individuals}{[character] Path to file with individuals to be
removed from the analysis. The file has to be a space/tab-delimited text file
with family IDs in the first column and within-family IDs in the second
column. All samples listed in this file will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
Default: NULL, i.e. no filtering on individuals.}

\item{exclude_markers}{[character] Path to file with markers to be
removed from the analysis. The file has to be a text file with a list of
variant IDs (usually one per line, but it's okay for them to just be
separated by spaces). All listed variants will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
Default: NULL, i.e. no filtering on markers.}

\item{extract_markers}{[character] Path to file with markers to be
included in the analysis. The file has to be a text file with a list of
variant IDs (usually one per line, but it's okay for them to just be
separated by spaces). All unlisted variants will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
Default: NULL, i.e. no filtering on markers.}

\item{showPlinkOutput}{[logical] If TRUE, plink log and error messages are
printed to standard out.}
}
\value{
Named [list] with i) passIDs, containing a [data.frame] with family
[FID] and individual [IID] IDs of samples that pass the QC, ii) failIDs,
containing a [data.frame] with family [FID] and individual [IID] IDs of
samples that fail the QC.
}
\description{
Individuals that fail per-individual QC and markers that fail
per-marker QC are removed from indir/name.bim/.bed/.fam and a new dataset
with the remaining individuals and markers is created as
qcdir/name.clean.bim/.bed/.fam.
}
\examples{
package.dir <- find.package('plinkQC')
indir <- file.path(package.dir, 'extdata')
qcdir <- tempdir()
name <- "data"
path2plink <- '/path/to/plink'
# the following code is not run on package build, as the path2plink on the
# user system is not known.
\dontrun{
# Run qc on all samples and markers in the dataset
## Run individual QC checks
fail_individuals <- perIndividualQC(indir=indir, qcdir=qcdir, name=name,
refSamplesFile=paste(qcdir, "/HapMap_ID2Pop.txt",sep=""),
refColorsFile=paste(qcdir, "/HapMap_PopColors.txt", sep=""),
prefixMergedDataset="data.HapMapIII", interactive=FALSE, verbose=FALSE,
path2plink=path2plink)

## Run marker QC checks
fail_markers <- perMarkerQC(indir=indir, qcdir=qcdir, name=name,
path2plink=path2plink)

## Create new dataset of individuals and markers passing QC
ids_all <- cleanData(indir=indir, qcdir=qcdir, name=name, macTh=15,
verbose=TRUE, path2plink=path2plink,
filterRelated=TRUE)

# Run qc on subset of samples and markers in the dataset
highlight_samples <- read.table(system.file("extdata", "keep_individuals",
package="plinkQC"))
remove_individuals_file <- system.file("extdata", "remove_individuals",
package="plinkQC")

fail_individuals <- perIndividualQC(indir=indir, qcdir=qcdir, name=name,
 interactive=FALSE, verbose=FALSE,
highlight_samples = highlight_samples[,2], highlight_type = "label",
remove_individuals = remove_individuals_file, path2plink=path2plink)

## Run marker QC checks
fail_markers <- perMarkerQC(indir=indir, qcdir=qcdir, name=name,
path2plink=path2plink)

## Create new dataset of individuals and markers passing QC
ids_all <- cleanData(indir=indir, qcdir=qcdir, name=name, macTh=15,
verbose=TRUE, path2plink=path2plink, 
remove_individuals = remove_individuals_file)
}
}
