% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/Clones.R
\name{formatClones}
\alias{formatClones}
\title{Generate an ordered list of airrClone objects for lineage construction}
\usage{
formatClones(
  data,
  seq = "sequence_alignment",
  clone = "clone_id",
  subgroup = "clone_subgroup",
  id = "sequence_id",
  germ = "germline_alignment_d_mask",
  v_call = "v_call",
  j_call = "j_call",
  junc_len = "junction_length",
  mask_char = "N",
  max_mask = 0,
  pad_end = TRUE,
  text_fields = NULL,
  num_fields = NULL,
  seq_fields = NULL,
  add_count = TRUE,
  verbose = FALSE,
  collapse = TRUE,
  cell = "cell_id",
  locus = "locus",
  traits = NULL,
  mod3 = TRUE,
  randomize = TRUE,
  use_regions = TRUE,
  dup_singles = FALSE,
  nproc = 1,
  chain = "H",
  heavy = "IGH",
  filterStop = TRUE,
  minseq = 2,
  split_light = FALSE,
  majoronly = FALSE,
  columns = NULL
)
}
\arguments{
\item{data}{data.frame containing the AIRR or Change-O data for a clone.
See \link{makeAirrClone} for required columns and their defaults}

\item{seq}{name of the column containing observed DNA sequences. All 
sequences in this column must be multiple aligned.}

\item{clone}{name of the column containing the identifier for the clone. All 
entries in this column should be identical.}

\item{subgroup}{name of the column containing the identifier for the subgroup.}

\item{id}{name of the column containing sequence identifiers.}

\item{germ}{name of the column containing germline DNA sequences. All entries 
in this column should be identical for any given clone, and they
must be multiple aligned with the data in the \code{seq} column.}

\item{v_call}{name of the column containing V-segment allele assignments. All 
entries in this column should be identical to the gene level.}

\item{j_call}{name of the column containing J-segment allele assignments. All 
entries in this column should be identical to the gene level.}

\item{junc_len}{name of the column containing the length of the junction as a 
numeric value. All entries in this column should be identical 
for any given clone.}

\item{mask_char}{character to use for masking and padding.}

\item{max_mask}{maximum number of characters to mask at the leading and trailing
sequence ends. If \code{NULL} then the upper masking bound will 
be automatically determined from the maximum number of observed 
leading or trailing Ns amongst all sequences. If set to \code{0} 
(default) then masking will not be performed.}

\item{pad_end}{if \code{TRUE} pad the end of each sequence with \code{mask_char}
to make every sequence the same length.}

\item{text_fields}{text annotation columns to retain and merge during duplicate removal.}

\item{num_fields}{numeric annotation columns to retain and sum during duplicate removal.}

\item{seq_fields}{sequence annotation columns to retain and collapse during duplicate 
removal. Note, this is distinct from the \code{seq} and \code{germ} 
arguments, which contain the primary sequence data for the clone
and should not be repeated in this argument.}

\item{add_count}{if \code{TRUE} add an additional annotation column called 
\code{COLLAPSE_COUNT} during duplicate removal that indicates the 
number of sequences that were collapsed.}

\item{verbose}{passed on to \code{collapseDuplicates}. If \code{TRUE}, report the 
numbers of input, discarded and output sequences; otherwise, process
sequences silently.}

\item{collapse}{collapse identical sequences?}

\item{cell}{name of the column containing cell assignment information}

\item{locus}{name of the column containing locus information}

\item{traits}{column ids to keep distinct during sequence collapse}

\item{mod3}{pad sequences to length multiple of three?}

\item{randomize}{randomize sequence order? Important if using PHYLIP}

\item{use_regions}{assign CDR/FWR regions?}

\item{dup_singles}{Duplicate sequences in singleton clones to include them as trees?}

\item{nproc}{number of cores to parallelize formatting over.}

\item{chain}{if \code{"HL"}, include light chain information if available.}

\item{heavy}{name of heavy chain locus (default = "IGH")}

\item{filterStop}{only use sequences that do not contain an in-frame stop codon}

\item{minseq}{minimum number of sequences per clone}

\item{split_light}{split or lump subgroups? See \code{resolveLightChains}.}

\item{majoronly}{only return largest subgroup and sequences without light chains}

\item{columns}{additional data columns to include in output}
}
\value{
A tibble of \link{airrClone} objects containing modified clones.
}
\description{
\code{formatClones} takes a \code{data.frame} or \code{tibble} with AIRR or 
Change-O style columns as input and masks gap positions, masks ragged ends, 
removes duplicate sequences, and merges annotations associated with duplicate
sequences. If specified, it will un-merge duplicate sequences with different 
values specified in the \code{traits} option. It returns a tibble of \code{airrClone}
objects ordered by number of sequences, which serve as input for lineage reconstruction.
}
\details{
This function is a wrapper for \link{makeAirrClone}. It also removes whitespace,
\code{;}, \code{:}, and \code{=} from ids.
}
\examples{
data(ExampleAirr)
# Select two clones, for demonstration purpose
sel <- c("3170", "3184")
clones <- formatClones(ExampleAirr[ExampleAirr$clone_id \%in\% sel,],traits="sample_id")
}
\seealso{
Executes in order \link{makeAirrClone}. Returns a tibble of 
\link{airrClone} objects 
     which serve as input to \link{getTrees} and \link{findSwitches}.
}
