% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/harmonize_sumstats.R
\name{harmonize_sumstats}
\alias{harmonize_sumstats}
\title{Harmonizing GWAS summary to reference data}
\usage{
harmonize_sumstats(
  sumstats,
  x,
  match_by_id = TRUE,
  check_strand_flip = FALSE,
  return_indice = FALSE
)
}
\arguments{
\item{sumstats}{A data frame with two columns: "id" and "pvalue".
\itemize{
\item id = SNP ID (e.g., rs numbers)
\item pvalue = SNP-level p value
}

If \code{match_by_id = FALSE}, it requires additional columns: "chr", "pos", "A1"
and "A2".
\itemize{
\item chr =  chromosome
\item pos =  base-pair position (must be integer)
\item A1, A2 = allele codes (allele order is not important)
}}

\item{x}{A \code{bed.matrix} object created using the reference data.}

\item{match_by_id}{If \code{TRUE}, SNP matching will be performed by SNP IDs
instead of genomic position and allele codes. Default is \code{TRUE}.}

\item{check_strand_flip}{Only applies when \code{match_by_id = FALSE}. If \code{TRUE},
the function 1) removes ambiguous A/T and G/C SNPs for which the strand is
not obvious, and 2) attempts to find additional matching entries by
flipping allele codes (i.e., A->T, T->A, C->G, G->C). If the GWAS genotype
data itself is used as the reference data, it would be safe to set
\code{FALSE}. Default is \code{FALSE}.}

\item{return_indice}{Only applies when \code{match_by_id = FALSE}. If \code{TRUE}, the
function provides an additional column indicating whether the match is
with swapped alleles. If \code{check_strand_flip = TRUE}, the function also
provides an additional column indicating whether the match is with flipped
strand. Unnecessary for gene-based tests in this package, but may be
useful for other purposes (e.g., harmonization for meta-analysis that
needs to flip the sign of beta for a match with swapped alleles).}
}
\value{
A data frame with columns: "id", "chr", "pos", "A1", "A2" and
"pvalue". If \code{return_indice = TRUE}, the data frame includes additional
columns \code{key_}, \code{swapped_}, and \code{flipped_}. \code{key_} is "chr_pos_A1_A2" in
\code{sumstats} (the original input before harmonization). \code{swapped_} contains a
logical vector indicating reference allele swap. \code{flipped_} contains a
logical vector indicating strand flip.
}
\description{
Finds an intersection of variants between GWAS summary and reference data.
}
\details{
Pre-processing of GWAS summary data is required because the sets of variants
available in a particular GWAS might be poorly matched to the variants in
reference data. SNP matching can be performed either 1) by SNP ID or 2) by
chromosome code, base-pair position, and allele codes, while taking into
account possible strand flips and reference allele swap. For matched
entries, the SNP IDs in GWAS summary data are replaced with the ones in the
reference data.
}
\examples{
\dontshow{data.table::setDTthreads(1)}
## GWAS summary statistics
head(exGWAS)

## Load reference genotype data
bfile <- system.file("extdata", "example.bed", package = "snpsettest")
x <- read_reference_bed(path = bfile)

## Harmonize by SNP IDs
hsumstats1 <- harmonize_sumstats(exGWAS, x)

## Harmonize by genomic position and allele codes
## Reference allele swap will be taken into account
hsumstats2 <- harmonize_sumstats(exGWAS, x, match_by_id = FALSE)

## Check matching entries by flipping allele codes
## Ambiguous SNPs will be excluded from harmonization
hsumstats3 <- harmonize_sumstats(exGWAS, x, match_by_id = FALSE,
                                 check_strand_flip = TRUE)
}
