#=======================================================
#' @title Simulated genotypes
#'
#' @description Simulate SNP genotype data from a pedigree, with optional
#'  missingess and errors.
#'
#' @details Provide either a pedigree dataframe, or the name of a text file
#' containing
#'   the pedigree. Please ensure the pedigree is a valid pedigree, for example
#'   by first running fixPedigree() from library Pedantics.
#'
#' Errors are generated by replacing randomly chosen genotypes with random
#'  genotypes, with equal probabilities. As this will not result in a change in
#'  genotype in around 1/3rd of cases, the number of replaced genotypes is nSnp
#'  X n individuals X error rate X 3/2
#'
#' @param Ped  Dataframe, pedigree with columns ID - dam - sire; additional
#'   columns are ignored.
#' @param nSnp  number of SNPs to simulate.
#' @param ParMis  proportion of parents with fully missing genotype.
#' @param MAF  (optional) vector with minor allele frequency at each locus.
#'   If none specified, allele frequencies will be sampled from a uniform
#'   distribution between 0.3 and 0.5.
#' @param OutFile  filename for simulated genotypes. If NA (default), return
#'   matrix with genotypes within R.
#' @param nGen  maximum number of generations to consider (pedigree depth).
#' @param PropLQ  proportion of low-quality samples.
#' @param MisHQ  average missingness for high-quality samples, assuming a
#'    beta-disstribution with alpha = 1.
#' @param MisLQ  average missingness in low-quality samples.
#' @param ErHQ  error rate in high quality samples (defaults to 0.005).
#' @param ErLQ  error rate in low quality samples.
#' @param quiet suppress messages.
#'
#' @return A matrix with genotype data in sequoia's input format, encoded as
#'  0/1/2/-9.
#'
#' @seealso \code{\link{EstConf}}
#'
#' @author Jisca Huisman, \email{jisca.huisman@gmail.com}
#'
#' @section Disclaimer:
#' This simulation is highly simplistic and assumes that all SNPs segregate
#' completely independently, and that the SNPs are in Hardy-Weinberg
#' equilibrium in the pedigree founders. Results based on this simulated data
#'  will provide an minimum estimate of the number of SNPs required, and an
#'  optimistic estimate of pedigree reconstruction performance.
#'
#' @examples
#' data(Ped_HSg5)
#' GenoM <- SimGeno(Ped = Ped_HSg5, nSnp = 100, ParMis = 0.2)
#'
#' @export

SimGeno <- function(Ped = NULL,
                    nSnp = 400,
                    ParMis = 0.4,
                    MAF = NULL,
					          OutFile = NA,
                    nGen = 20,
                    PropLQ = 0,
                    MisHQ = 0.005,
                    MisLQ = 0.30,
                    ErHQ = 5e-4,
                    ErLQ = 5e-3,
					          quiet = FALSE)
{
  if (is.null(OutFile)) stop("'OutFile' must be filename or NA")
  if(is.null(Ped)) stop("please provide a pedigree to simulate from")

  if (interactive() & !quiet & !is.na(OutFile)) {
    if (file.exists(OutFile)) {
      ANS <- readline(prompt = paste("WARNING: ", OutFile,
                                     "will be overwritten.",
                                     "Press <N> to abort, or any other key to continue."))
    } else {
      ANS <- readline(prompt = paste("Genotypes will be written to ", OutFile,
                                     ". Press <N> to abort, or any other key to continue."))
    }
    if (substr(ANS, 1, 1) %in% c("N", "n")) stop()
  }

  if (!is.null(MAF)) {
    if (length(MAF) != nSnp)  stop("Length of 'MAF' does not equal 'nSnp'")
    Q <- as.numeric(MAF)
  } else {
    Q <- round(stats::runif(nSnp, min=0.3, max=0.5),3)
  }

  #================================
  # check & prep

  PedIN <- Ped
  for (x in 1:3) {
    Ped[,x] <- as.character(Ped[,x])
  }
  Ped <- AddParPed(Ped)
  Founders <- which(is.na(Ped[,2]) & is.na(Ped[,3]))
  NotParent <- which(!Ped[,1] %in% Ped[,2] & !Ped[,1] %in% Ped[,3])
  if (length(intersect(Founders, NotParent))>0) {
    Ped <- Ped[-intersect(Founders, NotParent),]
  }
  nInd <- nrow(Ped)

  Q[Q>0.5] <- 1-Q[Q>0.5]


  #================================
  # divide pedigree into `generations'
  # the parents of an individual must come from earlier cohorts than itself,
  # or from the founder population

  Ped$Ci <- NA  # individual's cohort
  Ped$Cd <- NA  # dam's cohort
  Ped$Cs <- NA  # sire's cohort
  Ped$Ci[is.na(Ped[,2]) & is.na(Ped[,3])] <- 0
  for (x in 0:nGen) {
    Ped$Cd[is.na(Ped$Cd) & Ped[,2] %in% Ped[which(Ped$Ci<=x), 1]] <- x
    Ped$Cs[is.na(Ped$Cs) & Ped[,3] %in% Ped[which(Ped$Ci<=x), 1]] <- x
    Ped$Ci[which(is.na(Ped$Ci) &
                   (Ped$Cd<=x | is.na(Ped[,2])) &
                   (Ped$Cs<=x | is.na(Ped[,3])))] <- x+1
  }

  Ped$nd <- sapply(Ped[,2], function(x) ifelse(is.na(x), NA, which(Ped[,1]==x)))
  Ped$ns <- sapply(Ped[,3], function(x) ifelse(is.na(x), NA, which(Ped[,1]==x)))

  #================================
  # simulate genotypes
  # founders: random draw of alleles under HWE
  # non-founders: following Mendelian inheritance

  SGeno <- matrix(NA, nSnp, nInd)

  for (i in which(Ped$Ci==0)) {
    SGeno[, i] <- stats::rbinom(nSnp, 2, prob=Q)
  }

  for (x in 1:nGen) {
    for (i in which(Ped$Ci==x)) {
      if (!is.na(Ped$nd[i]) & !is.na(Ped$ns[i])) {
        SGeno[, i] <- rowSums(cbind(stats::rbinom(nSnp, 1, prob=SGeno[,Ped$nd[i]]/2),
                                    stats::rbinom(nSnp, 1, prob=SGeno[,Ped$ns[i]]/2)))
      } else if (!is.na(Ped$nd[i])) {
        SGeno[, i] <- rowSums(cbind(stats::rbinom(nSnp, 1, prob=SGeno[,Ped$nd[i]]/2),
                                    stats::rbinom(nSnp, 1, prob=Q)))
      } else if (!is.na(Ped$ns[i])) {
        SGeno[, i] <- rowSums(cbind(stats::rbinom(nSnp, 1, prob=Q),
                                    stats::rbinom(nSnp, 1, prob=SGeno[,Ped$ns[i]]/2)))
      }
    }
  }

  #================================
  # genotyping errors & missing values:
  # simulate mixture of high-quality and low-quality samples

  iLQ <- sample.int(nInd, round(nInd*PropLQ), replace=FALSE)
  iHQ <- setdiff(1:nInd, iLQ)
  if (MisLQ>0 & PropLQ>0) {
    nmis.LQ <- round(stats::rbeta(length(iLQ), 1, 1/MisLQ-1) * nSnp)
    for (i in 1:length(iLQ)) {
      SGeno[sample.int(nSnp, nmis.LQ[i]), iLQ[i]] <- -9
    }
  }

  nmis.HQ <- round(stats::rbeta(nInd-length(iLQ), 1, 1/MisHQ-1) * nSnp)
  for (i in 1:length(iHQ)) {
    SGeno[sample.int(nSnp, nmis.HQ[i]), iHQ[i]] <- -9
  }

  ErXH <- which(stats::rbinom(length(iHQ)*nSnp, 1, prob=ErHQ*3/2)==1)
  SGeno[, iHQ][ErXH] <- sample(0:2, length(ErXH), replace=TRUE)

  if (MisLQ<1 & MisLQ>0) {
    ErXL <- which(stats::rbinom(length(iLQ)*nSnp, 1, prob=ErLQ*3/2)==1)
    SGeno[, iLQ][ErXL] <- sample(0:2, length(ErXL), replace=TRUE)
  }

  #================================
  NotSampled <- which(!Ped[,1] %in% as.character(PedIN[,1]))
  if (ParMis>0) {
    IsParent <- which(Ped[,1] %in% c(Ped[,2], Ped[,3]))
    if (length(IsParent)>0) {
      NotSampled <- c(NotSampled,
                      sample(IsParent, round(length(IsParent)*ParMis),
                             replace=FALSE))
    }
  }
  if (length(NotSampled)>0) {
    SGeno <- SGeno[, -NotSampled]
    colnames(SGeno) <- Ped[-NotSampled, 1]
  } else {
    colnames(SGeno) <- Ped[, 1]
  }

  #================================
  # output
  if (!is.na(OutFile)) {
    utils::write.table(t(SGeno), OutFile, quote=FALSE, col.names=FALSE)
  } else return(t(SGeno))
}
