
#=======================================================
#' Simulated genotypes
#'
#' Simulate SNP genotype data from a pedigree, with optional missingess and
#'  errors.
#'
#' Provide either a pedigree dataframe, or the name of a text file containing
#'   the pedigree. Please ensure the pedigree is a valid pedigree, for example
#'   by first running fixPedigree() from library Pedantics.
#'
#' @param Ped  Dataframe, pedigree with columns ID - dam - sire; additional
#'   columns are ignored.
#' @param PedFile  name of a pedigree file.
#' @param MafFile  (optional) file with minor allele frequency at each locus.
#'   If none specified, allele frequencies will be sampled from a uniform
#'   distribution between 0.3 and 0.5.
#' @param OutFile  filename for simulated genotypes. If NA, return
#'   matrix with genotypes in console.
#' @param nSnp  number of SNPs to simulate.
#' @param nGen  maximum number of generations to consider.
#' @param PropLQ  proportion of low-quality samples.
#' @param MisHQ  average missingness for high-quality samples, assuming a
#'    beta-disstribution with alpha = 1.
#' @param MisLQ  average missingness in low-quality samples.
#' @param ParMis  proportion of parents with fully missing genotype.
#' @param ErHQ  error rate in high quality samples (defaults to 0.005).
#' @param ErLQ  error rate in low quality samples.
#' @param quiet suppress messages.
#'
#' @return A matrix with genotype data encoded as 0/1/2/-9, in a format
#'   similar to that generated by Plink's --recodeA option. The matrix is
#'   written to a specified textfile unless OutFile=NA.
#'
#' @author Jisca Huisman, \email{jisca.huisman@gmail.com}
#'
#' @examples
#' data(Ped_HSg5)
#' GenoM <- SimGeno(Ped = Ped_HSg5, nSnp = 100, ParMis = 0.2)
#'
#' @export

SimGeno <- function(Ped = NULL,
                    PedFile = NULL,
                    MafFile = NULL,
					          OutFile = NA,
                    nSnp = 400,
                    nGen = 10,
                    PropLQ = 0,
                    MisHQ = 0.005,
                    MisLQ = 0.30,
                    ParMis = 0.2,
                    ErHQ = 5e-4,
                    ErLQ = 5e-3,
					          quiet = FALSE)
{
  if (is.null(OutFile)) stop("'OutFile' must be filename or NA")
  if (interactive() & !quiet & !is.na(OutFile)) {
    if (file.exists(OutFile)) {
      ANS <- readline(prompt = paste("WARNING: ", OutFile,
                                     "will be overwritten.",
                                     "Press <N> to abort, or any other key to continue."))
    } else {
      ANS <- readline(prompt = paste("Genotypes will be written to ", OutFile,
                                     ". Press <N> to abort, or any other key to continue."))
    }
    if (substr(ANS, 1, 1) %in% c("N", "n")) stop()
  }

  if(!is.null(PedFile) & is.null(Ped)) {
    Ped <- ReadTable(PedFile)
  } else if (!(is.null(PedFile) & !is.null(Ped))) {
    stop("please provide either 'Ped' OR 'PedFile'")
  }

  if (!is.null(MafFile)) {
    Q <- as.numeric(t(ReadTable(MafFile)))
    nSnp <- length(Q)
  } else {
    Q <- round(stats::runif(nSnp, min=0.3, max=0.5),3)
  }

  #================================
  # check & prep

  PedIN <- Ped
  Ped <- AddParPed(Ped)
  for (x in 1:3) {
    Ped[,x] <- as.character(Ped[,x])
  }
  Founders <- which(is.na(Ped[,2]) & is.na(Ped[,3]))
  NotParent <- which(!Ped[,1] %in% Ped[,2] & !Ped[,1] %in% Ped[,3])
  if (length(intersect(Founders, NotParent))>0) {
    Ped <- Ped[-intersect(Founders, NotParent),]
  }
  nInd <- nrow(Ped)

  Q[Q>0.5] <- 1-Q[Q>0.5]


  #================================
  # divide pedigree into `generations'
  # the parents of an individual must come from earlier cohorts than itself,
  # or from the founder population

  Ped$Ci <- NA  # individual's cohort
  Ped$Cd <- NA  # dam's cohort
  Ped$Cs <- NA  # sire's cohort
  Ped$Ci[is.na(Ped[,2]) & is.na(Ped[,3])] <- 0
  for (x in 0:nGen) {
    Ped$Cd[is.na(Ped$Cd) & Ped[,2] %in% Ped[which(Ped$Ci<=x), 1]] <- x
    Ped$Cs[is.na(Ped$Cs) & Ped[,3] %in% Ped[which(Ped$Ci<=x), 1]] <- x
    Ped$Ci[which(is.na(Ped$Ci) &
                   (Ped$Cd<=x | is.na(Ped[,2])) &
                   (Ped$Cs<=x | is.na(Ped[,3])))] <- x+1
  }

  Ped$nd <- sapply(Ped[,2], function(x) ifelse(is.na(x), NA, which(Ped[,1]==x)))
  Ped$ns <- sapply(Ped[,3], function(x) ifelse(is.na(x), NA, which(Ped[,1]==x)))

  #================================
  # simulate genotypes
  # founders: random draw of alleles under HWE
  # non-founders: following Mendelian inheritance

  SGeno <- matrix(NA, nSnp, nInd)

  for (i in which(Ped$Ci==0)) {
    SGeno[, i] <- stats::rbinom(nSnp, 2, prob=Q)
  }

  for (x in 1:nGen) {
    for (i in which(Ped$Ci==x)) {
      if (!is.na(Ped$nd[i]) & !is.na(Ped$ns[i])) {
        SGeno[, i] <- rowSums(cbind(stats::rbinom(nSnp, 1, prob=SGeno[,Ped$nd[i]]/2),
                                    stats::rbinom(nSnp, 1, prob=SGeno[,Ped$ns[i]]/2)))
      } else if (!is.na(Ped$nd[i])) {
        SGeno[, i] <- rowSums(cbind(stats::rbinom(nSnp, 1, prob=SGeno[,Ped$nd[i]]/2),
                                    stats::rbinom(nSnp, 1, prob=Q)))
      } else if (!is.na(Ped$ns[i])) {
        SGeno[, i] <- rowSums(cbind(stats::rbinom(nSnp, 1, prob=Q),
                                    stats::rbinom(nSnp, 1, prob=SGeno[,Ped$ns[i]]/2)))
      }
    }
  }

  #================================
  # genotyping errors & missing values:
  # simulate mixture of high-quality and low-quality samples

  iLQ <- sample.int(nInd, round(nInd*PropLQ), replace=FALSE)
  iHQ <- setdiff(1:nInd, iLQ)
  if (MisLQ>0 & PropLQ>0) {
    nmis.LQ <- round(stats::rbeta(length(iLQ), 1, 1/MisLQ-1) * nSnp)
    for (i in 1:length(iLQ)) {
      SGeno[sample.int(nSnp, nmis.LQ[i]), iLQ[i]] <- -9
    }
  }
  nmis.HQ <- round(stats::rbeta(nInd-length(iLQ), 1, 1/MisHQ-1) * nSnp)


  for (i in 1:length(iHQ)) {
    SGeno[sample.int(nSnp, nmis.HQ[i]), iHQ[i]] <- -9
  }

	if (MisHQ>0) {
	  nEr.HQ <- round(stats::runif(nInd-length(iLQ), 1, 1/MisHQ-1) * nSnp)
	} else {
	  nEr.HQ <- 0
	}
  ErXH <- which(stats::rbinom(length(iHQ)*nSnp, 1, prob=ErHQ)==1)
  SGeno[, iHQ][ErXH] <- sample(0:2, length(ErXH), replace=TRUE)

  if (MisLQ<1 & MisLQ>0) {
    nEr.LQ <- round(stats::runif(length(iLQ), 1, 1/MisLQ-1) * nSnp)
    ErXL <- which(stats::rbinom(length(iLQ)*nSnp, 1, prob=ErLQ)==1)
    SGeno[, iLQ][ErXL] <- sample(0:2, length(ErXL), replace=TRUE)
  }

  #================================
  NotSampled <- which(!Ped[,1] %in% as.character(PedIN[,1]))
  if (ParMis>0) {
    IsParent <- which(Ped[,1] %in% c(Ped[,2], Ped[,3]))
    if (length(IsParent)>0) {
      NotSampled <- c(NotSampled,
                      sample(IsParent, round(length(IsParent)*ParMis),
                             replace=FALSE))
    }
  }
  if (length(NotSampled)>0) {
    SGeno <- SGeno[, -NotSampled]
    colnames(SGeno) <- Ped[-NotSampled, 1]
  } else {
    colnames(SGeno) <- Ped[, 1]
  }

  #================================
  # output
  if (!is.na(OutFile)) {
    utils::write.table(t(SGeno), OutFile, quote=FALSE, col.names=FALSE)
  } else return(t(SGeno))
}


#============================================================================
#============================================================================
# Utils functions for comparisons


#' Compare two vectors
#'
#' Compare a vector with inferred sibs to a vector of `true' sibs
#'
#' @param  Infrd  vector of inferred sibs
#' @param  Simld  vector of true sibs
#' @param  SNPd character vector with IDs of genotyped individuals
#'
#' @return a named numeric vector of length 4, with the total length of Simld,
#'   the length of the intersect of the two vectors, the number occurring in
#'   Infrd but not Simld ('err'), and the number occuring in Simld but not
#'   Infrd ('missed').

Vcomp <- function(Infrd, Simld, SNPd)
{
  total <- length(Simld)
  both <- length(intersect(Infrd, Simld))
  err <- length(setdiff(Infrd[Infrd %in% SNPd], Simld))
  missed <- length(setdiff(Simld, Infrd))
  c(total=total, both=both, err=err, missed=missed)
}


#======================================================================
#' Find the closest matching inferred sibship to a true sibship
#'
#' @param SimX  a vector with the IDs in the true sibship
#' @param Infrd  a list of vectors with the IDs in the inferred sibships
#' @param SNPd character vector with IDs of genotyped individuals
#'
#' @return a named numeric vector with the number of matches ('NumMatch'),
#'   the position of the best match ('Best'), the inferred sibship size of
#'   this best match ('Tot'), the number of matching IDs ('OK'), and the
#'   number of mismatches ('err').

SibMatch <- function(SimX, Infrd, SNPd)
{
  VC <- sapply(Infrd, Vcomp, SimX, SNPd)
  mtch <- which(VC[2,]>0)
  if (length(mtch)>1) {
    mtch <- which.max(VC[2,])[1]  # which inferred sibship has most members of true sibship
  }
  if (length(mtch)==1) {
    if (VC["err", mtch] > VC["both", mtch]) {
      mtch <- NULL
    }
  }
  if (length(mtch)==1) {
    OUT <- data.frame(NumMatch = sum(VC[2,]>0),
             Best = colnames(VC)[mtch],  # as.numeric(substr(colnames(VC)[mtch],3,5)),
             Tot = VC["total", mtch],
             OK = VC["both", mtch],
             Er = VC["err", mtch], stringsAsFactors=FALSE)
  } else {
    OUT <- data.frame(NumMatch=0, Best=NA,  Tot= length(SimX), OK = NA, Er = NA)
  }
  OUT
}


#======================================================================
# transpose matrix to data.frame  (sapply returns wrong format)

# adapted from Hmisc::all.is.numeric
all.is.numeric <- function (x, what = c("test", "vector"))
{
  what <- match.arg(what)
  isnum <- suppressWarnings(!any(is.na(as.numeric(stats::na.exclude(x)))))
  if (what == "test")
    isnum
  else if (isnum)
    as.numeric(x)
  else x
}

tdf <- function(M)
{
  DF <- matrix(NA, ncol(M), nrow(M))
  for (r in 1:nrow(M)) {
    DF[, r] <- unlist(M[r, ])
  }
  DF <- as.data.frame(DF, stringsAsFactors = FALSE, row.names=colnames(M))
  for (r in 1:ncol(DF)) {
    if (all.is.numeric(DF[, r]))  DF[, r] <- as.numeric(DF[, r])
  }
  names(DF) <- rownames(M)
  DF
}

#============================================================================
#============================================================================
#' Compare two Pedigrees
#'
#' Compare an inferred pedigree to a previous or simulated pedigree,
#' including comparison of sibship clusters and sibship grandparents.
#'
#' Provide either a dataframe of the pedigree, or the filename of a text file.
#' Both pedigrees are assumed to have as first three columns id, dam and sire,
#' in that order.
#'
#' @param  Ped1 original pedigree, dataframe with columns id-dam-sire; only the
#'   first 3 columns will be used.
#' @param  PedFile1  filename of original pedigree; either provide Ped1, or
#'   specify PedFile1.
#' @param  Ped2 infered pedigree, e.g. SeqOUT$Pedigree, with columns
#'   id-dam-sire.
#' @param  sep1  the field separator character in 'PedFile1'.
#' @param  DumPrefix  the dummy prefix used. If NULL, the intersect between
#'   the IDs in Pedigrees 1 and 2 is taken as the list of genotyped
#'   individuals. Otherwise, all individuals in Pedigree 2 with IDs
#'   not starting with the Dummy prefix are taken as genotyped.
#'
#' @return A list with
#' \item{Counts}{A 7 x 5 x 2 named numeric array with the number of matches and
#'   mismatches}
#' \item{MergedPed}{A side-by-side comparison of the two pedigrees}
#' \item{ConsensusPed}{A consensus pedigree, with Pedigree 2 taking priority
#'   over Pedigree 1}
#'
#' The first dimension of 'Counts' denotes the following categories:
#' \item{GG}{Genotyped individual, assigned a genotyped parent in either
#'   pedigree}
#' \item{GD}{Genotyped individual, assigned a dummy parent, or at least 1
#'   genotyped sibling or a genotyped grandparent in Pedigree 1)}
#' \item{GT}{Genotyped individual, total}
#' \item{DG}{Dummy individual, assigned a genotyped parent (i.e., grandparent
#'    of the sibship in Pedigree 2)}
#' \item{DD}{Dummy individual, assigned a dummy parent (i.e., avuncular
#'   relationship between sibships in Pedigree 2)}
#' \item{DT}{Dummy total}
#' \item{TT}{Total total, includes all genotyped individuals, plus
#'   non-genotyped individuals in Pedigree 1, plus non-replaced dummy
#'   individuals (see below) in Pedigree 2}
#'
#' The dummy individual count includes all non-genotyped individuals in
#' Pedigree 1 who have, according to either pedigree, at least 2 genotyped
#' offspring, or at least one genotyped offspring and a genotyped parent.
#'
#' The second dimension of 'Counts' gives the outcomes:
#' \item{Total}{The total number of individuals with a parent assigned in
#'    either or both pedigrees}
#' \item{Match}{The same parent is assigned in both pedigrees (non-missing).
#'      For dummy parents, it is considered a match if the inferred sibship
#'      which contains the most offspring of a non-genotyped parent, consists
#'      for more than half of this individual's offspring.}
#' \item{Mismatch}{Different parents assigned in the two pedigrees. When
#'    a sibship according to Pedigree 1 is split over two sibships in Pedigree
#'    2, the smaller fraction is included in the count here.}
#' \item{P1only}{Parent in Pedigree 1 but not 2; includes non-assignable
#'     parents (e.g. not genotyped and no genotyped offspring).}
#' \item{P2only}{Parent in Pedigree 2 but not 1.}
#'
#' The third dimension 'Counts' separates between maternal and paternal
#' assignments, where e.g. paternal 'DR' is the assignment of fathers to both
#' maternal and paternal sibships.
#'
#' 'MergedPed' provides the following columns:
#' \item{id}{All ids in both Pedigree 1 and 2}
#' \item{dam.1, sire.1}{parents in Pedigree 1}
#' \item{dam.2, sire.2}{parents in Pedigree 2}
#' \item{id.r, dam.r, sire.r}{when in Pedigree 2 a dummy parent is assigned,
#'   this column gives the best-matching non-genotyped individual according to
#'   Pedigree 1, or "nomatch". If a sibship in Pedigree 1 is divided over 2
#'   sibships in Pedigree 2, the smaller one will be denoted as "nomatch"}
#'
#' In 'ConsensusPed', the priority used is parent.r (if not "nomatch) >
#'   parent.2 > parent.1. The columns 'dam.cat' and 'sire.cat' give a 2-letter
#'   code denoting whether the focal individual (first letter) and its assigned
#'   parent (2nd letter) are
#'   \item{G}{Genotyped}
#'   \item{D}{Dummy individual (in Pedigree 1)}
#'   \item{R}{Dummy individual replaced by best matching non-genotyped individual}
#'   \item{U}{Ungenotyped (in Pedigree 2, with no dummy match)}
#'
#' @author Jisca Huisman, \email{jisca.huisman@gmail.com}
#'
#' @seealso \code{\link{DyadCompare}, \link{sequoia}}
#'
#' @examples
#' \dontrun{
#' data(Ped_HSg5, SimGeno_example, LH_HSg5, package="sequoia")
#' SeqOUT <- sequoia(GenoM = SimGeno_example, LifeHistData = LH_HSg5)
#' compare <- PedCompare(Ped1=Ped_HSg5, Ped2=SeqOUT$Pedigree)
#' compare$Counts   # 2 mismatches, due to simulated genotyping errors
#' head(compare$MergedPed)
#'
#' PedM <- compare$MergedPed
#' # find mismatching mothers:
#' with(PedM, PedM[which(dam.1!=dam.2 & dam.1!=dam.r),])
#'
#' # find mothers in Ped1 which are genotyped but not assigned in Ped2:
#' with(PedM, PedM[which(is.na(dam.2) & !is.na(dam.1) &
#'                        !is.na(id) & dam.1 %in% id),])
#' }
#' @export

PedCompare <- function(Ped1 = NULL,
                       PedFile1 = NULL,
                       Ped2 = NULL,
                       sep1 = "\t",
                       DumPrefix = c("F0", "M0"))
{
  if(sum(is.null(Ped1), is.null(PedFile1))!=1) stop("provide either 'Ped1' OR 'PedFile1'")
  if(is.null(Ped2)) stop("provide 'Ped2'")
  if(is.null(Ped1)) Ped1 <- ReadTable(PedFile1, sep=sep1)[,1:3]
  names(Ped1) <- c("id", "dam.1", "sire.1")
  names(Ped2)[1:3] <- c("id", "dam.2", "sire.2")
  for (i in 1:3) Ped1[, i] <- as.character(Ped1[, i])
  for (i in 1:3) Ped2[, i] <- as.character(Ped2[, i])
  if (!any(Ped2$id %in% Ped1$id))  stop("no common IDs in Ped1 and Ped2")
  Ped1 <- AddParPed(Ped1)
  Ped2 <- AddParPed(Ped2)
  if (is.null(DumPrefix)) {
    SNPd <- intersect(Ped2$id, Ped1$id)
  } else {
    DPnc <- nchar(DumPrefix)
    SNPd <- Ped2$id[substr(Ped2$id,1,DPnc[1])!=DumPrefix[1] &
                      substr(Ped2$id,1,DPnc[2])!=DumPrefix[2]]
  }
  PedX <- merge(Ped1, Ped2[Ped2$id %in% SNPd, ], all.y=TRUE)
  DumPed <- Ped2[!Ped2$id %in% SNPd, ]
  Dummies <- list(DumPed$id[DumPed$id %in% Ped2$dam],
                  DumPed$id[DumPed$id %in% Ped2$sire])
  Par <- matrix(c("dam.1", "dam.2", "dam.r",
                  "sire.1", "sire.2", "sire.r"), 2,byrow=TRUE)

  #===  match dummies to non-genotyped parents  ===
  Sibs1 <- list()
  SibScore <- list()
  DumReal <- list()
  for (p in 1:2) {
    Sibs1[[p]] <- split(PedX$id, PedX[, Par[p,1]])
    Sibs1[[p]] <- Sibs1[[p]][!names(Sibs1[[p]]) %in% SNPd]
  }
  NoDummies <- with(PedX, all(dam.2 %in% SNPd | is.na(dam.2)) &
                      all(sire.2 %in% SNPd | is.na(sire.2)))
  if (!NoDummies) {
    for (p in 1:2) {
      Sibs2 <- split(PedX$id, PedX[, Par[p,2]])
      Sibs2 <- Sibs2[!names(Sibs2) %in% SNPd]

      if (length(Sibs1[[p]])>0) {
        SibScore[[p]] <- tdf(sapply(Sibs1[[p]], SibMatch, Sibs2, SNPd))  # slow!
        if (length(stats::na.exclude(SibScore[[p]][, "Best"])) >
            length(unique(stats::na.exclude(SibScore[[p]][, "Best"])))) {
          SibScore[[p]] <- SibScore[[p]][order(SibScore[[p]][, "OK"],
                                               decreasing=TRUE), ]
          dups <- duplicated(SibScore[[p]][, "Best"], incomparables=NA)
          BestS.d <- unique(SibScore[[p]]$Best[dups])
          if (sum(dups) > 1) {
            SibScore[[p]][dups, "Er"] <- rowSums(SibScore[[p]][dups, c("OK", "Er")])
          } else {
            SibScore[[p]][dups, "Er"] <- sum(SibScore[[p]][dups, c("OK", "Er")])
          }
          SibScore[[p]][dups, c("Best", "OK")] <- NA
          for (s in 1:length(BestS.d)) {
            tmp <- SibScore[[p]][which(SibScore[[p]]$Best==BestS.d[s]), ]
            if (tmp$OK==1 & tmp$Er>=1) {
              SibScore[[p]][which(SibScore[[p]]$Best==BestS.d[s]),
                            c("Best", "OK")] <- NA
            }
          }
        }
        tmp <- SibScore[[p]][!is.na(SibScore[[p]][, "Best"]), ]
        DumReal[[p]] <- as.data.frame(cbind(real = rownames(tmp),
                              dummy = tmp$Best), stringsAsFactors=FALSE)
        DumReal[[p]] <- merge(DumReal[[p]], DumPed[DumPed$id %in% Dummies[[p]],],
                              by.x="dummy", by.y="id", all=TRUE)[,c("real", "dummy")]
        DumReal[[p]][,"real"][is.na(DumReal[[p]][,"real"])] <- "nomatch"
      } else {
        SibScore[[p]] <- NA
        DumReal[[p]] <- data.frame(real=NA, dummy=NA)
      }
    }
    for (p in 1:2) {
      PedX <- merge(PedX, stats::setNames(DumReal[[p]], c(Par[p,3], Par[p,2])),
                    all.x=TRUE)
      DumPed <- merge(DumPed, DumReal[[p]], by.x="id", by.y="dummy", all.x=TRUE,
                      suffixes = c(".x",".y"))
    }
    DumPed$id.r <- apply(DumPed[,c("real.x", "real.y")], 1, function(x)
      ifelse(all(is.na(x)), NA, min(x, na.rm=T)))
    DumPed <- DumPed[, c("id", "dam.2", "sire.2", "id.r")]
    DumPed <- merge(DumPed, Ped1, all.x=TRUE, by.x="id.r", by.y="id",
                    suffixes=c(".2", ".1"))
    for (p in 1:2) {
      DumPed <- merge(DumPed, stats::setNames(DumReal[[p]], c(Par[p,3], Par[p,2])),
                      all.x=TRUE)
    }
  } else {
    PedX$dam.r <- NA
    PedX$sire.r <- NA
  }

  #===  Combined pedigree  ===
  PedX$id.r <- PedX$id
  PedY <- merge(PedX, DumPed, all=TRUE, sort=FALSE)  # NA's for id.r = "nomatch"
  PedY <- merge(PedY, stats::setNames(Ped1, c("id.r", "dam.1", "sire.1" )),
                all=TRUE, sort=FALSE)

  Founders <- list()
  NGpar <- list()
  for (p in 1:2) {
    Founders[[p]] <- unique(unlist(PedY[is.na(PedY[,Par[p,1]]) & is.na(PedY[,Par[p,2]]),
                                 c("id", "id.r")]))
    Sibs1[[p]] <- Sibs1[[p]][sapply(Sibs1[[p]],length)>1  |
                               !names(Sibs1[[p]]) %in% Founders[[p]]]
    NGpar[[p]] <- names(Sibs1[[p]])  # potential dummies
  }
  GD <- list(G = SNPd, D = c(unlist(NGpar), unlist(Dummies)) )

  Score <- array(0, dim=c(7, 5, 2),
                 dimnames=list(c("GG", "GD", "GT", "DG", "DD", "DT", "TT"),
            c("Total", "Match", "Mismatch", "P1only", "P2only"),
            c("dam", "sire")))
  ID <- c(G="id", D="id.r")
  for (p in 1:2) {
    for (i in c("G", "D")) {  # focal
      for (j in c("G", "D")) {  # parent
        if ((i=="D" | j=="D") & length(DumReal)==0) break
        ij <- paste0(i,j)
        k <- ifelse(j=="G", 2, 3)
        PedTmp <- PedY[which(PedY[,ID[i]] %in% GD[[i]] &
                          (PedY[,Par[p,1]] %in% GD[[j]] | PedY[,Par[p,2]] %in% GD[[j]])), ]
        Score[ij, "Total", p] <- nrow(PedTmp)
#        Score[ij, "MaxMatch", p] <- sum(!is.na(PedTmp[,Par[p,1]]) & PedTmp[,Par[p,1]]
#                                        %in% GD[[j]])
        Score[ij, "Match", p] <- sum(PedTmp[,Par[p,1]] == PedTmp[,Par[p,k]], na.rm=T)
        Score[ij, "P1only", p] <- sum(!is.na(PedTmp[,Par[p,1]]) & PedTmp[,Par[p,1]]
                                         %in% GD[[j]] & is.na(PedTmp[,Par[p,2]]))
        Score[ij, "P2only", p] <- sum(!is.na(PedTmp[,Par[p,2]]) & PedTmp[,Par[p,2]]
                                         %in% GD[[j]] & is.na(PedTmp[,Par[p,1]]))
        Score[ij, "Mismatch", p] <- sum(PedTmp[,Par[p,1]] != PedTmp[,Par[p,k]], na.rm=T)
      }
    }
  }
  for (p in 1:2) {  # Totals
    for (i in c("G", "D", "T")) {
      if (i=="D" & length(DumReal)==0) break
      ij <- paste0(i,"T")
      if (i != "T") {
        PedTmp <- PedY[which(PedY[,ID[i]] %in% GD[[i]] &
                            (!is.na(PedY[,Par[p,1]]) | !is.na(PedY[,Par[p,2]]))), ]
      } else {
        PedTmp <- PedY[!is.na(PedY[,Par[p,1]]) | !is.na(PedY[,Par[p,2]]), ]
      }
      Score[ij, "Total", p] <- nrow(PedTmp)
#      Score[ij, "MaxMatch", p] <- sum(!is.na(PedTmp[,Par[p,1]]))
      Score[ij, "P1only", p] <- sum(!is.na(PedTmp[,Par[p,1]]) & is.na(PedTmp[,Par[p,2]]))
      Score[ij, "P2only", p] <- sum(is.na(PedTmp[,Par[p,1]]) & !is.na(PedTmp[,Par[p,2]]))
      Score[ij, "Match", p] <- sum(PedTmp[,Par[p,1]] == PedTmp[,Par[p,2]] |
                                        PedTmp[,Par[p,1]] == PedTmp[,Par[p,3]], na.rm=T)
      Score[ij, "Mismatch", p] <- sum(PedTmp[,Par[p,1]] != PedTmp[,Par[p,2]] &
                  (PedTmp[,Par[p,1]] != PedTmp[,Par[p,3]] | is.na(PedTmp[,Par[p,3]])), na.rm=T)
    }
  }
  PedY <- PedY[, c("id", "dam.1", "sire.1", "dam.2", "sire.2",
                   "id.r", "dam.r", "sire.r")]
  for (p in 1:2) {
    ParCat <- ifelse(is.na(PedY[,Par[p,2]]) & !is.na(PedY[,Par[p,1]]), "U",
                ifelse(PedY$id %in% SNPd,
                 ifelse(PedY[,Par[p,2]]  %in% SNPd, "GG",
                  ifelse(PedY[,Par[p,2]]  %in% Dummies[[p]],
                    ifelse(PedY[,Par[p,3]]=="nomatch", "GD", "GR"),
                    NA)),
                ifelse(PedY$id %in% unlist(Dummies),
                  ifelse(PedY$id.r == "nomatch",
                    ifelse(PedY[,Par[p,2]]  %in% SNPd, "DG",
                      ifelse(PedY[,Par[p,3]]=="nomatch", "DD", "DR")),
                    ifelse(PedY[,Par[p,2]]  %in% SNPd, "RG",
                      ifelse(PedY[,Par[p,3]]=="nomatch", "RD", "RR"))),
                  NA)))
    if (p==1)  PedY$dam.cat <- ParCat
    if (p==2)  PedY$sire.cat <- ParCat
  }
  PedY$id.r[which(PedY$id==PedY$id.r)] <- NA

  PedC <- with(PedY, data.frame(
    id = ifelse(!is.na(id.r) & id.r!="nomatch", id.r, id),
    dam = ifelse(!is.na(dam.r) & dam.r!="nomatch", dam.r,
                 ifelse(!is.na(dam.2), dam.2, dam.1)),
    sire = ifelse(!is.na(sire.r) & sire.r!="nomatch", sire.r,
                  ifelse(!is.na(sire.2), sire.2, sire.1)),
    dam.cat = dam.cat,
    sire.cat = sire.cat))


  list(Counts = Score, MergedPed = PedY[, 1:8], ConsensusPed = PedC)
}


#============================================================================
#============================================================================
#' Find siblings

#' @param x  an ID
#' @param Ped  a pedigree with columns id - dam - sire
#'
#' @return The individuals which are full or half siblings to x, as a
#'   three-column matrix with column names id1 (x), id2 (the siblings), and
#'   RC (the relatedness category, 'FS' or 'HS').

rc <- function(x, Ped) {
  RelCat <- with(Ped,
                 ifelse(id == id[x], "S",
                        ifelse(eqv(dam[x],dam) & eqv(sire[x], sire), "FS",
                               ifelse(eqv(dam[x],dam) |  eqv(sire[x], sire), "HS",
                                      NA))))
  out <- cbind(id1 = Ped$id[x],
               id2 = Ped$id[!is.na(RelCat)],
               RC = stats::na.exclude(RelCat))
  out <- out[out[,"RC"] != "S", ]
  out
}


#============================================================================
#' Compare dyads
#'
#' Count the number of half and full sibling pairs correctly and incorrectly
#' assigned
#'
#' @param  Ped1 Original pedigree, dataframe with 3 columns: id-dam-sire
#' @param  PedFile1  filename of original pedigree, only the first 3 columns
#'   will be used. Provide EITHER Ped1 OR PedFile1.
#' @param  Ped2 Infered pedigree, e.g. SeqOUT$Pedigree
#' @param  PedFile2  filename of infered pedigree
#' @param  sep1  the field separator character in 'PedFile1'
#' @param  sep2  the field separator character in 'PedFile2'
#'
#' @return A 3x3 table with the number of pairs assigned as full siblings (FS),
#'   half siblings (HS) or unrelated (U) in the two pedigrees.
#'
#' @seealso \code{\link{PedCompare}}
#' @export

DyadCompare <- function(Ped1 = NULL,
                       PedFile1 = NULL,
                       Ped2 = NULL,
                       PedFile2 = NULL,
                       sep1 = "\t",
                       sep2 = "")
{
  if(sum(is.null(Ped1), is.null(PedFile1))!=1) stop("provide either 'Ped1' OR 'PedFile1'")
  if(sum(is.null(Ped2), is.null(PedFile2))!=1) stop("provide either 'Ped2' OR 'PedFile2'")
  if(is.null(Ped1)) Ped1 <- ReadTable(PedFile1, sep=sep1)[,1:3]
  if(is.null(Ped2)) Ped2 <- ReadTable(PedFile2, sep=sep2)[,1:3]
  names(Ped1) <- c("id", "dam", "sire")
  names(Ped2)[1:3] <- c("id", "dam", "sire")
  for (i in 1:3) Ped1[, i] <- as.character(Ped1[, i])
  for (i in 1:3) Ped2[, i] <- as.character(Ped2[, i])
  if (!any(Ped2$id %in% Ped1$id))  stop("no common IDs in Ped1 and Ped2")
  Ped1 <- AddParPed(Ped1)
  Ped2 <- AddParPed(Ped2)

  # note: each pair is counted double
  RCT <- matrix(NA, 0, 3)
  for (x in 1:nrow(Ped1)) {
    RCT <- rbind(RCT, rc(x, Ped1))
  }

  RCI <- matrix(NA, 0, 3)
  for (x in 1:nrow(Ped2)) {
    RCI <- rbind(RCI, rc(x, Ped2))
  }

  RCTI <- merge(as.data.frame(RCT, stringsAsFactors=FALSE),
                as.data.frame(RCI, stringsAsFactors=FALSE),
                by=c("id1", "id2"), all=TRUE, suffixes = c(".1", ".2"))
  RCTI <- RCTI[RCTI$id1 %in% Ped1$id & RCTI$id2 %in% Ped1$id &
                 RCTI$id1 %in% Ped2$id & RCTI$id2 %in% Ped2$id, ]
  RCTI$RC.1[is.na(RCTI$RC.1)] <- "U"
  RCTI$RC.2[is.na(RCTI$RC.2)] <- "U"
  RCTI$RC.1 <- factor(RCTI$RC.1, levels=c("FS", "HS", "U"))
  RCTI$RC.2 <- factor(RCTI$RC.2, levels=c("FS", "HS", "U"))

  tbl <- with(RCTI, Table(RC.1, RC.2))/2  # pairs included double
  tbl["U", "U"] <- nrow(Ped2) * (nrow(Ped2)-1)/2 - sum(tbl)
  tbl
  #  sweep(tbl, 1, rowSums(tbl), "/")
}


#============================================================================
#============================================================================
