% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/COMBO_EM.R
\name{COMBO_EM}
\alias{COMBO_EM}
\title{EM-Algorithm Estimation of the Binary Outcome Misclassification Model}
\usage{
COMBO_EM(
  Ystar,
  x_matrix,
  z_matrix,
  beta_start,
  gamma_start,
  tolerance = 1e-07,
  max_em_iterations = 1500,
  em_method = "squarem"
)
}
\arguments{
\item{Ystar}{A numeric vector of indicator variables (1, 2) for the observed
outcome \code{Y*}. There should be no \code{NA} terms. The reference category is 2.}

\item{x_matrix}{A numeric matrix of covariates in the true outcome mechanism.
\code{x_matrix} should not contain an intercept and no values should be \code{NA}.}

\item{z_matrix}{A numeric matrix of covariates in the observation mechanism.
\code{z_matrix} should not contain an intercept and no values should be \code{NA}.}

\item{beta_start}{A numeric vector or column matrix of starting values for the \eqn{\beta}
parameters in the true outcome mechanism. The number of elements in \code{beta_start}
should be equal to the number of columns of \code{x_matrix} plus 1.}

\item{gamma_start}{A numeric vector or matrix of starting values for the \eqn{\gamma}
parameters in the observation mechanism. In matrix form, the rows of the
\code{gamma_start} matrix correspond to the parameters for the \code{Y* = 1}
observed outcome, so the number of rows should be equal to the number of
columns of \code{z_matrix} plus 1, and the columns correspond to the true
outcome categories \eqn{Y \in \{1, 2\}}. A numeric vector for \code{gamma_start} is
obtained by concatenating the columns of the gamma matrix, i.e. \code{gamma_start <- c(gamma_matrix)}.}

\item{tolerance}{A numeric value specifying when to stop estimation, based on
the difference of subsequent log-likelihood estimates. The default is \code{1e-7}.}

\item{max_em_iterations}{An integer specifying the maximum number of
iterations of the EM algorithm. The default is \code{1500}.}

\item{em_method}{A character string specifying which EM algorithm will be applied.
Options are \code{"em"}, \code{"squarem"}, or \code{"pem"}. The default and
recommended option is \code{"squarem"}.}
}
\value{
\code{COMBO_EM} returns a data frame containing four columns. The first
column, \code{Parameter}, represents a unique parameter value for each row.
The next column contains the parameter \code{Estimates}, followed by the standard
error estimates, \code{SE}. The final column, \code{Convergence}, reports
whether or not the algorithm converged for a given parameter estimate.

Estimates are provided for the binary misclassification model, as well as two
additional cases. The "SAMBA" parameter estimates are from the R Package,
SAMBA, which uses the EM algorithm to estimate a binary outcome misclassification
model that assumes there is perfect specificity. The "PSens" parameter estimates
are estimated using the EM algorithm for the binary outcome misclassification
model that assumes there is perfect sensitivity. The "Naive" parameter
estimates are from a simple logistic regression \code{Y* ~ X}.
}
\description{
Jointly estimate \eqn{\beta} and \eqn{\gamma} parameters from the true outcome
and observation mechanisms, respectively, in a binary outcome misclassification
model.
}
\examples{
\donttest{
# Simulate data from a binary outcome misclassification model and fit it
# with COMBO_EM.
set.seed(123)
n <- 1000
x_mu <- 0
x_sigma <- 1
z_shape <- 1

# True parameter values. The columns of true_gamma correspond to the true
# outcome categories Y = 1 and Y = 2.
true_beta <- matrix(c(1, -2), ncol = 1)
true_gamma <- matrix(c(.5, 1, -.5, -1), nrow = 2, byrow = FALSE)

# x_matrix and z_matrix hold the covariates without an intercept and are
# passed to COMBO_EM. X and Z add an intercept column and are only used
# to simulate the data.
x_matrix <- matrix(rnorm(n, x_mu, x_sigma), ncol = 1)
X <- matrix(c(rep(1, n), x_matrix[,1]), ncol = 2, byrow = FALSE)
z_matrix <- matrix(rgamma(n, z_shape), ncol = 1)
Z <- matrix(c(rep(1, n), z_matrix[,1]), ncol = 2, byrow = FALSE)

# True outcome mechanism: column 1 is P(Y = 1 | X), column 2 is P(Y = 2 | X).
exp_xb <- exp(X \%*\% true_beta)
pi_result <- exp_xb[,1] / (exp_xb[,1] + 1)
pi_matrix <- matrix(c(pi_result, 1 - pi_result), ncol = 2, byrow = FALSE)

# Draw the true outcome for each observation.
true_Y <- rep(NA_integer_, n)
for (i in seq_len(n)) {
  true_Y[i] <- which(rmultinom(1, 1, pi_matrix[i,]) == 1)
}

# Observation mechanism: column j of pistar_result is P(Y* = 1 | Y = j, Z).
exp_zg <- exp(Z \%*\% true_gamma)
pistar_denominator <- matrix(c(1 + exp_zg[,1], 1 + exp_zg[,2]), ncol = 2, byrow = FALSE)
pistar_result <- exp_zg / pistar_denominator

# Stack P(Y* = 1 | Y = j, Z) in rows 1 to n and P(Y* = 2 | Y = j, Z) in
# rows n + 1 to 2n, with one column per true outcome category j.
pistar_matrix <- matrix(c(pistar_result[,1], 1 - pistar_result[,1],
                          pistar_result[,2], 1 - pistar_result[,2]),
                        ncol = 2, byrow = FALSE)

# Draw the observed outcome given the true outcome of each observation.
obs_Y <- rep(NA_integer_, n)
for (i in seq_len(n)) {
  true_j <- true_Y[i]
  obs_Y[i] <- which(rmultinom(1, 1, pistar_matrix[c(i, n + i), true_j]) == 1)
}

Ystar <- obs_Y

# Starting values: two beta parameters followed by a 2 x 2 gamma matrix.
starting_values <- rep(1, 6)
beta_start <- matrix(starting_values[1:2], ncol = 1)
gamma_start <- matrix(starting_values[3:6], ncol = 2, nrow = 2, byrow = FALSE)

EM_results <- COMBO_EM(Ystar, x_matrix = x_matrix, z_matrix = z_matrix,
                       beta_start = beta_start, gamma_start = gamma_start)

EM_results
}
}
\references{
Beesley, L. and Mukherjee, B. (2020).
Statistical inference for association studies using electronic health records:
Handling both selection bias and outcome misclassification.
Biometrics, 78, 214-226.
}
