#' @title GALAHAD: Geometry-Adaptive Lyapunov-Assured Hybrid Optimizer
#' @description Battle-hardened production optimizer with geometry awareness,
#'   Lyapunov stability monitoring, and trust-region safety.
#'
#' @param V Objective function: \code{function(theta) -> scalar}
#' @param gradV Gradient function: \code{function(theta) -> vector of length p}
#' @param theta0 Initial parameter vector (numeric, length p)
#' @param parts List with geometry partitions: \code{T}, \code{P}, \code{E}.
#'   See Details.
#' @param control Optional list of control parameters. See Details.
#' @param callback Optional progress callback:
#'   \code{function(info)} where info has \code{iter}, \code{theta},
#'   \code{value}, \code{grad_norm}
#'
#' @details
#' \strong{Geometry Partitions:}
#'
#' Parameters are divided into three geometric types:
#' \describe{
#'   \item{\code{T} (log-scale)}{Natural gradient on positive reals.
#'     Use for scale parameters spanning orders of magnitude
#'     (e.g., \eqn{\sigma \in (0.01, 100)}).}
#'   \item{\code{P} (positive orthant)}{Entropy mirror descent.
#'     Use for positive parameters with moderate range
#'     (e.g., \eqn{\alpha \in (0.1, 10)}).}
#'   \item{\code{E} (Euclidean)}{Standard gradient descent.
#'     Use for unconstrained parameters (e.g., regression coefficients).}
#' }
#'
#' \strong{Control Parameters:}
#' \describe{
#'   \item{\code{max_iter}}{Maximum iterations (default: 2000)}
#'   \item{\code{tol_g}}{Gradient tolerance (default: 1e-6)}
#'   \item{\code{tol_x}}{Step tolerance (default: 1e-9)}
#'   \item{\code{tol_f}}{Function change tolerance (default: 1e-12)}
#'   \item{\code{delta}}{Initial trust radius (default: 1.0)}
#'   \item{\code{eta0}}{Initial step size (default: 1.0)}
#'   \item{\code{V_star}}{Known minimum (optional, for Polyak steps)}
#'   \item{\code{lambda}}{L2 regularization weight (default: 0)}
#' }
#'
#' @return List with components:
#' \describe{
#'   \item{\code{theta}}{Final parameter vector}
#'   \item{\code{value}}{Final objective value}
#'   \item{\code{grad_inf}}{Infinity norm of final gradient}
#'   \item{\code{converged}}{Logical convergence flag}
#'   \item{\code{status}}{Convergence status string}
#'   \item{\code{reason}}{Detailed convergence reason}
#'   \item{\code{iterations}}{Number of iterations performed}
#'   \item{\code{history}}{data.frame with iteration history}
#'   \item{\code{diagnostics}}{List with convergence diagnostics and
#'     Lyapunov certificates}
#'   \item{\code{certificate}}{Convergence certificate}
#' }
#'
#' @examples
#' \donttest{
#' # Quadratic objective
#' p <- 20
#' Q <- diag(1:p)
#' theta_true <- rnorm(p)
#' V <- function(th) 0.5 * sum((th - theta_true) * (Q %*% (th - theta_true)))
#' gradV <- function(th) Q %*% (th - theta_true)
#'
#' # Mixed geometry: log-scale, positive, Euclidean
#' parts <- list(T = 1:5, P = 6:10, E = 11:20)
#' theta0 <- abs(rnorm(p)) + 0.1
#'
#' # Optimize with progress tracking
#' result <- GALAHAD(V, gradV, theta0, parts,
#'   control = list(max_iter = 100, tol_g = 1e-6),
#'   callback = function(info) {
#'     if (info$iter %% 10 == 0) {
#'       cat(sprintf("Iter %3d: V = %.6f, ||g|| = %.3e\n",
#'                   info$iter, info$value, info$grad_norm))
#'     }
#'   })
#'
#' print(result$theta)
#' print(result$diagnostics)
#' }
#'
#' @references
#' Conn, A. R., Gould, N. I., & Toint, P. L. (2000).
#' \emph{Trust-region methods}. SIAM.
#'
#' Amari, S. (1998). Natural gradient works efficiently in learning.
#' \emph{Neural computation}, 10(2), 251-276.
#'
#' @export
#' @importFrom stats lm coef
GALAHAD <- function(V, gradV, theta0, parts, control = list(),
                    callback = NULL) {

  # ===============================================================
  # SETUP & VALIDATION
  # ===============================================================

  cfg <- validate_and_setup(V, gradV, theta0, parts, control)

  # Initialize state
  state <- initialize_state_prealloc(theta0, parts, cfg)

  # Wrap user functions with error handling
  safe_V <- make_safe_function(cfg$V_wrap, "V", state$V + 1e10)
  safe_grad <- make_safe_function(cfg$grad_wrap, "gradV", NULL)

  # ===============================================================
  # MAIN OPTIMIZATION LOOP
  # ===============================================================

  for (k in seq_len(cfg$max_iter)) {

    # === Step Size Selection (with dynamic f_star) ===
    step_cert <- select_step_dynamic_fstar(state, cfg)
    eta_trial <- step_cert$eta

    # === Preconditioned Direction ===
    # FIX #1: L_hat acts as diagonal metric with automatic regularization
    M_diag <- 1 / pmax(pmin(state$L_hat, cfg$l_max), cfg$l_min)
    direction <- -M_diag * state$grad

    # Gradient clipping (optional stability)
    if (!is.null(cfg$grad_clip_norm)) {
      g_norm <- sqrt(sum(state$grad^2))
      direction <- direction * pmin(1, cfg$grad_clip_norm / (g_norm + 1e-12))
    }

    grad_dot_d <- sum(state$grad * direction)

    # Guard: direction must be descent
    if (grad_dot_d >= 0) {
      direction <- -state$grad
      grad_dot_d <- -sum(state$grad^2)
    }

    # ===============================================================
    # ARMIJO BACKTRACKING LOOP (with safety exit)
    # ===============================================================

    eta <- eta_trial
    armijo_iters <- 0

    repeat {
      # Forward-backward step
      theta_fwd <- state$theta + eta * direction
      theta_bwd <- geometry_prox(theta_fwd, state$grad, eta, parts, cfg$eps_safe)

      # Scaled trust-region projection (FIX: now properly defined)
      theta_proj <- trust_project_scaled(theta_bwd, state$theta, cfg$delta,
                                         state$L_hat, parts, cfg$eps_safe)

      # Evaluate merit
      V_trial <- safe_V(theta_proj)

      # Armijo condition
      armijo_satisfied <- (V_trial <= state$V - cfg$c1 * eta * grad_dot_d)

      # FIX #2: Armijo safety exit (prevent infinite loop)
      if (armijo_satisfied || eta <= cfg$eta_min || armijo_iters >= 20) {
        break
      }

      eta <- 0.5 * eta
      armijo_iters <- armijo_iters + 1
    }

    # ===============================================================
    # HALPERN AVERAGING
    # ===============================================================

    alpha_k <- 2 / (k + 2)
    theta_halpern <- alpha_k * state$theta0 + (1 - alpha_k) * theta_proj
    V_halpern <- safe_V(theta_halpern)

    # Accept Halpern only if it doesn't increase merit
    if (V_halpern <= state$V && V_halpern <= V_trial) {
      theta_next <- theta_halpern
      V_next <- V_halpern
      halpern_accepted <- TRUE
    } else {
      theta_next <- theta_proj
      V_next <- V_trial
      halpern_accepted <- FALSE
    }

    # Final safety check
    if (V_next > state$V) {
      theta_next <- state$theta
      V_next <- state$V
      cfg$delta <- max(0.25 * cfg$delta, cfg$delta_min)
    }

    # ===============================================================
    # STATE UPDATE WITH LYAPUNOV TRACKING
    # ===============================================================

    grad_next <- safe_grad(theta_next)
    if (is.null(grad_next)) {
      stop("Gradient evaluation failed at iteration ", k)
    }

    # Update Lipschitz estimate
    s <- theta_next - state$theta
    y <- grad_next - state$grad
    state$L_hat <- update_lipschitz_safe(state$L_hat, s, y, cfg)

    # FIX #3: Relative Lyapunov potential (prevents overflow)
    delta_V <- state$V - V_next

    # Compute relative Lyapunov for diagnostics
    V_scale <- abs(state$V) + 1e-10
    delta_V_rel <- delta_V / V_scale

    lyapunov_ok <- (delta_V >= -1e-10)

    if (!lyapunov_ok) {
      warning(sprintf("Lyapunov violated at iter %d: DeltaV = %.3e (relative: %.3e)",
                      k, delta_V, delta_V_rel))
    }

    # Step norm
    step_norm <- sqrt(sum(s^2))

    # Store in pre-allocated arrays
    state <- update_state_prealloc(state, k, theta_next, V_next, grad_next,
                                   step_norm, delta_V, delta_V_rel,
                                   lyapunov_ok, list(
                                     method = step_cert$method,
                                     eta = eta,
                                     armijo_iters = armijo_iters,
                                     halpern_accepted = halpern_accepted,
                                     f_star_used = step_cert$f_star_used
                                   ))

    # Store s,y for BB
    state$prev_step <- list(s = s, y = y)

    # Adapt trust radius
    cfg$delta <- adapt_trust_radius_smooth(cfg$delta, step_norm,
                                           V_next <= V_trial, cfg)

    # ===============================================================
    # PROGRESS CALLBACK
    # ===============================================================

    if (!is.null(callback)) {
      tryCatch({
        callback(list(
          iter = k,
          theta = theta_next,
          value = V_next,
          grad_norm = sqrt(sum(grad_next^2))
        ))
      }, error = function(e) {
        warning("Callback failed at iteration ", k, ": ", e$message)
      })
    }

    # ===============================================================
    # CONVERGENCE CHECKS
    # ===============================================================

    conv_cert <- check_convergence_certified(state, k, cfg)

    if (conv_cert$converged) {
      return(finalize_output(state, k, conv_cert, "converged"))
    }

    if (step_norm < cfg$tol_x && k > 5) {
      return(finalize_output(state, k,
                             list(converged = FALSE, reason = "STAGNATION"),
                             "stagnation"))
    }
  }

  # Max iterations reached
  finalize_output(state, cfg$max_iter,
                  list(converged = FALSE, reason = "MAX_ITER"), "max_iter")
}


# ===========================================================================
# ENHANCED SUB-ALGORITHMS
# ===========================================================================


#' @title Dynamic f_star Step Selection
#' @keywords internal
select_step_dynamic_fstar <- function(state, cfg) {

  # ===========================================================
  # DYNAMIC f_star UPDATE
  # ===========================================================

  if (!is.null(cfg$V_star) || state$k > 1) {
    # Use minimum of: provided V_star OR observed minimum
    if (!is.null(cfg$V_star)) {
      f_star_base <- cfg$V_star
    } else {
      f_star_base <- Inf
    }

    # Get minimum from history
    if (state$k > 0) {
      hist_min <- min(state$arrays$V[1:state$k], na.rm = TRUE)
      f_star_working <- min(f_star_base, hist_min)
    } else {
      f_star_working <- f_star_base
    }

    # Polyak step with dynamic bound
    g_norm_sq <- sum(state$grad^2)
    gap <- max(state$V - f_star_working, 0)

    if (gap > 1e-10 && g_norm_sq > 1e-16) {
      eta_pl <- gap / (g_norm_sq + 1e-16)
      eta <- clamp(eta_pl, cfg$eta_min, cfg$eta_max)

      return(list(
        eta = eta,
        method = "POLYAK",
        f_star_used = f_star_working
      ))
    }
  }

  # ===========================================================
  # BARZILAI-BORWEIN FALLBACK
  # ===========================================================

  if (!is.null(state$prev_step)) {
    s <- state$prev_step$s
    y <- state$prev_step$y

    sy <- sum(s * y)
    ss <- sum(s * s)
    yy <- sum(y * y)

    if (sy > 1e-16 && ss > 1e-16) {
      eta_bb1 <- sy / yy
      eta_bb2 <- ss / sy
      eta_bb <- pmin(eta_bb1, eta_bb2)
      eta <- clamp(eta_bb, cfg$eta_min, cfg$eta_max)

      return(list(
        eta = eta,
        method = "BB",
        f_star_used = NA_real_
      ))
    }
  }

  # Default step
  list(
    eta = cfg$eta0,
    method = "DEFAULT",
    f_star_used = NA_real_
  )
}


#' @title Trust-Region Projection (Scaled M-norm)
#' @description Projects candidate onto trust region with coordinate-wise
#'   metric scaling
#' @keywords internal
trust_project_scaled <- function(theta_cand, theta_center, delta, L_hat,
                                 parts, eps_safe) {
  # Compute displacement
  d <- theta_cand - theta_center

  # Scaled M-norm: ||d||_M = sqrt(d' * diag(L_hat) * d)
  d_scaled <- sqrt(L_hat) * d
  norm_M <- sqrt(sum(d_scaled^2))

  # If within trust region, return as-is
  if (norm_M <= delta) {
    return(clamp_positive(theta_cand, parts, eps_safe))
  }

  # Otherwise, project onto boundary: theta_center + (delta / ||d||_M) * d
  scale_factor <- delta / (norm_M + 1e-16)
  theta_proj <- theta_center + scale_factor * d

  # Ensure positivity constraints
  clamp_positive(theta_proj, parts, eps_safe)
}


#' @title Initialize Pre-Allocated State
#' @keywords internal
initialize_state_prealloc <- function(theta0, parts, cfg) {
  p <- length(theta0)
  n <- cfg$max_iter

  # Ensure positivity for T and P partitions
  theta0 <- clamp_positive(theta0, parts, cfg$eps_safe)

  # Initial evaluation
  V0 <- cfg$V_wrap(theta0)
  grad0 <- cfg$grad_wrap(theta0)

  # Pre-allocate history arrays
  arrays <- list(
    iter = integer(n),
    V = numeric(n),
    grad_norm = numeric(n),
    step_norm = numeric(n),
    delta_V = numeric(n),
    delta_V_rel = numeric(n),  # FIX #3: Add relative Lyapunov
    lyapunov_ok = logical(n),
    eta = numeric(n),
    method = character(n)
  )

  # Fill with NA
  arrays$V[] <- NA_real_
  arrays$grad_norm[] <- NA_real_
  arrays$step_norm[] <- NA_real_
  arrays$delta_V[] <- NA_real_
  arrays$delta_V_rel[] <- NA_real_
  arrays$eta[] <- NA_real_
  arrays$method[] <- NA_character_

  list(
    theta0 = theta0,
    theta = theta0,
    V = V0,
    grad = grad0,
    k = 0L,
    L_hat = rep(cfg$L_est, p),
    arrays = arrays,
    prev_step = NULL,
    lyapunov_violations = 0L
  )
}


#' @title Update Pre-Allocated State
#' @keywords internal
update_state_prealloc <- function(state, k, theta, V, grad, step_norm,
                                  delta_V, delta_V_rel, lyapunov_ok, cert) {

  state$arrays$iter[k] <- k
  state$arrays$V[k] <- V
  state$arrays$grad_norm[k] <- sqrt(sum(grad^2))
  state$arrays$step_norm[k] <- step_norm
  state$arrays$delta_V[k] <- delta_V
  state$arrays$delta_V_rel[k] <- delta_V_rel  # FIX #3: Store relative
  state$arrays$lyapunov_ok[k] <- lyapunov_ok
  state$arrays$eta[k] <- cert$eta
  state$arrays$method[k] <- cert$method

  if (!lyapunov_ok) {
    state$lyapunov_violations <- state$lyapunov_violations + 1L
  }

  state$theta <- theta
  state$V <- V
  state$grad <- grad
  state$k <- k

  state
}


#' @title Finalize Output with Lyapunov Diagnostics
#' @keywords internal
finalize_output <- function(state, final_k, conv_cert, status) {

  # Trim arrays to actual length
  arrays_trimmed <- lapply(state$arrays, function(x) {
    if (is.numeric(x) || is.logical(x)) {
      x[1:final_k]
    } else {
      x[1:final_k]
    }
  })

  history_df <- as.data.frame(arrays_trimmed)

  list(
    # Solution
    theta = state$theta,
    value = state$V,
    grad_inf = max(abs(state$grad)),

    # Convergence
    converged = conv_cert$converged,
    status = status,
    reason = conv_cert$reason,
    iterations = final_k,

    # History (data.frame for easy plotting)
    history = history_df,

    # Diagnostics with Lyapunov tracking (FIX #3: include relative)
    diagnostics = list(
      L_estimate = mean(state$L_hat),
      lyapunov_violations = state$lyapunov_violations,
      monotone_descent = (state$lyapunov_violations == 0),
      final_delta_V = if (final_k > 1) history_df$delta_V[final_k] else NA,
      final_delta_V_rel = if (final_k > 1) history_df$delta_V_rel[final_k] else NA,
      step_methods_used = table(history_df$method),
      trust_region_active = mean(history_df$step_norm > 0.8 * history_df$step_norm[1])
    ),

    # Certificate
    certificate = conv_cert
  )
}


# ===========================================================================
# UTILITY FUNCTIONS
# ===========================================================================


#' @title Geometry-Aware Proximal Operator
#' @keywords internal
geometry_prox <- function(theta_fwd, grad, eta, parts, eps_safe) {
  th <- theta_fwd

  # T partition: log-scale (exponential family)
  if (length(parts$T) > 0) {
    i <- parts$T
    th[i] <- exp(log(pmax(th[i], eps_safe)) - eta * th[i] * grad[i])
    th[i] <- pmax(th[i], eps_safe)
  }

  # P partition: positive orthant (entropy mirror)
  if (length(parts$P) > 0) {
    i <- parts$P
    th[i] <- pmax(th[i], eps_safe) * exp(-eta * grad[i])
    th[i] <- pmax(th[i], eps_safe)
  }

  # E partition: Euclidean (standard gradient)
  if (length(parts$E) > 0) {
    th[parts$E] <- th[parts$E] - eta * grad[parts$E]
  }

  th
}


#' @title Safe Lipschitz Update
#' @keywords internal
update_lipschitz_safe <- function(L_hat, s, y, cfg) {
  p <- length(L_hat)
  L_new <- L_hat

  idx_valid <- abs(s) > 1e-12

  if (any(idx_valid)) {
    L_incr <- abs(y[idx_valid]) / abs(s[idx_valid])
    # FIX #1: Clamping provides implicit regularization
    L_incr_clamped <- pmax(pmin(L_incr, cfg$l_max), cfg$l_min)
    # Exponential moving average
    L_new[idx_valid] <- 0.8 * L_hat[idx_valid] + 0.2 * L_incr_clamped
  }

  L_new
}


#' @title Smooth Trust Radius Adaptation
#' @keywords internal
adapt_trust_radius_smooth <- function(delta, step_norm, accepted, cfg) {
  if (!accepted) {
    return(max(0.5 * delta, cfg$delta_min))
  }

  usage_ratio <- step_norm / (delta + 1e-16)

  if (usage_ratio > 0.8) {
    min(1.2 * delta, cfg$delta_max)
  } else if (usage_ratio < 0.2) {
    max(0.8 * delta, cfg$delta_min)
  } else {
    delta
  }
}


#' @title Certified Convergence Check
#' @keywords internal
check_convergence_certified <- function(state, k, cfg) {
  g_inf <- max(abs(state$grad))
  step_norm <- if (k > 0) state$arrays$step_norm[k] else Inf

  # Primary: gradient + step tolerance
  if (g_inf <= cfg$tol_g && step_norm <= cfg$tol_x) {
    return(list(
      converged = TRUE,
      reason = "GRADIENT_TOL",
      grad_inf = g_inf,
      step_norm = step_norm
    ))
  }

  # Secondary: relative function change (Kurdyka-Lojasiewicz)
  if (k >= 5) {
    recent_dV <- state$arrays$delta_V[(k-4):k]
    if (all(!is.na(recent_dV)) && all(abs(recent_dV) < cfg$tol_f) &&
        g_inf < 10 * cfg$tol_g) {
      return(list(
        converged = TRUE,
        reason = "RELATIVE_CHANGE",
        max_delta_V = max(abs(recent_dV))
      ))
    }
  }

  list(converged = FALSE, reason = "CONTINUING")
}


#' @title Safe Function Wrapper
#' @keywords internal
make_safe_function <- function(fn, name, fallback) {
  function(...) {
    tryCatch(
      fn(...),
      error = function(e) {
        if (!is.null(fallback)) {
          warning(sprintf("%s failed: %s", name, e$message))
          fallback
        } else {
          stop(sprintf("%s failed: %s", name, e$message))
        }
      }
    )
  }
}


#' @title Validate and Setup Configuration
#' @keywords internal
validate_and_setup <- function(V, gradV, theta0, parts, control) {
  # Input validation
  stopifnot(
    is.function(V),
    is.function(gradV),
    is.numeric(theta0),
    length(theta0) >= 1
  )

  theta0 <- as.numeric(theta0)
  p <- length(theta0)

  parts <- normalize_parts(parts, p)

  # Default control parameters
  defaults <- list(
    max_iter = 2000L,
    eta0 = 1.0,
    eta_min = 1e-8,
    eta_max = 10.0,
    tol_g = 1e-6,
    tol_x = 1e-9,
    tol_f = 1e-12,
    delta = 1.0,
    delta_min = 0.1,
    delta_max = 100.0,
    eps_safe = 1e-12,
    l_min = 1e-8,
    l_max = 1e8,
    L_est = 1.0,
    lambda = 0,
    V_star = NULL,
    c1 = 1e-4,
    grad_clip_norm = NULL
  )

  # Merge with user control
  for (nm in names(defaults)) {
    if (is.null(control[[nm]])) control[[nm]] <- defaults[[nm]]
  }

  # Wrap objective with optional L2 regularization
  control$V_wrap <- if (control$lambda > 0) {
    function(th) V(th) + control$lambda * sum(th^2)
  } else {
    V
  }

  control$grad_wrap <- if (control$lambda > 0) {
    function(th) gradV(th) + 2 * control$lambda * th
  } else {
    gradV
  }

  control
}


#' @title Normalize Geometry Partitions
#' @keywords internal
normalize_parts <- function(parts, p) {
  req <- c("T", "P", "E")
  stopifnot(is.list(parts), all(req %in% names(parts)))

  for (nm in req) {
    idx <- as.integer(parts[[nm]])
    if (length(idx) > 0) {
      stopifnot(all(idx >= 1 & idx <= p))
      parts[[nm]] <- sort(unique(idx))
    } else {
      parts[[nm]] <- integer(0)
    }
  }

  # Ensure complete partition (no overlaps, no gaps)
  all_idx <- c(parts$T, parts$P, parts$E)
  stopifnot(
    length(unique(all_idx)) == length(all_idx),
    length(all_idx) == p
  )

  parts
}


#' @title Clamp Positive Parameters
#' @keywords internal
clamp_positive <- function(theta, parts, eps_safe) {
  if (length(parts$T) > 0) theta[parts$T] <- pmax(theta[parts$T], eps_safe)
  if (length(parts$P) > 0) theta[parts$P] <- pmax(theta[parts$P], eps_safe)
  theta
}


#' @title Clamp Scalar to Range
#' @keywords internal
clamp <- function(x, lower, upper) {
  pmax(lower, pmin(upper, x))
}


# ===========================================================================
# END OF GALAHAD 1.0.0
# ===========================================================================
