#' Select Variables
#'
#' Select variables relevant to propensity for inclusion in All of Us
#'
#' Chooses which variables are meaningful in modeling propensity for inclusion in All of Us (sample_b)
#' as compared to the general US population as represented by a reference probability sample (sample_a). This function
#' assumes that variable names in both sample_a and sample_b are harmonized (i.e., definitions and names are the same across the two sources).
#'
#' @param sample_a data.frame of the reference probability sample (i.e., NHIS)
#' @param sample_b data.frame of the All of Us sample
#' @param aux_variables character vector with names of auxiliary variables
#'
#' @return character vector with selected variable names
#' 
#' @examples
#' # Prepare the NHIS data
#' calVars <- c(
#'   "SEX_A_R", "AGEP_A_R", "HISPALLP_A_R", "ORIENT_A_R", "HICOV_A_R", "EDUCP_A_R", "REGION_R",
#'   "EMPLASTWK_A_R", "HOUTENURE_A_R", "MARITAL_A_R"
#' )
#' stuVars <- "DIBTYPE_A_R"
#' vars_dummies <- c("AGEP_A_R","HISPALLP_A_R","EDUCP_A_R","REGION_R")
#' nhis_keep_vars <- c("PPSU","PSTRAT","WTFA_A")
#' nhis_imputed <- impute_data(nhis_processed, c(calVars, stuVars), nhis_keep_vars)
#' nhis_dummied <- dummies(nhis_imputed, vars=paste0(vars_dummies, '_I'))
#' factor_vars <- setdiff(names(nhis_dummied), nhis_keep_vars)
#' nhis_dummied[factor_vars] <- lapply(nhis_dummied[factor_vars], as.factor)
#' 
#' # Prepare the synthetic All of Us data
#' aou_imputed <- impute_data(aou_synthetic, c(calVars, stuVars))
#' aou_dummied <- dummies(aou_imputed, vars=paste0(vars_dummies, '_I'))
#' aou_dummied[] <- lapply(aou_dummied, as.factor)
#' 
#' # Define base variable names of auxiliary variables
#' aux_variables <- c(
#'   "SEX_A_R_I","AGEP_A_R_I", "HISPALLP_A_R_I","EDUCP_A_R_I",
#'   "REGION_R_I","ORIENT_A_R_I","HICOV_A_R_I",
#'   "EMPLASTWK_A_R_I","HOUTENURE_A_R_I","MARITAL_A_R_I"
#' )
#' 
#' # Provide All of Us and NHIS data to select variables
#' selected_base_vars <- select_variables(nhis_dummied, aou_dummied, aux_variables)
#' 
#' @importFrom survey svytotal svydesign
#' @importFrom stats model.matrix coef
#' @importFrom glue glue
#' @importFrom glmnet cv.glmnet
#' @export
select_variables <- function(sample_a, sample_b, aux_variables) {

    # Purpose: stack the two samples on their shared columns, fit a
    # cross-validated LASSO logistic regression of sample membership
    # (1 = sample_a, 0 = sample_b) on the auxiliary variables, and return the
    # names of the terms whose coefficient at lambda.min is non-zero.

    # Fail early with a readable message instead of an obscure downstream error.
    if (!is.data.frame(sample_a) || !is.data.frame(sample_b)) {
        stop("`sample_a` and `sample_b` must be data frames.", call. = FALSE)
    }
    if (!is.character(aux_variables) || length(aux_variables) == 0L) {
        stop("`aux_variables` must be a non-empty character vector.",
             call. = FALSE)
    }

    # Step 1: Find common variables
    common_vars <- intersect(names(sample_a), names(sample_b))

    # Report the variables that exist in only one of the two samples
    a_drop <- setdiff(names(sample_a), common_vars)
    b_drop <- setdiff(names(sample_b), common_vars)
    message(glue("Drop from A: {paste(a_drop, collapse=', ')}"))
    message(glue("Drop from B: {paste(b_drop, collapse=', ')}"))

    # Step 2: Subset each dataset to common variables.
    # drop = FALSE keeps a data.frame even when only one column is shared.
    sample_a_common <- sample_a[, common_vars, drop = FALSE]
    sample_b_common <- sample_b[, common_vars, drop = FALSE]

    # Step 3: Add the membership flag that serves as the model response
    sample_a_common$prob_subset <- 1
    sample_b_common$prob_subset <- 0

    # Step 4: Concatenate
    combined_data <- rbind(sample_a_common, sample_b_common)

    # Build the model formula and check that everything it references is
    # actually available in the stacked data.
    model_formula <- stats::reformulate(aux_variables)
    model_vars <- all.vars(model_formula)
    missing_vars <- setdiff(model_vars, names(combined_data))
    if (length(missing_vars) > 0L) {
        stop(
            "Auxiliary variables not present in both samples: ",
            paste(missing_vars, collapse = ", "),
            call. = FALSE
        )
    }

    # model.matrix() would drop (or pass through) incomplete rows, leaving X
    # out of step with y or handing NAs to glmnet; both end in an opaque error.
    if (anyNA(combined_data[model_vars])) {
        stop(
            "Auxiliary variables contain missing values; ",
            "impute them before variable selection.",
            call. = FALSE
        )
    }

    # Prepare data: design matrix without the intercept column, plus response
    X <- model.matrix(model_formula, data = combined_data)[, -1, drop = FALSE]
    y <- combined_data$prob_subset

    # Fit LASSO - use 5-fold cross validation to pick lambda
    lasso_model <- cv.glmnet(X, y, alpha = 1, family = "binomial", nfolds = 5)

    # Split the non-intercept terms into selected (non-zero coefficient) and
    # dropped (zero coefficient). The intercept is excluded by name: removing
    # "the first element" of each subset would discard a real variable from
    # whichever subset does not contain the intercept.
    coef_matrix <- coef(lasso_model, s = "lambda.min")
    term_names <- rownames(coef_matrix)
    is_nonzero <- as.vector(coef_matrix[, 1] != 0)
    is_intercept <- term_names == "(Intercept)"
    selected_vars <- term_names[is_nonzero & !is_intercept]
    dropped_vars <- term_names[!is_nonzero & !is_intercept]

    # Print variables not selected
    message(glue("Dropped after selection: {paste(dropped_vars, collapse=', ')}"))

    # Remove trailing numbers (1) after underscores or dots, then de-duplicate.
    # NOTE(review): this strips only runs of the digit 1 (and one preceding
    # "_" or "."), so it presumably targets the "1" level suffix that
    # model.matrix() appends to 0/1 factors. Columns for other factor levels
    # keep their suffix, and a variable whose own name ends in 1 is shortened
    # too - confirm this matches what callers expect.
    unique(sub("([_.])?[1]+$", "", selected_vars))
}
