#' Prepare Data for Anomaly Detection
#'
#' Preprocesses data for unsupervised anomaly detection by handling identifiers,
#' scaling numerical features, and encoding categorical variables.
#'
#' @details
#' Processing steps:
#' \enumerate{
#'   \item Identifier and explicitly excluded columns are removed from scoring.
#'   \item Numeric columns have missing values replaced by the column median
#'     and are then scaled according to \code{scale_method}. Columns with no
#'     spread (zero MAD, or equal minimum and maximum) and columns that are
#'     entirely missing are set to 0 when scaling is requested. With
#'     \code{scale_method = "none"} an entirely missing column stays \code{NA}.
#'   \item Character and factor columns are one-hot encoded (one indicator
#'     column per level, named \code{<column><level>}). Missing values get
#'     their own \code{"Missing"} level. A column with a single level is
#'     encoded as one constant indicator column.
#' }
#' Columns that are neither numeric, character, nor factor (for example
#' logical or date columns) are not included in the prepared matrix.
#'
#' @importFrom stats median mad
#' @param data A data frame containing the data to be preprocessed. Must have
#'   at least one row.
#' @param id_cols Character vector of column names to exclude from scoring
#'   (e.g., patient IDs, encounter IDs). If NULL, attempts to auto-detect
#'   common ID column patterns: any column whose name starts or ends with
#'   "id" (case-insensitive) or contains "patient_id", "encounter_id", or
#'   "record_id". Note that this also matches names such as "paid" or
#'   "valid"; pass \code{id_cols = character(0)} to disable auto-detection.
#' @param exclude_cols Character vector of additional columns to exclude from
#'   scoring. Default is NULL.
#' @param scale_method Character string indicating the scaling method for
#'   numerical variables. Options: "mad" (Median Absolute Deviation, default),
#'   "minmax" (min-max normalization), or "none" (no scaling). Any other value
#'   is an error, regardless of whether numeric columns are present.
#'
#' @return A list containing:
#'   \item{prepared_data}{A numeric matrix ready for anomaly detection}
#'   \item{metadata}{A list with mapping information:
#'     \itemize{
#'       \item original_data: The original data frame
#'       \item id_cols: Column names used as identifiers
#'       \item numeric_cols: Column names of numeric variables
#'       \item categorical_cols: Column names of categorical variables
#'       \item excluded_cols: Column names excluded from scoring (only those
#'         actually present in \code{data})
#'       \item scale_method: The scaling method that was applied
#'     }
#'   }
#'
#' @export
#'
#' @examples
#' data <- data.frame(
#'   patient_id = 1:20,
#'   age = rnorm(20, 50, 15),
#'   cost = rnorm(20, 10000, 5000),
#'   gender = sample(c("M", "F"), 20, replace = TRUE)
#' )
#' prep_result <- prep_for_anomaly(data, id_cols = "patient_id")
prep_for_anomaly <- function(data, id_cols = NULL, exclude_cols = NULL,
                             scale_method = "mad") {

  # Validate inputs
  if (!is.data.frame(data)) {
    stop("data must be a data frame")
  }

  if (nrow(data) == 0) {
    stop("data must have at least one row")
  }

  # Validate scale_method up front so that a bad value is rejected even when
  # no numeric columns remain, and a non-scalar value gives a clear message.
  valid_methods <- c("mad", "minmax", "none")
  if (!is.character(scale_method) || length(scale_method) != 1L ||
        !scale_method %in% valid_methods) {
    stop("scale_method must be 'mad', 'minmax', or 'none'")
  }

  # Auto-detect ID columns if not provided. Matches are collected pattern by
  # pattern and de-duplicated, so the order follows the pattern list.
  if (is.null(id_cols)) {
    id_patterns <- c("id$", "^id", "patient_id", "encounter_id", "record_id")
    id_cols <- unique(unlist(lapply(
      id_patterns, grep,
      x = names(data), ignore.case = TRUE, value = TRUE
    )))
    if (is.null(id_cols)) {
      id_cols <- character(0)
    }
  }

  # Only names that actually exist in the data count as excluded
  all_excluded <- unique(c(id_cols, exclude_cols))
  all_excluded <- all_excluded[all_excluded %in% names(data)]

  # vapply keeps the result a logical vector even for zero-column input
  is_numeric_col <- vapply(data, is.numeric, logical(1))
  is_categorical_col <- vapply(
    data,
    function(x) is.factor(x) || is.character(x),
    logical(1)
  )
  numeric_cols <- setdiff(names(data)[is_numeric_col], all_excluded)
  categorical_cols <- setdiff(names(data)[is_categorical_col], all_excluded)

  if (length(numeric_cols) == 0 && length(categorical_cols) == 0) {
    stop("No columns available for anomaly detection after excluding ID and specified columns")
  }

  # Scale one numeric vector with the selected method. Columns that carry no
  # information (all missing, or zero / undefined spread) become all zeros
  # instead of producing NaN, warnings, or an error in the `if` condition.
  scale_column <- function(x) {
    if (all(is.na(x))) {
      return(rep(0, length(x)))
    }
    if (scale_method == "mad") {
      center <- stats::median(x, na.rm = TRUE)
      spread <- stats::mad(x, na.rm = TRUE)
    } else {
      center <- min(x, na.rm = TRUE)
      spread <- max(x, na.rm = TRUE) - center
    }
    if (is.na(spread) || spread <= 0) {
      return(rep(0, length(x)))
    }
    (x - center) / spread
  }

  numeric_matrix <- NULL
  if (length(numeric_cols) > 0) {
    numeric_data <- data[, numeric_cols, drop = FALSE]

    # Impute missing values with the column median. An entirely missing
    # column has no median and is left as-is here; scale_column() handles it.
    for (col in numeric_cols) {
      missing_rows <- is.na(numeric_data[[col]])
      if (any(missing_rows)) {
        numeric_data[[col]][missing_rows] <-
          stats::median(numeric_data[[col]], na.rm = TRUE)
      }
    }

    numeric_matrix <- as.matrix(numeric_data)
    if (scale_method != "none") {
      for (i in seq_len(ncol(numeric_matrix))) {
        numeric_matrix[, i] <- scale_column(numeric_matrix[, i])
      }
    }
  }

  categorical_matrix <- NULL
  if (length(categorical_cols) > 0) {
    categorical_data <- data[, categorical_cols, drop = FALSE]

    categorical_list <- lapply(categorical_cols, function(col) {
      values <- categorical_data[[col]]
      if (is.character(values)) {
        values <- as.factor(values)
      }
      # Give missing values their own level so no rows are dropped
      if (any(is.na(values))) {
        levels(values) <- c(levels(values), "Missing")
        values[is.na(values)] <- "Missing"
      }

      # model.matrix() cannot set contrasts on a factor with fewer than two
      # levels and would abort; emit the single constant indicator directly,
      # using the same "<column><level>" naming and row names.
      if (nlevels(values) < 2L) {
        return(matrix(
          1,
          nrow = length(values),
          ncol = 1L,
          dimnames = list(
            rownames(categorical_data),
            paste0(col, levels(values))
          )
        ))
      }

      col_frame <- categorical_data[, col, drop = FALSE]
      col_frame[[col]] <- values
      # "- 1" removes the intercept so every level gets an indicator column
      stats::model.matrix(~ . - 1, data = col_frame)
    })
    categorical_matrix <- do.call(cbind, categorical_list)
  }

  # cbind() ignores NULL, so this covers numeric-only and categorical-only
  prepared_data <- cbind(numeric_matrix, categorical_matrix)
  if (is.null(prepared_data)) {
    stop("No data available after preprocessing")
  }

  metadata <- list(
    original_data = data,
    id_cols = id_cols,
    numeric_cols = numeric_cols,
    categorical_cols = categorical_cols,
    excluded_cols = all_excluded,
    scale_method = scale_method
  )

  list(
    prepared_data = prepared_data,
    metadata = metadata
  )
}

