#' Optimal Binning for Categorical Variables using a User-Defined Technique (UDT)
#'
#' This function performs optimal binning for categorical variables using a
#' User-Defined Technique (UDT) that combines frequency-based grouping with
#' statistical similarity measures to create meaningful bins for predictive
#' modeling.
#'
#' The UDT algorithm follows these steps:
#' \enumerate{
#'   \item \strong{Initialization}: Each unique category is initially placed in
#'         its own bin.
#'   \item \strong{Frequency Filtering}: Categories below the \code{bin_cutoff}
#'         frequency threshold are grouped into a single "rare" bin.
#'   \item \strong{Iterative Optimization}: Bins are progressively merged based
#'         on statistical similarity (measured by Jensen-Shannon divergence)
#'         until the desired number of bins (\code{max_bins}) is achieved.
#'   \item \strong{Monotonicity Enforcement}: Final bins are sorted by Weight
#'         of Evidence to ensure consistent trends.
#' }
#'
#' Key characteristics of this implementation:
#' \itemize{
#'   \item \strong{Flexible Framework}: Designed as a customizable foundation
#'         for categorical binning approaches.
#'   \item \strong{Statistical Rigor}: Uses information-theoretic measures to
#'         guide bin combination decisions.
#'   \item \strong{Robust Estimation}: Implements Laplace smoothing to ensure
#'         stable WoE/IV calculations even with sparse data.
#'   \item \strong{Efficiency Focus}: Employs targeted merging strategies to
#'         minimize computational overhead.
#' }
#'
#' Mathematical foundations:
#'
#' Laplace-smoothed probability estimates:
#' \deqn{p_{smoothed} = \frac{count + \alpha}{total + 2\alpha}}{
#' p_smoothed = (count + alpha) / (total + 2*alpha)}
#'
#' Weight of Evidence calculation:
#' \deqn{WoE = \ln\left(\frac{p_{pos,smoothed}}{p_{neg,smoothed}}\right)}{
#' WoE = ln(p_pos_smoothed / p_neg_smoothed)}
#'
#' Information Value computation:
#' \deqn{IV = (p_{pos,smoothed} - p_{neg,smoothed}) \times WoE}{
#' IV = (p_pos_smoothed - p_neg_smoothed) * WoE}
#'
#' Jensen-Shannon divergence between bins:
#' \deqn{JSD(P||Q) = \frac{1}{2}[KL(P||M) + KL(Q||M)]}{
#' JSD(P||Q) = (1/2)[KL(P||M) + KL(Q||M)]}
#' where \eqn{M = \frac{1}{2}(P+Q)} and \eqn{KL} denotes Kullback-Leibler
#' divergence.
#'
#' @param feature A character vector or factor representing the categorical
#'   predictor variable. Missing values (NA) will be converted to the string
#'   "NA" and treated as a separate category.
#' @param target An integer vector containing binary outcome values (0 or 1).
#'   Must be the same length as \code{feature}. Cannot contain missing values.
#' @param min_bins Integer. Minimum number of bins to create. Must be at least
#'   1. Default is 3.
#' @param max_bins Integer. Maximum number of bins to create. Must be greater
#'   than or equal to \code{min_bins}. Default is 5.
#' @param bin_cutoff Numeric. Minimum relative frequency threshold for
#'   individual categories. Categories with frequency below this proportion
#'   will be merged into a collective "rare" bin before optimization.
#'   Value must be between 0 and 1. Default is 0.05 (5\%).
#' @param max_n_prebins Integer. Upper limit on initial bins after frequency
#'   filtering. Controls computational complexity in early stages. Default is 20.
#' @param bin_separator Character string used to separate category names when
#'   multiple categories are combined into a single bin. Default is "\%;\%".
#' @param convergence_threshold Numeric. Threshold for determining algorithm
#'   convergence based on relative changes in total Information Value.
#'   Default is 1e-6.
#' @param max_iterations Integer. Maximum number of iterations permitted for
#'   the optimization routine. Default is 1000.
#'
#' @return A list containing the results of the optimal binning procedure:
#' \describe{
#'   \item{\code{id}}{Numeric vector of bin identifiers (1 to n_bins)}
#'   \item{\code{bin}}{Character vector of bin labels, which are combinations
#'         of original categories separated by \code{bin_separator}}
#'   \item{\code{woe}}{Numeric vector of Weight of Evidence values for each bin}
#'   \item{\code{iv}}{Numeric vector of Information Values for each bin}
#'   \item{\code{count}}{Integer vector of total observations in each bin}
#'   \item{\code{count_pos}}{Integer vector of positive outcomes in each bin}
#'   \item{\code{count_neg}}{Integer vector of negative outcomes in each bin}
#'   \item{\code{event_rate}}{Numeric vector of the observed event rate in each bin}
#'   \item{\code{total_iv}}{Numeric scalar. Total Information Value across all
#'         bins}
#'   \item{\code{converged}}{Logical. Whether the algorithm converged}
#'   \item{\code{iterations}}{Integer. Number of iterations executed}
#' }
#'
#' @note
#' \itemize{
#'   \item Target variable must contain both 0 and 1 values.
#'   \item For datasets with 1 or 2 unique categories, no optimization occurs
#'         beyond basic WoE/IV calculation.
#'   \item The algorithm does not perform bin splitting; it only merges existing
#'         bins to respect \code{max_bins}.
#'   \item Rare category pooling improves stability of WoE estimates for
#'         infrequent values.
#' }
#'
#' @examples
#' # Generate sample data with skewed category distribution
#' set.seed(789)
#' n <- 3000
#' # Power-law distributed categories
#' categories <- c(
#'   rep("X1", 1200), rep("X2", 800), rep("X3", 400),
#'   sample(LETTERS[4:20], 600, replace = TRUE)
#' )
#' feature <- sample(categories, n, replace = TRUE)
#' # Target probabilities based on category importance
#' probs <- ifelse(grepl("X", feature), 0.7,
#'   ifelse(grepl("[A-C]", feature), 0.5, 0.3)
#' )
#' target <- rbinom(n, 1, prob = probs)
#'
#' # Perform user-defined technique binning
#' result <- ob_categorical_udt(feature, target)
#' print(result[c("bin", "woe", "iv", "count")])
#'
#' # Adjust parameters for finer control
#' result_custom <- ob_categorical_udt(
#'   feature = feature,
#'   target = target,
#'   min_bins = 2,
#'   max_bins = 7,
#'   bin_cutoff = 0.03
#' )
#'
#' # Handling missing values
#' feature_with_na <- feature
#' feature_with_na[sample(length(feature_with_na), 150)] <- NA
#' result_na <- ob_categorical_udt(feature_with_na, target)
#'
#' @export
ob_categorical_udt <- function(feature,
                               target,
                               min_bins = 3L,
                               max_bins = 5L,
                               bin_cutoff = 0.05,
                               max_n_prebins = 20L,
                               bin_separator = "%;%",
                               convergence_threshold = 1e-6,
                               max_iterations = 1000L) {
  # R-side front end: normalizes and validates `feature` / `target`, coerces
  # the integer tuning parameters, then delegates the binning itself to the
  # compiled routine registered by the OptimalBinningWoE package.

  # Categories are handed to the compiled code as character strings.
  if (!is.character(feature)) {
    feature <- as.character(feature)
  }

  # Missing categories become the literal string "NA" so they form their own
  # category instead of being passed on as NA_character_.
  feature[is.na(feature)] <- "NA"

  if (length(feature) != length(target)) {
    stop("'feature' and 'target' must have the same length.", call. = FALSE)
  }

  # as.integer() on a factor returns the internal level codes (1, 2, ...),
  # not the labels, so a factor target is read through its labels instead.
  if (is.factor(target)) {
    target <- as.character(target)
  }

  if (anyNA(target)) {
    stop("'target' cannot contain missing values.", call. = FALSE)
  }

  # Validate on the numeric scale BEFORE integer coercion: as.integer()
  # truncates toward zero, so 0.9 or 1.7 would otherwise be passed on as a
  # perfectly valid-looking 0 or 1. Coercion warnings are suppressed because
  # any value that fails to parse becomes NA and is rejected right below.
  target_num <- suppressWarnings(as.numeric(target))
  if (!all(target_num %in% c(0, 1))) {
    stop("'target' must contain only the values 0 or 1.", call. = FALSE)
  }
  target <- as.integer(target_num)

  # Call the C++ implementation. Note that .Call() passes arguments by
  # position (the names are informational only), so the order below must be
  # kept in sync with the compiled signature.
  .Call("_OptimalBinningWoE_optimal_binning_categorical_udt",
    target = target,
    feature = feature,
    min_bins = as.integer(min_bins),
    max_bins = as.integer(max_bins),
    bin_cutoff = bin_cutoff,
    max_n_prebins = as.integer(max_n_prebins),
    bin_separator = bin_separator,
    convergence_threshold = convergence_threshold,
    max_iterations = as.integer(max_iterations),
    PACKAGE = "OptimalBinningWoE"
  )
}
