# R/cleaning.R
# ============================================================
#  INTERNAL CLEANING HELPERS FOR GACE 1.0.0 (STABILITY PATCH)
# ============================================================

#' Internal helper: fill missing values by linear interpolation
#'
#' Replaces missing entries of a numeric vector with values linearly
#' interpolated between the surrounding observed entries, using
#' \code{stats::approx} on the element positions. Because \code{rule = 2}
#' is used, missing values before the first or after the last observed
#' entry take the value of the nearest observed entry.
#'
#' Special cases: a vector with no observed values (including a
#' zero-length vector) is returned unchanged, and a vector with exactly
#' one observed value is returned with that value repeated at every
#' position.
#'
#' @param x Numeric vector.
#'
#' @return Numeric vector of the same length as \code{x} with missing
#'   values filled in whenever at least one observed value exists.
#'
#' @keywords internal
#' @noRd
#' @importFrom stats approx
.gace_interpolate_na <- function(x) {
  observed   <- !is.na(x)
  n_observed <- sum(observed)

  # Nothing to anchor the interpolation on (also true for empty input)
  if (n_observed == 0L) {
    return(x)
  }

  # A single known value cannot define a line; repeat it everywhere
  if (n_observed == 1L) {
    return(rep(x[observed], length(x)))
  }

  # Interpolate over positions; rule = 2 holds the end values constant
  positions <- seq_along(x)
  filled <- stats::approx(
    x    = positions[observed],
    y    = x[observed],
    xout = positions,
    rule = 2
  )
  filled$y
}

#' Internal helper: winsorization of extremes
#'
#' Caps a numeric vector at lower and upper empirical quantiles to reduce
#' the influence of extreme outliers. The quantiles are computed from the
#' finite values of \code{x} only; missing values are left untouched,
#' while infinite values are capped like any other extreme.
#'
#' The input is returned unchanged when it has fewer than five finite
#' values or when all finite values are identical (this also covers
#' empty and all-missing input).
#'
#' @param x Numeric vector.
#' @param probs Length-2 numeric vector of probabilities in (0, 1),
#'   giving the lower and upper quantile. If the two probabilities are
#'   supplied in descending order they are swapped.
#'
#' @return Winsorized numeric vector of the same length as \code{x}.
#'
#' @keywords internal
#' @noRd
#' @importFrom stats quantile
.gace_winsorize <- function(x, probs = c(0.01, 0.99)) {
  finite_x <- x[is.finite(x)]

  # For very short or constant series, winsorization does not help.
  # An all-missing or empty input has zero finite values and exits here.
  if (length(finite_x) < 5L || length(unique(finite_x)) <= 1L) {
    return(x)
  }

  # Guard against reversed bounds: with lower > upper the two capping
  # steps below would collapse every value onto a single quantile.
  if (length(probs) == 2L && !anyNA(probs) && probs[1L] > probs[2L]) {
    probs <- rev(probs)
  }

  bounds <- stats::quantile(
    finite_x,
    probs = probs,
    na.rm = TRUE,
    names = FALSE
  )

  # which() drops NA comparisons, so missing entries of x stay missing
  x[which(x < bounds[1L])] <- bounds[1L]
  x[which(x > bounds[2L])] <- bounds[2L]
  x
}

#' Internal series cleaner for GACE
#'
#' Coerces the input to numeric and cleans it in the following order:
#' \itemize{
#'   \item Non-finite values and negative values are set to missing,
#'   \item Optionally, zeros are set to missing when the series has at
#'         least one positive value and zeros make up no more than 20\%
#'         of its length,
#'   \item Missing values are filled with \code{.gace_interpolate_na},
#'   \item Optionally, extremes are capped with \code{.gace_winsorize}.
#' }
#' An empty input, or a series in which every value is missing after
#' the first two steps, is returned without interpolation or
#' winsorization.
#'
#' @param y Vector of historical values; coerced with \code{as.numeric}.
#' @param zero_to_na Logical; if TRUE, zeros may be treated as missing
#'   only when they appear as sparse gaps in an otherwise positive series.
#' @param winsorize Logical; if TRUE, extremes are winsorized.
#' @param winsor_probs Quantile bounds used for winsorization.
#'
#' @return Cleaned numeric vector of the same length as \code{y}.
#'
#' @keywords internal
#' @noRd
.gace_clean_series <- function(y,
                               zero_to_na   = TRUE,
                               winsorize    = TRUE,
                               winsor_probs = c(0.01, 0.99)) {
  y     <- as.numeric(y)
  n_obs <- length(y)

  # An empty series has nothing to clean
  if (n_obs == 0L) {
    return(y)
  }

  # Non-finite entries and negatives are both invalid; blank them together.
  # The mask never contains NA: wherever `y < 0` is NA, `!is.finite(y)` is
  # TRUE.
  invalid <- !is.finite(y) | y < 0
  y[invalid] <- NA_real_

  # Zeros count as gaps only when they are sparse (at most 20% of the
  # series length) and at least one positive value is present.
  if (zero_to_na) {
    is_zero <- !is.na(y) & y == 0
    n_zero  <- sum(is_zero)
    sparse  <- n_zero > 0L && n_zero <= 0.20 * n_obs

    if (sparse && any(y > 0, na.rm = TRUE)) {
      y[is_zero] <- NA_real_
    }
  }

  missing_now <- is.na(y)

  # Nothing usable is left: hand the all-missing series back untouched
  if (all(missing_now)) {
    return(y)
  }

  # Fill whatever gaps the steps above created or found
  if (any(missing_now)) {
    y <- .gace_interpolate_na(y)
  }

  # Cap extremes; the helper itself skips short or constant series
  if (winsorize) {
    y <- .gace_winsorize(y, probs = winsor_probs)
  }

  y
}