% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hstats.R
\name{hstats}
\alias{hstats}
\alias{hstats.default}
\alias{hstats.ranger}
\alias{hstats.explainer}
\title{Calculate Interaction Statistics}
\usage{
hstats(object, ...)

\method{hstats}{default}(
  object,
  X,
  v = NULL,
  pred_fun = stats::predict,
  pairwise_m = 5L,
  threeway_m = 0L,
  approx = FALSE,
  grid_size = 50L,
  n_max = 500L,
  eps = 1e-10,
  w = NULL,
  verbose = TRUE,
  ...
)

\method{hstats}{ranger}(
  object,
  X,
  v = NULL,
  pred_fun = NULL,
  pairwise_m = 5L,
  threeway_m = 0L,
  approx = FALSE,
  grid_size = 50L,
  n_max = 500L,
  eps = 1e-10,
  w = NULL,
  verbose = TRUE,
  survival = c("chf", "prob"),
  ...
)

\method{hstats}{explainer}(
  object,
  X = object[["data"]],
  v = NULL,
  pred_fun = object[["predict_function"]],
  pairwise_m = 5L,
  threeway_m = 0L,
  approx = FALSE,
  grid_size = 50L,
  n_max = 500L,
  eps = 1e-10,
  w = object[["weights"]],
  verbose = TRUE,
  ...
)
}
\arguments{
\item{object}{Fitted model object.}

\item{...}{Additional arguments passed to \code{pred_fun(object, X, ...)},
for instance \code{type = "response"} in a \code{\link[=glm]{glm()}} model, or \code{reshape = TRUE} in a
multiclass XGBoost model.}

\item{X}{A data.frame or matrix serving as background dataset.}

\item{v}{Vector of feature names. The default (\code{NULL}) will use all column names of
\code{X} except the column name of the optional case weight \code{w} (if specified as name).}

\item{pred_fun}{Prediction function of the form \verb{function(object, X, ...)},
providing \eqn{K \ge 1} predictions per row. Its first argument represents the
model \code{object}, its second argument a data structure like \code{X}. Additional arguments
(such as \code{type = "response"} in a GLM, or \code{reshape = TRUE} in a multiclass XGBoost
model) can be passed via \code{...}. The default, \code{\link[stats:predict]{stats::predict()}}, will work in
most cases.}

\item{pairwise_m}{Number of features for which pairwise statistics are to be
calculated. The features are selected based on Friedman and Popescu's overall
interaction strength \eqn{H^2_j}. Set to 0 to avoid pairwise calculations.
For multivariate predictions, the union of the \code{pairwise_m} column-wise
strongest variable names is taken. This can lead to very long run-times.}

\item{threeway_m}{Like \code{pairwise_m}, but controls the feature count for
three-way interactions. Cannot be larger than \code{pairwise_m}.
To save computation time, the default is 0.}

\item{approx}{Should quantile approximation be applied to dense numeric features?
The default is \code{FALSE}. Setting this option to \code{TRUE} brings a massive speed-up
for one-way calculations. It can, e.g., be used when the number of features is
very large.}

\item{grid_size}{Integer controlling the number of quantile midpoints used to
approximate dense numerics. The quantile midpoints are calculated after
subsampling via \code{n_max}. Only relevant if \code{approx = TRUE}.}

\item{n_max}{If \code{X} has more than \code{n_max} rows, a random sample of \code{n_max} rows is
selected from \code{X}. In this case, set a random seed for reproducibility.}

\item{eps}{Threshold below which numerator values are set to 0. Default is 1e-10.}

\item{w}{Optional vector of case weights. Can also be a column name of \code{X}.}

\item{verbose}{Should a progress bar be shown? The default is \code{TRUE}.}

\item{survival}{Should cumulative hazards ("chf", default) or survival
probabilities ("prob") per time be predicted? Only relevant for \code{ranger()} survival models.}
}
\value{
An object of class "hstats" containing these elements:
\itemize{
\item \code{X}: Input \code{X} (sampled to \code{n_max} rows, after optional quantile approximation).
\item \code{w}: Case weight vector \code{w} (sampled to \code{n_max} values), or \code{NULL}.
\item \code{v}: Vector of column names in \code{X} for which overall
H statistics have been calculated.
\item \code{f}: Matrix with (centered) predictions \eqn{F}.
\item \code{mean_f2}: (Weighted) column means of squared \code{f}. Used to normalize \eqn{H^2} and
\eqn{H^2_j}.
\item \code{F_j}: List of matrices, each representing (centered)
partial dependence functions \eqn{F_j}.
\item \code{F_not_j}: List of matrices with (centered) partial dependence
functions \eqn{F_{\setminus j}} of other features.
\item \code{K}: Number of columns of prediction matrix.
\item \code{pred_names}: Column names of prediction matrix.
\item \code{pairwise_m}: Like input \code{pairwise_m}, but capped at \code{length(v)}.
\item \code{threeway_m}: Like input \code{threeway_m}, but capped at the smaller of
\code{length(v)} and \code{pairwise_m}.
\item \code{eps}: Like input \code{eps}.
\item \code{pd_importance}: List with numerator and denominator of \eqn{\textrm{PDI}_j}.
\item \code{h2}: List with numerator and denominator of \eqn{H^2}.
\item \code{h2_overall}: List with numerator and denominator of \eqn{H^2_j}.
\item \code{v_pairwise}: Subset of \code{v} with largest \eqn{H^2_j} used for pairwise
calculations. Only if pairwise calculations have been done.
\item \code{combs2}: Named list of variable pairs for which pairwise partial
dependence functions are available. Only if pairwise calculations have been done.
\item \code{F_jk}: List of matrices, each representing (centered) bivariate
partial dependence functions \eqn{F_{jk}}.
Only if pairwise calculations have been done.
\item \code{h2_pairwise}: List with numerator and denominator of \eqn{H^2_{jk}}.
Only if pairwise calculations have been done.
\item \code{v_threeway}: Subset of \code{v} with largest \eqn{H^2_j} used for three-way
calculations. Only if three-way calculations have been done.
\item \code{combs3}: Named list of variable triples for which three-way partial
dependence functions are available. Only if three-way calculations have been done.
\item \code{F_jkl}: List of matrices, each representing (centered) three-way
partial dependence functions \eqn{F_{jkl}}.
Only if three-way calculations have been done.
\item \code{h2_threeway}: List with numerator and denominator of \eqn{H^2_{jkl}}.
Only if three-way calculations have been done.
}
}
\description{
This is the main function of the package. It does the expensive calculations behind
the following H-statistics:
\itemize{
\item Total interaction strength \eqn{H^2}, a statistic measuring the proportion of
prediction variability unexplained by main effects of \code{v}, see \code{\link[=h2]{h2()}} for details.
\item Friedman and Popescu's statistic \eqn{H^2_j} of overall interaction strength per
feature, see \code{\link[=h2_overall]{h2_overall()}} for details.
\item Friedman and Popescu's statistic \eqn{H^2_{jk}} of pairwise interaction strength,
see \code{\link[=h2_pairwise]{h2_pairwise()}} for details.
\item Friedman and Popescu's statistic \eqn{H^2_{jkl}} of three-way interaction strength,
see \code{\link[=h2_threeway]{h2_threeway()}} for details. To save time, this statistic is not calculated
by default. Set \code{threeway_m} to a value above 2 to get three-way statistics of the
\code{threeway_m} variables with strongest overall interaction.
}

Furthermore, it allows calculating an experimental partial dependence based
measure of feature importance, \eqn{\textrm{PDI}_j^2}. It equals the proportion of
prediction variability unexplained by other features, see \code{\link[=pd_importance]{pd_importance()}}
for details. This statistic is not shown by \code{summary()} or \code{plot()}.

Instead of using \code{summary()}, interaction statistics can also be obtained via the
more flexible functions \code{\link[=h2]{h2()}}, \code{\link[=h2_overall]{h2_overall()}}, \code{\link[=h2_pairwise]{h2_pairwise()}}, and
\code{\link[=h2_threeway]{h2_threeway()}}.
}
\section{Methods (by class)}{
\itemize{
\item \code{hstats(default)}: Default hstats method.

\item \code{hstats(ranger)}: Method for "ranger" models.

\item \code{hstats(explainer)}: Method for DALEX "explainer".

}}
\examples{
# MODEL 1: Linear regression
fit <- lm(Sepal.Length ~ . + Petal.Width:Species, data = iris)
s <- hstats(fit, X = iris[, -1])
s
plot(s)
plot(s, zero = FALSE)  # Drop 0
summary(s)
  
# Absolute pairwise interaction strengths
h2_pairwise(s, normalize = FALSE, squared = FALSE, zero = FALSE)

# MODEL 2: Multi-response linear regression
fit <- lm(as.matrix(iris[, 1:2]) ~ Petal.Length + Petal.Width * Species, data = iris)
s <- hstats(fit, X = iris[, 3:5], verbose = FALSE)
plot(s)
summary(s)

# MODEL 3: Gamma GLM with log link
fit <- glm(Sepal.Length ~ ., data = iris, family = Gamma(link = log))

# No interactions for additive features, at least on link scale
s <- hstats(fit, X = iris[, -1], verbose = FALSE)
summary(s)

# On original scale, we have interactions everywhere. 
# To see three-way interactions, we set threeway_m to a value above 2.
s <- hstats(fit, X = iris[, -1], type = "response", threeway_m = 5)
plot(s, ncol = 1)  # All three types use different denominators

# All statistics on same scale (of predictions)
plot(s, squared = FALSE, normalize = FALSE, facet_scale = "free_y")
}
\references{
Friedman, Jerome H., and Bogdan E. Popescu. \emph{"Predictive Learning via Rule Ensembles."}
The Annals of Applied Statistics 2, no. 3 (2008): 916-54.
}
\seealso{
\code{\link[=h2]{h2()}}, \code{\link[=h2_overall]{h2_overall()}}, \code{\link[=h2_pairwise]{h2_pairwise()}}, \code{\link[=h2_threeway]{h2_threeway()}},
and \code{\link[=pd_importance]{pd_importance()}} for specific statistics calculated from the resulting object.
}
