% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/feature_effects.R
\name{feature_effects}
\alias{feature_effects}
\alias{feature_effects.default}
\alias{feature_effects.ranger}
\alias{feature_effects.explainer}
\title{Feature Effects}
\usage{
feature_effects(object, ...)

\method{feature_effects}{default}(
  object,
  v,
  data,
  y = NULL,
  pred = NULL,
  pred_fun = stats::predict,
  trafo = NULL,
  which_pred = NULL,
  w = NULL,
  breaks = "Sturges",
  right = TRUE,
  discrete_m = 5L,
  outlier_iqr = 2,
  calc_pred = TRUE,
  pd_n = 500L,
  ale_n = 50000L,
  ale_bin_size = 200L,
  seed = NULL,
  ...
)

\method{feature_effects}{ranger}(
  object,
  v,
  data,
  y = NULL,
  pred = NULL,
  pred_fun = NULL,
  trafo = NULL,
  which_pred = NULL,
  w = NULL,
  breaks = "Sturges",
  right = TRUE,
  discrete_m = 5L,
  outlier_iqr = 2,
  calc_pred = TRUE,
  pd_n = 500L,
  ale_n = 50000L,
  ale_bin_size = 200L,
  ...
)

\method{feature_effects}{explainer}(
  object,
  v = colnames(data),
  data = object$data,
  y = object$y,
  pred = NULL,
  pred_fun = object$predict_function,
  trafo = NULL,
  which_pred = NULL,
  w = object$weights,
  breaks = "Sturges",
  right = TRUE,
  discrete_m = 5L,
  outlier_iqr = 2,
  calc_pred = TRUE,
  pd_n = 500L,
  ale_n = 50000L,
  ale_bin_size = 200L,
  ...
)
}
\arguments{
\item{object}{Fitted model.}

\item{...}{Further arguments passed to \code{pred_fun()}, e.g., \code{type = "response"} in
a \code{glm()} or (typically) \code{prob = TRUE} in classification models.}

\item{v}{Vector of variable names to calculate statistics.}

\item{data}{Matrix or data.frame.}

\item{y}{Numeric vector with observed values of the response.
Can also be a column name in \code{data}. Omitted if \code{NULL} (default).}

\item{pred}{Numeric vector with predictions. If \code{NULL}, it is calculated as
\code{pred_fun(object, data, ...)}. Used to save time if \code{feature_effects()} is to be
called multiple times.}

\item{pred_fun}{Prediction function, by default \code{stats::predict}.
The function takes three arguments (names irrelevant): \code{object}, \code{data}, and \code{...}.}

\item{trafo}{How should predictions be transformed?
A function or \code{NULL} (default). Examples are \code{log} (to switch to link scale)
or \code{exp} (to switch from link scale to the original scale).}

\item{which_pred}{If the predictions are multivariate: which column to pick
(integer or column name). By default \code{NULL} (picks last column).}

\item{w}{Optional vector with case weights. Can also be a column name in \code{data}.}

\item{breaks}{An integer, vector, string or function specifying the bins
of the numeric X variables as in \code{\link[graphics:hist]{graphics::hist()}}. The default is "Sturges".
To allow varying values of \code{breaks} across variables, it can be a list of the
same length as \code{v}, or a \emph{named} list with \code{breaks} for certain variables.}

\item{right}{Should bins be right-closed? The default is \code{TRUE}.
Vectorized over \code{v}. Only relevant for numeric X.}

\item{discrete_m}{Numeric X variables with up to this number of unique values
are not binned but treated as a factor (after calculating partial dependence).
The default is 5. Vectorized over \code{v}.}

\item{outlier_iqr}{Outliers of a numeric X are capped via the boxplot rule, i.e.,
outside \code{outlier_iqr} * IQR from the quartiles. The default of 2 is more
conservative than the usual rule to account for right-skewed distributions.
Set to 0 or \code{Inf} for no capping. Note that at most 10k observations are sampled
to calculate quartiles. Vectorized over \code{v}.}

\item{calc_pred}{Should predictions be calculated? Default is \code{TRUE}. Only relevant
if \code{pred = NULL}.}

\item{pd_n}{Size of the data used for calculating partial dependence.
The default is 500. For larger \code{data} (and \code{w}), \code{pd_n} rows are randomly sampled.
Each variable specified by \code{v} uses the same subsample. Set to 0 to omit.}

\item{ale_n}{Size of the data used for calculating ALE.
The default is 50000. For larger \code{data} (and \code{w}), \code{ale_n} rows are randomly
sampled. Each variable specified by \code{v} uses the same subsample. Set to 0 to omit.}

\item{ale_bin_size}{Maximal number of observations used per bin for ALE calculations.
If there are more observations in a bin, \code{ale_bin_size} indices are
randomly sampled. The default is 200. Applied after subsampling regarding \code{ale_n}.}

\item{seed}{Optional random seed (an integer) used for:
\itemize{
\item Partial dependence: select background data if \code{n > pd_n}.
\item ALE: select background data if \code{n > ale_n} and for bins > \code{ale_bin_size}.
\item Capping X: quartiles are calculated from a sample of at most 10k observations.
}}
}
\value{
A list (of class "EffectData") with a data.frame of statistics per feature. Use
single bracket subsetting to select part of the output.
}
\description{
This is the main function of the package. By default, it calculates
the following statistics per feature X over values/bins:
\itemize{
\item "y_mean": Average observed \code{y} values. Used to assess descriptive associations
between response and features.
\item "pred_mean": Average predictions. Corresponds to "M Plots" (from "marginal")
in Apley (2020). Shows the combined effect of X and other (correlated) features.
The difference to average observed y values shows model bias.
\item "resid_mean": Average residuals. Calculated when
both \code{y} and predictions are available. Useful to study model bias.
\item "pd": Partial dependence (Friedman, 2001): See \code{\link[=partial_dependence]{partial_dependence()}}.
Evaluated at bin averages, not at bin midpoints.
\item "ale": Accumulated local effects (Apley, 2020): See \code{\link[=ale]{ale()}}. Only for numeric X.
}

Additionally, corresponding counts/weights are calculated, and
standard deviations of observed y and residuals.

Numeric X with more than \code{discrete_m = 5} distinct values are binned as in
\code{\link[graphics:hist]{graphics::hist()}} via \code{breaks}. Before calculating bins, outliers are capped
at +-2 IQR from the quartiles.

All averages and standard deviations are weighted by optional weights \code{w}.

If you need only one specific statistic, you can use the simplified APIs of
\itemize{
\item \code{\link[=average_observed]{average_observed()}},
\item \code{\link[=average_predicted]{average_predicted()}},
\item \code{\link[=bias]{bias()}},
\item \code{\link[=partial_dependence]{partial_dependence()}}, and
\item \code{\link[=ale]{ale()}}.
}
}
\section{Methods (by class)}{
\itemize{
\item \code{feature_effects(default)}: Default method.

\item \code{feature_effects(ranger)}: Method for "ranger" models.

\item \code{feature_effects(explainer)}: Method for DALEX "explainer".

}}
\examples{
fit <- lm(Sepal.Length ~ ., data = iris)
xvars <- colnames(iris)[2:5]
M <- feature_effects(fit, v = xvars, data = iris, y = "Sepal.Length", breaks = 5)
M
M |> update(sort = "pd") |> plot(share_y = "all")
}
\references{
\enumerate{
\item Molnar, Christoph. 2019. \emph{Interpretable Machine Learning: A Guide for Making Black Box Models Explainable}.
\url{https://christophm.github.io/interpretable-ml-book/}.
\item Friedman, Jerome H. 2001. \emph{Greedy Function Approximation: A Gradient Boosting Machine.}
Annals of Statistics 29 (5): 1189-1232. doi:10.1214/aos/1013203451.
\item Apley, Daniel W., and Jingyu Zhu. 2020. \emph{Visualizing the Effects of Predictor Variables in Black Box Supervised Learning Models.}
Journal of the Royal Statistical Society Series B: Statistical Methodology,
82 (4): 1059–1086. doi:10.1111/rssb.12377.
}
}
\seealso{
\code{\link[=plot.EffectData]{plot.EffectData()}}, \code{\link[=update.EffectData]{update.EffectData()}}, \code{\link[=partial_dependence]{partial_dependence()}},
\code{\link[=ale]{ale()}}, \code{\link[=average_observed]{average_observed()}}, \code{\link[=average_predicted]{average_predicted()}}, \code{\link[=bias]{bias()}}
}
