% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nestcv.train.R
\name{nestcv.train}
\alias{nestcv.train}
\title{Nested cross-validation for caret}
\usage{
nestcv.train(
  y,
  x,
  filterFUN = NULL,
  filter_options = NULL,
  outer_method = c("cv", "LOOCV"),
  n_outer_folds = 10,
  outer_folds = NULL,
  cv.cores = 1,
  metric = ifelse(is.factor(y), "logLoss", "RMSE"),
  trControl = NULL,
  tuneGrid = NULL,
  savePredictions = "final",
  na.option = "pass",
  ...
)
}
\arguments{
\item{y}{Response vector. For classification this should be a factor.}

\item{x}{Matrix or dataframe of predictors}

\item{filterFUN}{Filter function, e.g. \link{ttest_filter} or \link{relieff_filter}.
Any function can be provided and is passed \code{y} and \code{x}. Must return a
character vector with names of filtered predictors.}

\item{filter_options}{List of additional arguments passed to the filter
function specified by \code{filterFUN}.}

\item{outer_method}{String of either \code{"cv"} or \code{"LOOCV"} specifying whether
to do k-fold CV or leave one out CV (LOOCV) for the outer folds}

\item{n_outer_folds}{Number of outer CV folds}

\item{outer_folds}{Optional list containing indices of test folds for outer
CV. If supplied, \code{n_outer_folds} is ignored.}

\item{cv.cores}{Number of cores for parallel processing of the outer loops.
NOTE: this uses \code{parallel::mclapply} on unix/mac and \code{parallel::parLapply}
on windows.}

\item{metric}{A string that specifies what summary metric will be used to
select the optimal model. By default, "logLoss" is used for classification
and "RMSE" is used for regression. Note this differs from the default
setting in caret which uses "Accuracy" for classification. See details.}

\item{trControl}{A list of values generated by the \code{caret} function
\link[caret:trainControl]{trainControl}. This defines how inner CV training through \code{caret} is
performed. Default for the inner loop is 10-fold CV. See
\url{http://topepo.github.io/caret/using-your-own-model-in-train.html}.}

\item{tuneGrid}{Data frame of tuning values, see \link[caret:train]{caret::train}.}

\item{savePredictions}{Indicates whether hold-out predictions for each inner
CV fold should be saved for ROC curves, accuracy etc.; see
\link[caret:trainControl]{caret::trainControl}. Default is \code{"final"} to capture predictions for
inner CV ROC.}

\item{na.option}{Character value specifying how \code{NA}s are dealt with.
\code{"omit"} is equivalent to \code{na.action = na.omit}. \code{"omitcol"} removes cases
if there are \code{NA} in 'y', but columns (predictors) containing \code{NA} are
removed from 'x' to preserve cases. Any other value, including the default
\code{"pass"}, means that \code{NA}s are ignored (a message is given).}

\item{...}{Arguments passed to \link[caret:train]{caret::train}}
}
\value{
An object with S3 class "nestcv.train"
\item{call}{the matched call}
\item{output}{Predictions on the left-out outer folds}
\item{outer_result}{List object of results from each outer fold containing
predictions on left-out outer folds, caret result and number of filtered
predictors at each fold.}
\item{dimx}{dimensions of \code{x}}
\item{outer_folds}{List of indices of outer test folds}
\item{final_fit}{Final fitted caret model using best tune parameters}
\item{final_vars}{Column names of filtered predictors entering final model}
\item{roc}{ROC AUC for binary classification where available.}
\item{trControl}{\code{caret::trainControl} object used for inner CV}
\item{bestTunes}{best tuned parameters from each outer fold}
\item{finalTune}{final parameters used for final model}
\item{summary}{Overall performance summary. Accuracy and balanced accuracy
for classification. ROC AUC for binary classification. RMSE for
regression.}
}
\description{
This function applies nested cross-validation (CV) to training of models
using the \code{caret} package. The function also allows the option of embedded
filtering of predictors for feature selection nested within the outer loop of
CV. Predictions on the outer test folds are brought back together and error
estimation/accuracy determined. The default is 10x10 nested CV.
}
\details{
Parallelisation is performed on the outer folds using \code{parallel::mclapply}
on unix/mac and \code{parallel::parLapply} on windows.
For classification \code{metric} defaults to using 'logLoss' with the
\code{trControl} arguments \verb{classProbs = TRUE, summaryFunction = mnLogLoss},
rather than 'Accuracy' which is the default classification metric in
\code{caret}. See \link[caret:trainControl]{trainControl}. LogLoss is arguably more consistent than
Accuracy for tuning parameters in datasets with small sample size.

Models can be fitted with a single set of fixed parameters, in which case
\code{trControl} defaults to \code{trainControl(method = "none")} which disables
inner CV as it is unnecessary. See
\url{https://topepo.github.io/caret/model-training-and-tuning.html#fitting-models-without-parameter-tuning}.
}
\examples{
\donttest{
## sigmoid function: maps any real value into the interval (0, 1)
sigmoid <- function(x) {1 / (1 + exp(-x))}

## load iris dataset and simulate a binary outcome
data(iris)
## use the first four columns of iris as predictors and rename them
x <- iris[, 1:4]
colnames(x) <- c("marker1", "marker2", "marker3", "marker4")
## scale each predictor column
x <- as.data.frame(apply(x, 2, scale))
## outcome is TRUE when the sigmoid of a linear combination of marker1 and
## marker2 exceeds a uniform random draw; no seed is set, so the simulated
## outcome differs between runs
y2 <- sigmoid(0.5 * x$marker1 + 2 * x$marker2) > runif(nrow(x))
## convert the logical outcome to a factor: FALSE -> class1, TRUE -> class2
y2 <- factor(y2, labels = c("class1", "class2"))

## Example using random forest with caret
## method is passed on to caret::train via ...; 3 outer folds and 2 cores
## keep the run time short
cvrf <- nestcv.train(y2, x, method = "rf",
                     n_outer_folds = 3,
                     cv.cores = 2)
summary(cvrf)

## Example of glmnet tuned using caret
## set up small tuning grid for quick execution
## length.out of 20-100 is usually recommended for lambda
## and more alpha values ranging from 0-1
## here: 5 lambda values evenly spaced on the log scale between 2e-3 and 1,
## with alpha fixed at 1
tg <- expand.grid(lambda = exp(seq(log(2e-3), log(1e0), length.out = 5)),
                  alpha = 1)

## nested CV with glmnet using the custom tuning grid for the inner CV
ncv <- nestcv.train(y = y2, x = x,
                    method = "glmnet",
                    n_outer_folds = 3,
                    tuneGrid = tg, cv.cores = 2)
summary(ncv)

## plot tuning for outer fold #1
## (xTrans = log presumably puts the tuning parameter axis on a log scale;
## see caret's plot method for train objects)
plot(ncv$outer_result[[1]]$fit, xTrans = log)

## plot final ROC curve
## (built from predictions on the left-out outer folds)
plot(ncv$roc)

## plot ROC for left-out inner folds
inroc <- innercv_roc(ncv)
plot(inroc)
}
}
\author{
Myles Lewis
}
