% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ipd.R
\name{ipd}
\alias{ipd}
\title{Inference on Predicted Data (ipd)}
\usage{
ipd(
  formula,
  method,
  model,
  data,
  label = NULL,
  unlabeled_data = NULL,
  seed = NULL,
  intercept = TRUE,
  alpha = 0.05,
  alternative = "two-sided",
  n_t = Inf,
  na_action = "na.fail",
  ...
)
}
\arguments{
\item{formula}{An object of class \code{formula}: a symbolic description of
the model to be fitted. Must be of the form \code{Y - f ~ X}, where \code{Y}
is the name of the column corresponding to the observed outcome in the
labeled data, \code{f} is the name of the column corresponding to the
predicted outcome in both labeled and unlabeled data, and \code{X}
corresponds to the features of interest (i.e., \code{X = X1 + ... + Xp}).
See \strong{1. Formula} in the \strong{Details} below for more information.}

\item{method}{The IPD method to be used for fitting the model. Must be one of
\code{"postpi_analytic"}, \code{"postpi_boot"}, \code{"ppi"},
\code{"ppi_plusplus"}, or \code{"pspa"}.
See \strong{3. Method} in the \strong{Details} below for more information.}

\item{model}{The type of downstream inferential model to be fitted, or the
parameter being estimated. Must be one of \code{"mean"},
\code{"quantile"}, \code{"ols"}, \code{"logistic"}, or \code{"poisson"}.
See \strong{4. Model} in the \strong{Details} below for more information.}

\item{data}{A \code{data.frame} containing the variables in the model,
either a stacked data frame with a specific column identifying the labeled
versus unlabeled observations (\code{label}), or only the labeled data
set. Must contain columns for the observed outcomes (\code{Y}), the
predicted outcomes (\code{f}), and the features (\code{X}) needed to specify
the \code{formula}. See \strong{2. Data} in the \strong{Details} below for
more information.}

\item{label}{A \code{string}, \code{int}, or \code{logical} specifying the
column in the data that distinguishes between the labeled and unlabeled
observations. If \code{NULL}, \code{unlabeled_data} must be specified. See
\strong{2. Data} in the \strong{Details} below for more information.}

\item{unlabeled_data}{(optional) A \code{data.frame} of unlabeled data. If
NULL, \code{label} must be specified. Specifying both the \code{label} and
\code{unlabeled_data} arguments will result in an error message. If
specified, must contain columns for the predicted outcomes (\code{f}), and
the features (\code{X}) needed to specify the \code{formula}. See
\strong{2. Data} in the \strong{Details} below for more information.}

\item{seed}{(optional) An \code{integer} seed for random number generation.}

\item{intercept}{\code{Logical}. Should an intercept be included in the
model? Default is \code{TRUE}.}

\item{alpha}{The significance level for confidence intervals. Default is
\code{0.05}.}

\item{alternative}{A string specifying the alternative hypothesis. Must be
one of \code{"two-sided"}, \code{"less"}, or \code{"greater"}. Default is
\code{"two-sided"}.}

\item{n_t}{(integer, optional) Size of the dataset used to train the
prediction function (necessary for the \code{"postpi_analytic"} and
\code{"postpi_boot"} methods if \code{n_t} < \code{nrow(X_l)}).
Defaults to \code{Inf}.}

\item{na_action}{(string, optional) How missing covariate data should be
handled. Currently \code{"na.fail"} and \code{"na.omit"} are accommodated.
Defaults to \code{"na.fail"}.}

\item{...}{Additional arguments to be passed to the fitting function. See
\strong{5. Auxiliary Arguments} and \strong{6. Other Arguments} in the
\strong{Details} below for more information.}
}
\value{
A summary of model output.

A list containing the fitted model components:

\describe{
\item{coefficients}{Estimated coefficients of the model}
\item{se}{Standard errors of the estimated coefficients}
\item{ci}{Confidence intervals for the estimated coefficients}
\item{formula}{The formula used to fit the ipd model.}
\item{data}{The data frame used for model fitting.}
\item{method}{The method used for model fitting.}
\item{model}{The type of model fitted.}
\item{intercept}{Logical. Indicates if an intercept was included in the
model.}
\item{fit}{Fitted model object containing estimated coefficients, standard
errors, confidence intervals, and additional method-specific output.}
\item{...}{Additional output specific to the method used.}
}
}
\description{
The main wrapper function for conducting inference on predicted data (ipd)
using various methods and models. Returns a list of fitted model components.
}
\details{
\strong{1. Formula:}

The \code{ipd} function uses one formula argument that specifies both the
calibrating model (e.g., PostPI "relationship model", PPI "rectifier" model)
and the inferential model. These separate models will be created internally
based on the specific \code{method} called.

\strong{2. Data:}

The data can be specified in two ways:

\enumerate{
\item Single data argument (\code{data}) containing a stacked
\code{data.frame} and a label identifier (\code{label}).
\item Two data arguments, one for the labeled data (\code{data}) and one
for the unlabeled data (\code{unlabeled_data}).
}

For option (1), provide one data argument (\code{data}) which contains a
stacked \code{data.frame} with both the unlabeled and labeled data and a
\code{label} argument that specifies the column identifying the labeled
versus the unlabeled observations in the stacked \code{data.frame} (e.g.,
\code{label = "set_label"} if the column "set_label" in the stacked data
denotes which set an observation belongs to).

NOTE: Labeled data identifiers can be:

\describe{
\item{String}{"l", "lab", "label", "labeled", "labelled", "tst", "test",
"true"}
\item{Logical}{TRUE}
\item{Factor}{Non-reference category (i.e., binary 1)}
}

Unlabeled data identifiers can be:

\describe{
\item{String}{"u", "unlab", "unlabeled", "unlabelled", "val",
"validation", "false"}
\item{Logical}{FALSE}
\item{Factor}{Reference category (i.e., binary 0)}
}

For option (2), provide separate data arguments for the labeled data set
(\code{data}) and the unlabeled data set (\code{unlabeled_data}). If the
second argument is provided, the function ignores the \code{label} identifier
and assumes the data provided are not stacked.

NOTE: Columns in \code{data} or \code{unlabeled_data} are not used unless
they are explicitly referenced in the \code{formula} argument or in the
\code{label} argument (if the data are passed as one stacked data frame).

\strong{3. Method:}

Use the \code{method} argument to specify the fitting method:

\describe{
\item{"postpi_analytic"}{Wang et al. (2020) Post-Prediction Inference (PostPI) Analytic Correction}
\item{"postpi_boot"}{Wang et al. (2020) Post-Prediction Inference (PostPI) Bootstrap Correction}
\item{"ppi"}{Angelopoulos et al. (2023) Prediction-Powered Inference
(PPI)}
\item{"ppi_plusplus"}{Angelopoulos et al. (2023) PPI++}
\item{"pspa"}{Miao et al. (2023) Assumption-Lean and Data-Adaptive
Post-Prediction Inference (PSPA)}
}

\strong{4. Model:}

Use the \code{model} argument to specify the type of downstream inferential
model or parameter to be estimated:

\describe{
\item{"mean"}{Mean value of a continuous outcome}
\item{"quantile"}{\code{q}th quantile of a continuous outcome}
\item{"ols"}{Linear regression coefficients for a continuous outcome}
\item{"logistic"}{Logistic regression coefficients for a binary outcome}
\item{"poisson"}{Poisson regression coefficients for a count outcome}
}

The \code{ipd} wrapper function will concatenate the \code{method} and
\code{model} arguments to identify the required helper function, following
the naming convention "method_model".

\strong{5. Auxiliary Arguments:}

The wrapper function will take method-specific auxiliary arguments (e.g.,
\code{q} for the quantile estimation models) and pass them to the helper
function through the "..." with specified defaults for simplicity.

\strong{6. Other Arguments:}

All other arguments that relate to all methods (e.g., alpha, ci.type), or
other method-specific arguments, will have defaults.
}
\examples{

#-- Generate Example Data

set.seed(12345)

dat <- simdat(n = c(300, 300, 300), effect = 1, sigma_Y = 1)

head(dat)

formula <- Y - f ~ X1

#-- PostPI Analytic Correction (Wang et al., 2020)

ipd(formula, method = "postpi_analytic", model = "ols",

    data = dat, label = "set_label")

#-- PostPI Bootstrap Correction (Wang et al., 2020)

nboot <- 200

ipd(formula, method = "postpi_boot", model = "ols",

    data = dat, label = "set_label", nboot = nboot)

#-- PPI (Angelopoulos et al., 2023)

ipd(formula, method = "ppi", model = "ols",

    data = dat, label = "set_label")

#-- PPI++ (Angelopoulos et al., 2023)

ipd(formula, method = "ppi_plusplus", model = "ols",

    data = dat, label = "set_label")

#-- PSPA (Miao et al., 2023)

ipd(formula, method = "pspa", model = "ols",

    data = dat, label = "set_label")

}
