% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/generalized_bootstrap.R
\name{as_gen_boot_design}
\alias{as_gen_boot_design}
\title{Convert a survey design object to a generalized bootstrap replicate design}
\usage{
as_gen_boot_design(
  design,
  variance_estimator = NULL,
  replicates = 500,
  tau = "auto",
  exact_vcov = FALSE,
  psd_option = "warn",
  mse = getOption("survey.replicates.mse"),
  compress = TRUE
)
}
\arguments{
\item{design}{A survey design object created using the 'survey' (or 'srvyr') package,
with class \code{'survey.design'} or \code{'svyimputationList'}.}

\item{variance_estimator}{The name of the variance estimator
whose quadratic form matrix should be created.
See \link[svrep]{variance-estimators} for a
detailed description of each variance estimator.
Options include:
\itemize{
  \item{\strong{"Yates-Grundy"}: }{The Yates-Grundy variance estimator based on
  first-order and second-order inclusion probabilities.}
  \item{\strong{"Horvitz-Thompson"}: }{The Horvitz-Thompson variance estimator based on
  first-order and second-order inclusion probabilities.}
  \item{\strong{"Poisson Horvitz-Thompson"}: }{The Horvitz-Thompson variance estimator
  based on assuming Poisson sampling, with first-order inclusion probabilities
  inferred from the sampling probabilities of the survey design object.}
  \item{\strong{"Stratified Multistage SRS"}: }{The usual stratified multistage variance estimator
  based on estimating the variance of cluster totals within strata at each stage.}
  \item{\strong{"Ultimate Cluster"}: }{The usual variance estimator based on estimating
  the variance of first-stage cluster totals within first-stage strata.}
  \item{\strong{"Deville-1"}: }{A variance estimator for unequal-probability
  sampling without replacement, described in Matei and Tillé (2005)
  as "Deville 1".}
  \item{\strong{"Deville-2"}: }{A variance estimator for unequal-probability
  sampling without replacement, described in Matei and Tillé (2005)
  as "Deville 2".}
  \item{\strong{"SD1"}: }{The non-circular successive-differences variance estimator described by Ash (2014),
  sometimes used for variance estimation for systematic sampling.}
  \item{\strong{"SD2"}: }{The circular successive-differences variance estimator described by Ash (2014).
  This estimator is the basis of the "successive-differences replication" estimator commonly used
  for variance estimation for systematic sampling.}
}}

\item{replicates}{Number of bootstrap replicates (should be as large as possible, given computer memory/storage limitations).
A commonly-recommended default is 500.}

\item{tau}{Either \code{"auto"}, or a single number. This is the rescaling constant
used to avoid negative weights through the transformation \eqn{\frac{w + \tau - 1}{\tau}},
where \eqn{w} is the original weight and \eqn{\tau} is the rescaling constant \code{tau}. \cr
If \code{tau="auto"}, the rescaling factor is determined automatically as follows:
if all of the adjustment factors are nonnegative, then \code{tau} is set equal to 1;
otherwise, \code{tau} is set to the smallest value needed to rescale
the adjustment factors such that they are all at least \code{0.01}.}

\item{exact_vcov}{If \code{exact_vcov=TRUE}, the replicate factors will be generated
such that variance estimates for totals exactly match the results from the target variance estimator.
This requires that \code{replicates} exceeds the rank of the quadratic form matrix \eqn{\Sigma} of the target variance estimator.
The replicate factors are generated by applying PCA-whitening to a collection of draws
from a multivariate Normal distribution, then applying a coloring transformation
to the whitened collection of draws.}

\item{psd_option}{Either \code{"warn"} (the default) or \code{"error"}.
This option specifies what will happen if the target variance estimator
has a quadratic form matrix which is not positive semidefinite. This
can occasionally happen, particularly for two-phase designs. \cr
If \code{psd_option="error"}, then an error message will be displayed. \cr
If \code{psd_option="warn"}, then a warning message will be displayed,
and the quadratic form matrix will be approximated by the most similar
positive semidefinite matrix.
This approximation was suggested by Beaumont and Patak (2012),
who note that this is conservative in the sense of producing
overestimates of variance.
Beaumont and Patak (2012) argue that this overestimation is expected to be
small in magnitude. See \code{\link[svrep]{get_nearest_psd_matrix}}
for details of the approximation.}

\item{mse}{If \code{TRUE}, compute variances from sums of squares around the point estimate from the full-sample weights.
If \code{FALSE}, compute variances from sums of squares around the mean estimate from the replicate weights.}

\item{compress}{If \code{TRUE} (the default), the replicate weights are stored in a compressed form.
This reduces the computer memory required to represent the replicate weights and has no
impact on estimates.}
}
\value{
A replicate design object, with class \code{svyrep.design}, which can be used with the usual functions,
such as \code{svymean()} or \code{svyglm()}.

Use \code{weights(..., type = 'analysis')} to extract the matrix of replicate weights.

Use \code{as_data_frame_with_weights()} to convert the design object to a data frame with columns
for the full-sample and replicate weights.
}
\description{
Converts a survey design object to a replicate design object
with replicate weights formed using the generalized bootstrap method.
The generalized survey bootstrap is a method for forming bootstrap replicate weights
from a textbook variance estimator, provided that the variance estimator
can be represented as a quadratic form whose matrix is positive semidefinite
(this covers a large class of variance estimators).
}
\section{Statistical Details}{

Let \eqn{v(\hat{T}_y)} be the textbook variance estimator for an estimated population total \eqn{\hat{T}_y} of some variable \eqn{y}.
The base weight for case \eqn{i} in our sample is \eqn{w_i}, and we let \eqn{\breve{y}_i} denote the weighted value \eqn{w_iy_i}.
Suppose we can represent our textbook variance estimator as a quadratic form: \eqn{v(\hat{T}_y) = \mathbf{\breve{y}}^{\prime}\Sigma\mathbf{\breve{y}}},
for some \eqn{n \times n} matrix \eqn{\Sigma}.
The only constraint on \eqn{\Sigma} is that, for our sample, it must be symmetric and positive semidefinite.

The bootstrapping process creates \eqn{B} sets of replicate weights, where the \eqn{b}-th set of replicate weights is a vector of length \eqn{n} denoted \eqn{\mathbf{a}^{(b)}}, whose \eqn{k}-th value is denoted \eqn{a_k^{(b)}}.
This yields \eqn{B} replicate estimates of the population total, \eqn{\hat{T}_y^{*(b)}=\sum_{k \in s} a_k^{(b)} \breve{y}_k}, for \eqn{b=1, \ldots, B}, which can be used to estimate sampling variance.

\deqn{
  v_B\left(\hat{T}_y\right)=\frac{\sum_{b=1}^B\left(\hat{T}_y^{*(b)}-\hat{T}_y\right)^2}{B}
}

This bootstrap variance estimator can be written as a quadratic form:

  \deqn{
    v_B\left(\hat{T}_y\right) =\mathbf{\breve{y}}^{\prime}\Sigma_B \mathbf{\breve{y}}
  }
  where
  \deqn{
    \boldsymbol{\Sigma}_B = \frac{\sum_{b=1}^B\left(\mathbf{a}^{(b)}-\mathbf{1}_n\right)\left(\mathbf{a}^{(b)}-\mathbf{1}_n\right)^{\prime}}{B}
  }

Note that if the vector of adjustment factors \eqn{\mathbf{a}^{(b)}} has expectation \eqn{\mathbf{1}_n} and variance-covariance matrix \eqn{\boldsymbol{\Sigma}},
then we have the bootstrap expectation \eqn{E_{*}\left( \boldsymbol{\Sigma}_B \right) = \boldsymbol{\Sigma}}. Since the bootstrap process takes the sample values \eqn{\breve{y}} as fixed, the bootstrap expectation of the variance estimator is \eqn{E_{*} \left( \mathbf{\breve{y}}^{\prime}\Sigma_B \mathbf{\breve{y}}\right)= \mathbf{\breve{y}}^{\prime}\Sigma \mathbf{\breve{y}}}.
Thus, we can produce a bootstrap variance estimator with the same expectation as the textbook variance estimator simply by randomly generating \eqn{\mathbf{a}^{(b)}} from a distribution with the following two conditions:
\cr
    \strong{Condition 1}: \eqn{\quad \mathbf{E}_*(\mathbf{a})=\mathbf{1}_n}
\cr
    \strong{Condition 2}: \eqn{\quad \mathbf{E}_*\left[\left(\mathbf{a}-\mathbf{1}_n\right)\left(\mathbf{a}-\mathbf{1}_n\right)^{\prime}\right]=\mathbf{\Sigma}}
\cr \cr
While there are multiple ways to generate adjustment factors satisfying these conditions,
the simplest general method is to simulate from a multivariate normal distribution: \eqn{\mathbf{a} \sim MVN(\mathbf{1}_n, \boldsymbol{\Sigma})}.
This is the method used by this function.
}

\section{Details on Rescaling to Avoid Negative Adjustment Factors}{

Let \eqn{\mathbf{A} = \left[ \mathbf{a}^{(1)} \cdots \mathbf{a}^{(b)} \cdots \mathbf{a}^{(B)} \right]} denote the \eqn{(n \times B)} matrix of bootstrap adjustment factors.
To eliminate negative adjustment factors, Beaumont and Patak (2012) propose forming a rescaled matrix of nonnegative replicate factors \eqn{\mathbf{A}^S} by rescaling each adjustment factor \eqn{a_k^{(b)}} as follows:
\deqn{
   a_k^{S,(b)} = \frac{a_k^{(b)} + \tau - 1}{\tau}
 }
where \eqn{\tau \geq 1} and \eqn{\tau \geq 1 - a_k^{(b)}} for all \eqn{k} in \eqn{\left\{ 1,\ldots,n \right\}} and all \eqn{b} in \eqn{\left\{1, \ldots, B\right\}}.

The value of \eqn{\tau} can be set based on the realized adjustment factor matrix \eqn{\mathbf{A}} or by choosing \eqn{\tau} prior to generating the adjustment factor matrix \eqn{\mathbf{A}} so that \eqn{\tau} is likely to be large enough to prevent negative bootstrap weights.

If the adjustment factors are rescaled in this manner, it is important to adjust the scale factor used in estimating the variance with the bootstrap replicates, which becomes \eqn{\frac{\tau^2}{B}} instead of \eqn{\frac{1}{B}}.
\deqn{
 \textbf{Prior to rescaling: } v_B\left(\hat{T}_y\right) = \frac{1}{B}\sum_{b=1}^B\left(\hat{T}_y^{*(b)}-\hat{T}_y\right)^2
 }
\deqn{
 \textbf{After rescaling: } v_B\left(\hat{T}_y\right) = \frac{\tau^2}{B}\sum_{b=1}^B\left(\hat{T}_y^{S*(b)}-\hat{T}_y\right)^2
}
When sharing a dataset that uses rescaled weights from a generalized survey bootstrap, the documentation for the dataset should instruct the user to use replication scale factor \eqn{\frac{\tau^2}{B}} rather than \eqn{\frac{1}{B}} when estimating sampling variances.
}

\section{Two-Phase Designs}{

For a two-phase design, \code{variance_estimator} should be a list of variance estimators' names,
with two elements, such as \code{list('Ultimate Cluster', 'Poisson Horvitz-Thompson')}.
In two-phase designs, only the following estimators may be used for the second phase:
\itemize{
  \item "Ultimate Cluster"
  \item "Stratified Multistage SRS"
  \item "Poisson Horvitz-Thompson"
}
For statistical details on the handling of two-phase designs,
see the documentation for \link[svrep]{make_twophase_quad_form}.
}

\examples{
\dontrun{
library(survey)

# Example 1: Bootstrap based on the Yates-Grundy estimator ----

   ## Fix the random seed so that the randomly generated
   ## replicate factors are reproducible
   set.seed(2014)

   ## Load the 'election' example data from the 'survey' package
   data('election', package = 'survey')

   ## Create survey design object
   pps_design_yg <- svydesign(
     data = election_pps,
     id = ~1, fpc = ~p,
     pps = ppsmat(election_jointprob),
     variance = "YG"
   )

   ## Convert to generalized bootstrap replicate design
   gen_boot_design_yg <- pps_design_yg |>
     as_gen_boot_design(variance_estimator = "Yates-Grundy",
                        replicates = 1000, tau = "auto")

   ## Compare estimates from the original design
   ## and from the generalized bootstrap design
   svytotal(x = ~ Bush + Kerry, design = pps_design_yg)
   svytotal(x = ~ Bush + Kerry, design = gen_boot_design_yg)

# Example 2: Bootstrap based on the successive-difference estimator ----

   data('library_stsys_sample', package = 'svrep')

   ## First, ensure data are sorted in same order as was used in sampling
   library_stsys_sample <- library_stsys_sample[
     order(library_stsys_sample$SAMPLING_SORT_ORDER),
   ]

   ## Create a survey design object
   design_obj <- svydesign(
     data = library_stsys_sample,
     strata = ~ SAMPLING_STRATUM,
     ids = ~ 1,
     fpc = ~ STRATUM_POP_SIZE
   )

   ## Convert to generalized bootstrap replicate design
   gen_boot_design_sd2 <- as_gen_boot_design(
     design = design_obj,
     variance_estimator = "SD2",
     replicates = 2000
   )

   ## Estimate sampling variances
   svytotal(x = ~ TOTSTAFF, na.rm = TRUE, design = gen_boot_design_sd2)
   svytotal(x = ~ TOTSTAFF, na.rm = TRUE, design = design_obj)

# Example 3: Two-phase sample ----
# -- First phase is stratified systematic sampling,
# -- second phase is response/nonresponse modeled as Poisson sampling
# -- (uses the sorted 'library_stsys_sample' data from Example 2)

  ## Fit an intercept-only response propensity model,
  ## weighted by the inverse of the sampling probabilities
  nonresponse_model <- glm(
    data = library_stsys_sample,
    family = quasibinomial('logit'),
    formula = I(RESPONSE_STATUS == "Survey Respondent") ~ 1,
    weights = 1/library_stsys_sample$SAMPLING_PROB
  )

  ## Add the predicted response propensities to the data
  library_stsys_sample[['RESPONSE_PROPENSITY']] <- predict(
    nonresponse_model,
    newdata = library_stsys_sample,
    type = "response"
  )

  ## Create a two-phase design object
  twophase_design <- twophase(
    data = library_stsys_sample,
    # Identify cases included in second phase sample
    subset = ~ I(RESPONSE_STATUS == "Survey Respondent"),
    strata = list(~ SAMPLING_STRATUM, NULL),
    id = list(~ 1, ~ 1),
    probs = list(NULL, ~ RESPONSE_PROPENSITY),
    fpc = list(~ STRATUM_POP_SIZE, NULL)
  )

  ## Convert to generalized bootstrap replicate design,
  ## naming one variance estimator for each phase
  twophase_boot_design <- as_gen_boot_design(
    design = twophase_design,
    variance_estimator = list(
      "SD2", "Poisson Horvitz-Thompson"
    )
  )

  ## Estimate a total using the two-phase bootstrap design
  svytotal(x = ~ LIBRARIA, design = twophase_boot_design)

}
}
\references{
The generalized survey bootstrap was first proposed by Bertail and Combris (1997).
See Beaumont and Patak (2012) for a clear overview of the generalized survey bootstrap.
The generalized survey bootstrap represents one strategy for forming replication variance estimators
in the general framework proposed by Fay (1984) and Dippo, Fay, and Morganstein (1984).
\cr \cr
- Ash, S. (2014). "\emph{Using successive difference replication for estimating variances}."
\strong{Survey Methodology}, Statistics Canada, 40(1), 47–59.
\cr \cr
- Bellhouse, D.R. (1985). "\emph{Computing Methods for Variance Estimation in Complex Surveys}."
\strong{Journal of Official Statistics}, Vol.1, No.3.
\cr \cr
- Beaumont, Jean-François, and Zdenek Patak. 2012. “On the Generalized Bootstrap for Sample Surveys with Special Attention to Poisson Sampling: Generalized Bootstrap for Sample Surveys.” International Statistical Review 80 (1): 127–48. https://doi.org/10.1111/j.1751-5823.2011.00166.x.
\cr \cr
- Bertail, and Combris. 1997. “Bootstrap Généralisé d’un Sondage.” Annales d’Économie Et de Statistique, no. 46: 49. https://doi.org/10.2307/20076068.
\cr \cr
- Dippo, Cathryn, Robert Fay, and David Morganstein. 1984. “Computing Variances from Complex Samples with Replicate Weights.” In, 489–94. Alexandria, VA: American Statistical Association. http://www.asasrms.org/Proceedings/papers/1984_094.pdf.
\cr \cr
- Fay, Robert. 1984. “Some Properties of Estimates of Variance Based on Replication Methods.” In, 495–500. Alexandria, VA: American Statistical Association. http://www.asasrms.org/Proceedings/papers/1984_095.pdf.
\cr \cr
- Matei, Alina, and Yves Tillé. (2005).
“\emph{Evaluation of Variance Approximations and Estimators
in Maximum Entropy Sampling with Unequal Probability and Fixed Sample Size.}”
\strong{Journal of Official Statistics}, 21(4):543–70.
}
\seealso{
Use \code{\link[svrep]{estimate_boot_reps_for_target_cv}} to help choose the number of bootstrap replicates. \cr

For greater customization of the method, \code{\link[svrep]{make_quad_form_matrix}} can be used to
represent several common variance estimators as a quadratic form's matrix,
which can then be used as an input to \code{\link[svrep]{make_gen_boot_factors}}.
The function \code{\link[svrep]{rescale_reps}} is used to implement
the rescaling of the bootstrap adjustment factors.

See \link[svrep]{variance-estimators} for a
description of each variance estimator.
}
