% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/meerva_210412.R
\name{meerva.fit}
\alias{meerva.fit}
\title{Analysis of Data with Measurement Error Using a Validation Subsample}
\usage{
meerva.fit(
  x_val,
  y_val,
  xs_val,
  ys_val,
  xs_non,
  ys_non,
  e_val = NULL,
  es_val = NULL,
  es_non = NULL,
  id_val = NULL,
  id_non = NULL,
  weights_val = NULL,
  weights_non = NULL,
  familyr = NULL,
  vmethod = NULL,
  jksize = 0,
  compare = 1
)
}
\arguments{
\item{x_val}{A matrix object including reference predictor variables (and predictors "without" error) in validation subsample.
This and other x_ matrices must not include any missing values (NA).
All data vectors and matrices must be numerical.  For categorical variables one
should first construct corresponding numerical variables to represent these categories.}

\item{y_val}{A vector object for the reference outcome variable in validation subsample.
This and other y_ vectors must not include any missing values (NA).}

\item{xs_val}{A matrix object including surrogate predictors (and predictors "without" error) in validation subsample}

\item{ys_val}{A vector object for the surrogate outcome variable in validation subsample.}

\item{xs_non}{A matrix object including surrogate predictors (and predictors "without" error) in NON validation data}

\item{ys_non}{A vector object for the surrogate outcome variable in the NON validation data.}

\item{e_val}{A vector object for the survival data reference event outcome variable in validation subsample.
This and the other e_ vectors are optional.
The e_ vectors are required when analyzing survival data based upon an underlying Cox regression model (survival package).
This and other e_ vectors must not include any missing values (NA).}

\item{es_val}{A vector object for the survival data surrogate event outcome variable in validation subsample.}

\item{es_non}{A vector object for the survival data surrogate event outcome variable in NON validation data.}

\item{id_val}{A vector object identifying clusters in case of multiple records per subject in the validation subsample.
This and id_non are optional. They must not include any missing values (NA).
No subjects should be included in both the validation subsample and the NON validation data.}

\item{id_non}{A vector object identifying clusters in case of multiple records per subject in the NON validation data.}

\item{weights_val}{A vector object with weights used in model fit of the validation subsample.
This can be used, for example, to describe inverse sampling probability weights.
Note, when fitting the "binomial" or logistic model, weights for weights_val and weights_non
must be integer.  This is a restriction of the glm.fit routine called from meerva.  The user may rescale or round the
weights to achieve integers.  By using robust variance estimates meerva provides correct variance estimates.}

\item{weights_non}{A vector object with weights used in model fit of the NON validation data.
This and weights_val can be used, for example, to down weight records from patients with multiple records.}

\item{familyr}{The family for the underlying regression model amongst "binomial", "gaussian" and "Cox".
If not specified the program chooses among these three based upon a simple data inspection. 
In principle, though not (yet) implemented here, the regression model for the reference variables may 
be of a different type than for the surrogate variables.  For example the reference outcome could be yes/no 
in nature while the surrogate outcome could be a numeric, and the method would continue to work.}

\item{vmethod}{Method for robust estimation of variance covariance matrices needed for calculation of the augmented estimates (beta aug).
0, 1 or 2 determines JK (slow), IJK using dfbeta of glm or coxph, or IJK using an alternate formula for dfbeta.
Recommendations:  For "gaussian" use 1, for "Cox" use 1 for speed and 0 for accuracy,
and for "binomial" use 2 for speed, and 0 for accuracy.}

\item{jksize}{Number of elements to leave out in each cycle of the grouped jackknife
for non validation data.  The default is 0 where the program chooses jksize
so that the number of leave out groups is about the validation subsample size.
For the grouped jackknife the program randomly sorts the non validation subsample.  To get
the exact same results twice one can set the seed for the random generator with the
statement set.seed(seed) for some value of seed, and to get a "random" seed one can
first run the statement seed = round(runif(1)*1000000000) .}

\item{compare}{1 to compare gamma_val with gamma_ful (default) or 0 with gamma_non.
Comparison of gamma_val with gamma_ful is consistent with the principle of the
validation set being a subsample of the entire dataset.  This assures the 
correlations between gamma_val and beta_val are representative of what
should be the case based upon the whole dataset.  If there were an 
external validation sample where one could be reasonably certain 
that the correlation between gamma_val and beta_val would be
representative then one could also use this method.}
}
\value{
meerva.fit returns an object of class meerva which contains
the augmented estimates based upon the full data set accounting for measurement error,
estimates based upon reference variables from the validation subsample,
estimates based upon the surrogate variables from the whole sample,
along with robust variance-covariance matrix estimates for these estimates.
This meerva class list contains the following objects.

Call     --- The call used to invoke meerva.fit.

FitInput --- A list with

--- familyr --- The type of regression model fit to the data.

--- compare --- The input parameter compare, 1 to compare the validation
           data with the whole dataset, or 0 to compare with the NON validation data.

--- comparec --- A short text interpretation of compare.

--- vmethod --- The method used to estimate the variance-covariance
           matrices needed for calculation of the estimates.

--- vmethodc --- A short text description of vmethod.

--- n_val    --- The number of observations in the validation subsample.

--- n_ful    --- The number of observations in the whole dataset.

--- n_val_id --- The number of clusters identified by id_val in the validation subsample.

--- n_ful_id --- The number of clusters identified by id_val and id_non in the whole dataset.

--- dim_beta --- The number of parameters in the regression model
          for reference variables including a possible intercept.

--- dim_gamma --- The number of parameters in the regression
          model for surrogate variables including a possible intercept.

names_x  ---  The reference variable predictors used in analysis.

names_xs ---  The surrogate variable predictors used in analysis.

names_y  ---  The reference outcome variable used in analysis.

names_ys ---  The surrogate outcome variable used in analysis.

coef_beta --- The regression parameter estimates for the reference
   variables including both beta_val based upon the reference variables alone
   (available only in the validation subsample) and beta_aug, the augmented 
   estimates based upon the reference variables in the validation subsample
   augmented by the surrogate variables in the whole dataset.

coef_gamma --- The regression parameter estimates for the surrogate
   variables for both gamma_val derived using dataset elements included
   in the validation subsample, and
   either gamma_ful or gamma_non, derived using either the whole 
   sample or the NON validation data.

var_beta --- Robust variance estimates for coef_beta, which are
   also included in vcov_beta_aug and vcov_beta_val.

var_gamma --- Robust variance estimates for coef_gamma, which are
   also included in vcov_gamma_ful or vcov_gamma_non.

vcov_beta_aug --- Robust variance-covariance estimates for beta_aug of coef_beta.

vcov_beta_val ---  Robust variance-covariance estimates for beta_val of coef_beta.

vcov_beta_val_naive --- Naive variance-covariance estimates for beta_val
   of coef_beta obtained without any consideration of clustering optionally
   described by input parameters id_val and id_non.

vcov_gamma_ful --- Robust variance-covariance estimates for gamma_ful of coef_gamma.

or vcov_gamma_non --- Robust variance-covariance estimates for gamma_non of coef_gamma.

vcov_gamma_ful_naive --- Naive variance-covariance estimates for
   gamma_ful of coef_gamma obtained without any consideration of
   clustering optionally described by input parameters id_val and id_non.

or vcov_gamma_non_naive --- Like vcov_gamma_ful_naive but for gamma_non.

omega --- The robust covariance estimate between beta_val and either
   gamma_ful or gamma_non, which is integral for derivation of beta_aug.

omega_cor --- The robust correlation estimate between beta_val and
   either gamma_ful or gamma_non, which reflects the relative amount
   of information on reference variable estimates contained in the
   surrogate variables.

kappa --- The robust variance covariance estimate of either
   (gamma_val - gamma_ful) or (gamma_val - gamma_non), which is
   integral for derivation of beta_aug.
}
\description{
The meerva package is designed to analyze data with measurement error when there is a
validation subsample randomly selected from the full sample.  The method assumes
surrogate variables measured with error are available for the full sample,
and reference variables measured with little or no error are available for this randomly
chosen subsample of the full sample.  Measurement errors may be differential or
non differential, in any or all predictors (simultaneously) as well as outcome.
The "augmented" estimates derived by meerva are based upon the multivariate correlation between regression
models based upon the reference variables and the surrogate variables in the validation subset.  Because the
validation subsample is chosen at random, whatever biases are imposed by measurement error, non-differential
or differential, are reflected in this correlation and can be used to derive estimates for the reference
variables using data from the whole sample.

Intuitively one expects there to be at least one surrogate for each reference variable but the method is based
upon multivariate correlations and therefore also works if there are more or fewer surrogate than reference variables.
The package fits linear, logistic or Cox regression models individually to the reference variables and to the
surrogate variables, then combines the results to describe a model in terms of the reference variables based
upon the entire dataset.
}
\details{
As currently implemented the package requires the data to be input as
vectors and matrices with no missing values (NA).
All data vectors and matrices must be numerical.  For categorical variables one
should first construct corresponding numerical variables to represent these categories.
Note, variables thought of as measured without error should be included in both the reference variable set and
the surrogate variable set.  Such variables may be thought of as perfect surrogates.  This applies for both
outcome variables and predictor variables.  For the Cox model both the time to event and the event indicator
may be measured with error.

The length of the vectors for the validation subsample must all be the same, and be the same as the number of rows
in the predictor matrices for the validation subsample.  Data for sample elements not included in the validation
subsample are referred to as NON validation data and are to be included in separate vectors and matrices.  Here, too, the
length of all vectors must be the same as the number of rows in the predictor matrix.  The columns in the data matrix for the
validation subsample surrogates must be logically the same as the columns in the data matrix for the
NON validation surrogates.

The data for analysis may include weights, for example to account for non identical
sampling probabilities when selecting the subsample, by taking weights as the inverse of these probabilities.
The data may include cluster identifiers in case of multiple observations on study participants.
Weights may also be used to lessen the influence of individuals with multiple observations.

Internally the analysis uses robust variance estimation which accounts for deviations from the usual regression
model assumptions for the surrogate variables, and accounts for multiple observations per patient.

This package came out of our work analyzing electronic health records data, where different sources, e.g. diagnosis codes
and natural language processing, may provide different surrogate variables.  Reference variables were obtained by
manual chart review.  For our datasets to date with tens of thousands of patients the analyses take a few seconds
when run on a PC.

In the examples we generate simulated data of the form expected for input, call the main program,
and summarize the output.
}
\examples{
#======================================================

# Logistic regression example: simulate data with measurement error
simd <- meerva.sim.brn(
  n = 4000, m = 400,
  beta   = c(-0.5, 0.5, 0.2, 1, 0.5),
  alpha1 = c(0.95, 0.90, 0.90, 0.95),
  alpha2 = c(0.98, 0.94, 0.95, 0.95),
  bx3s1  = c(0.05, 0, 0, NA, NA),
  bx3s2  = c(NA, NA, NA)
)

# Copy the simulated components into the objects passed to meerva.fit
x_val  <- simd$x_val
y_val  <- simd$y_val
xs_val <- simd$xs_val
ys_val <- simd$ys_val
xs_non <- simd$xs_non
ys_non <- simd$ys_non

# Fit the model and summarize the result
brn.me <- meerva.fit(x_val, y_val, xs_val, ys_val, xs_non, ys_non)
summary(brn.me)

#======================================================

# Linear regression example: simulate data with measurement error
simd <- meerva.sim.nrm(
  n = 4000, m = 400,
  beta   = c(-0.5, 0.5, 0.2, 1, 0.5),
  alpha1 = c(-0.05, 0.1, 0.05, 0.1),
  alpha2 = c(0.95, 0.91, 0.9, 0.9),
  bx3s1  = c(0.05, 0, 0, NA, NA),
  bx3s2  = c(1.1, 0.9, 0.05),
  sd     = 5
)

# Copy the simulated components into the objects passed to meerva.fit
x_val  <- simd$x_val
y_val  <- simd$y_val
xs_val <- simd$xs_val
ys_val <- simd$ys_val
xs_non <- simd$xs_non
ys_non <- simd$ys_non

# Fit the model and summarize the result
nrm.me <- meerva.fit(x_val, y_val, xs_val, ys_val, xs_non, ys_non)
summary(nrm.me)

#======================================================
# Cox regression example: simulate data with measurement error
simd <- meerva.sim.cox(
  n = 4000, m = 400,
  beta   = c(-0.5, 0.5, 0.2, 1, 0.5),
  alpha1 = c(0.95, 0.90, 0.90, 0.95),
  alpha2 = c(0.98, 0.94, 0.94, 0.98),
  bx3s1  = c(0.05, 0, 0, NA, NA),
  bx3s2  = c(1.1, NA, NA),
  sd     = 0.1
)

# Copy the simulated components into the objects passed to meerva.fit;
# the e_ vectors carry the event outcome variables used for survival data
x_val  <- simd$x_val
y_val  <- simd$y_val
xs_val <- simd$xs_val
ys_val <- simd$ys_val
xs_non <- simd$xs_non
ys_non <- simd$ys_non
e_val  <- simd$e_val
es_val <- simd$es_val
es_non <- simd$es_non

# Fit the model and summarize the result
cox.me <- meerva.fit(x_val, y_val, xs_val, ys_val, xs_non, ys_non,
                     e_val, es_val, es_non)
summary(cox.me)

#======================================================

}
\references{
Chen Y-H, Chen H.  A Unified Approach to Regression Analysis under Double-Sampling Designs.
 Journal of the Royal Statistical Society. Series B (Statistical Methodology), 2000 (62) 449-460.

 Chen Y-H. Cox regression in cohort studies with validation sampling.
 Journal of the Royal Statistical Society. Series B (Statistical Methodology), 2002 64, 51-62.

 Wang X, Wang QH. Semiparametric linear transformation model with differential measurement error
 and validation sampling. J Multivariate Anal. 2015;141:67-80.

 Tong JY, Huang J, Chubak J, et al. An augmented estimation procedure for EHR-based association
 studies accounting for differential misclassification. J Am Med Inform Assn. 2020;27(2):244-253.
}
\seealso{
\code{\link{meerva.sim.block}} , \code{\link{meerva.sim.nrm}} , \code{\link{meerva.sim.brn}} , \code{\link{meerva.sim.cox}}
}
\author{
Walter Kremers (kremers.walter@mayo.edu)
}
