% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/do_cv_direct.R
\name{do_cv_direct}
\alias{do_cv_direct}
\title{Direct HTRX: k-fold cross-validation on short haplotypes}
\usage{
do_cv_direct(
  data_nosnp,
  featuredata,
  featurecap = dim(featuredata)[2],
  usebinary = 1,
  method = "simple",
  criteria = "BIC",
  gain = TRUE,
  runparallel = FALSE,
  mc.cores = 6,
  fold = 10,
  kfoldseed = 123,
  verbose = FALSE
)
}
\arguments{
\item{data_nosnp}{a data frame with outcome (the outcome must be the first column),
fixed covariates (for example, sex, age and the first 18 PCs) if there are any,
and without SNPs or haplotypes.}

\item{featuredata}{a data frame of the feature data, e.g. haplotype data created by HTRX or SNPs.
These features exclude all the data in \code{data_nosnp}, and will be selected using direct k-fold cross-validation.}

\item{featurecap}{a positive integer which manually sets the maximum number of independent features.
By default, \code{featurecap=dim(featuredata)[2]}, i.e. the number of columns of \code{featuredata}.}

\item{usebinary}{a non-negative number representing different models.
Use linear model if \code{usebinary=0},
use logistic regression model via \code{fastglm} if \code{usebinary=1} (by default),
and use logistic regression model via \code{glm} if \code{usebinary>1}.}

\item{method}{the method used for data splitting, either \code{"simple"} (default) or \code{"stratified"}.}

\item{criteria}{the criteria for model selection, either \code{"BIC"} (default), \code{"AIC"} or \code{"lasso"}.}

\item{gain}{logical. If \code{gain=TRUE} (default), report the variance explained in addition to fixed covariates;
otherwise, report the total variance explained by all the variables.}

\item{runparallel}{logical. Use parallel programming based on \code{mclapply} function from R package \code{"parallel"} or not.
Note that for Windows users, \code{mclapply} doesn't work, so please set \code{runparallel=FALSE} (default).}

\item{mc.cores}{an integer giving the number of cores used for parallel programming.
By default, \code{mc.cores=6}.
This only works when \code{runparallel=TRUE}.}

\item{fold}{a positive integer specifying how many folds
the data should be split into for cross-validation. By default, \code{fold=10}.}

\item{kfoldseed}{a positive integer specifying the seed used to
split data for k-fold cross validation. By default, \code{kfoldseed=123}.}

\item{verbose}{logical. If \code{verbose=TRUE}, print out the inference steps. By default, \code{verbose=FALSE}.}
}
\value{
\code{do_cv_direct} returns a list of the out-of-sample variance explained in each of the k test sets,
and the features selected in each of the k training sets.
}
\description{
Direct k-fold cross-validation used to compute the out-of-sample variance explained by selected features from HTRX.
It can be applied to select haplotypes based on HTR, or select single nucleotide polymorphisms (SNPs).
}
\details{
Function \code{do_cv_direct} directly performs k-fold cross-validation: features are
selected from the training set using a specified \code{criteria},
and the out-of-sample variance explained by the selected features is computed on the test set.
This function runs faster than \code{\link{do_cv}} with large \code{sim_times}, but may lose
some accuracy, and it doesn't return a fixed set of features.
}
\examples{
## use dataset "example_hap1", "example_hap2" and "example_data_nosnp"
## "example_hap1" and "example_hap2" are
## both genomes of 8 SNPs for 5,000 individuals (diploid data)
## "example_data_nosnp" is an example dataset
## which contains the outcome (binary), sex, age and 18 PCs

## visualise the covariates data
## we will use only the first two covariates: sex and age in the example
head(HTRX::example_data_nosnp)

## visualise the genotype data for the first genome
head(HTRX::example_hap1)

## we perform HTRX on the first 4 SNPs
## we first generate all the haplotype data, as defined by HTRX
HTRX_matrix=make_htrx(HTRX::example_hap1[,1:4],
                      HTRX::example_hap2[,1:4])

## If the data is haploid, please set
## HTRX_matrix=make_htrx(HTRX::example_hap1[,1:4],
##                       HTRX::example_hap1[,1:4])

## next compute the maximum number of independent features
featurecap=htrx_max(nsnp=4,cap=10)
## then perform HTRX using direct cross-validation
## If we want to compute the total variance explained
## we can set gain=FALSE in the example below
\donttest{
htrx_results <- do_cv_direct(HTRX::example_data_nosnp[,1:3],
                             HTRX_matrix,featurecap=featurecap,
                             usebinary=1,method="stratified",
                             criteria="lasso",gain=TRUE,
                             runparallel=FALSE,verbose=TRUE)
}
}
\references{
Barrie, William, et al. "Genetic risk for Multiple Sclerosis originated in Pastoralist Steppe populations." bioRxiv (2022).

Efron, B. "Bootstrap methods: another look at the jackknife." The Annals of Statistics 7 (1979): 1-26.

Schwarz, Gideon. "Estimating the dimension of a model." The Annals of Statistics (1978): 461-464.

McFadden, Daniel. "Conditional logit analysis of qualitative choice behavior." (1973).

Akaike, Hirotugu. "A new look at the statistical model identification." IEEE transactions on automatic control 19.6 (1974): 716-723.

Tibshirani, Robert. "Regression shrinkage and selection via the lasso." Journal of the Royal Statistical Society: Series B (Methodological) 58.1 (1996): 267-288.
}
