% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/SimulateData.R
\name{sim_IMIFA}
\alias{sim_IMIFA}
\alias{sim_IMIFA_data}
\alias{sim_IMIFA_model}
\title{Simulate Data from a Mixture of Factor Analysers Structure}
\usage{
sim_IMIFA_data(N = 300L,
               G = 3L,
               P = 50L,
               Q = rep(floor(log(P)), G),
               pis = rep(1/G, G),
               mu = NULL,
               psi = NULL,
               loadings = NULL,
               scores = NULL,
               nn = NULL,
               loc.diff = 2,
               non.zero = P,
               forceQg = TRUE,
               method = c("conditional", "marginal"))

sim_IMIFA_model(res,
                method = c("conditional", "marginal"))
}
\arguments{
\item{N, G, P}{Desired overall number of observations, number of clusters, and number of variables in the simulated data set. Each must be a single integer.}

\item{Q}{Desired number of cluster-specific latent factors in the simulated data set. Can be specified either as a single integer if all clusters are to have the same number of factors, or a vector of length \code{G}. Defaults to \code{floor(log(P))} in each cluster. Should be less than the associated \code{\link{Ledermann}} bound and the number of observations in the corresponding cluster. The argument \code{forceQg} can be used to enforce this upper limit.}

\item{pis}{Mixing proportions of the clusters in the data set if \code{G > 1}. Must sum to 1. Defaults to \code{rep(1/G, G)}.}

\item{mu}{True values of the mean parameters, either as a single value, a vector of length \code{G}, a vector of length \code{P}, or a \code{G * P} matrix. If \code{mu} is missing, \code{loc.diff} is invoked to simulate distinct means for each cluster by default.}

\item{psi}{True values of uniqueness parameters, either as a single value, a vector of length \code{G}, a vector of length \code{P}, or a \code{G * P} matrix. As such the user can specify uniquenesses as a diagonal or isotropic matrix, and further constrain uniquenesses across clusters if desired. If \code{psi} is missing, uniquenesses are simulated via \code{1/rgamma(P, 2, 1)} within each cluster by default.}

\item{loadings}{True values of the loadings matrix/matrices. Must be supplied in the form of a list of numeric matrices when \code{G > 1}, otherwise a single matrix. Matrices must contain \code{P} rows and the number of columns must correspond to the values in \code{Q}. If \code{loadings} are not supplied, such matrices are populated with standard normal random variates by default (see \code{non.zero}).}

\item{scores}{True values of the latent factor scores, as an \code{N * max(Q)} numeric matrix. If \code{scores} are not supplied, such a matrix is populated with standard normal random variates by default. Only relevant when \code{method="conditional"}.}

\item{nn}{An alternative way to specify the size of each cluster, by giving the exact number of observations in each cluster explicitly. Must sum to \code{N}.}

\item{loc.diff}{A parameter to control the closeness of the clusters in terms of the difference in their location vectors. Only relevant if \code{mu} is NOT supplied. Defaults to \code{2}.

More specifically, when \code{loc.diff} is invoked, means are simulated with the vector of cluster-specific hypermeans given by:

\code{scale(1:G, center=TRUE, scale=FALSE) * loc.diff}.}

\item{non.zero}{Controls the number of non-zero entries in each loadings column (per cluster) \strong{only} when \code{loadings} is not explicitly supplied. Values must be integers in the interval \code{[1,P]}. Defaults to \code{P}. The positions of the zeros are randomised, and non-zero entries are drawn from a standard normal.

Must be given as a list of length \code{G} of vectors of length corresponding to \code{Q} when \code{G>1}. Can be given either as such a list or simply a vector of length \code{Q} when \code{G=1}. Alternatively, a single integer can be supplied, common across all loadings columns across all clusters. In any case, \code{non.zero} will be affected by \code{forceQg=TRUE} by default (see below).}

\item{forceQg}{A logical indicating whether the upper limit on the number of cluster-specific factors \code{Q} is enforced. Defaults to \code{TRUE} for \code{sim_IMIFA_data}, but is always \code{FALSE} for \code{sim_IMIFA_model}. Note that when \code{forceQg=TRUE} is invoked, \code{non.zero} (see above) is also affected.}

\item{method}{A switch indicating whether the mixture to be simulated from is the conditional distribution of the data given the latent variables (default), or simply the marginal distribution of the data.}

\item{res}{An object of class \code{"Results_IMIFA"} generated by \code{\link{get_IMIFA_results}}.}
}
\value{
Invisibly returns a \code{data.frame} with \code{N} observations (rows) of \code{P} variables (columns). The true values of the parameters which generated these data are also stored as attributes.
}
\description{
Functions to simulate data of any size and dimension from a (infinite) mixture of (infinite) factor analysers parameterisation or fitted object.
}
\details{
\code{sim_IMIFA_model} is a simple wrapper to \code{sim_IMIFA_data} which uses the estimated parameters of a fitted IMIFA-related model, as generated by \code{\link{get_IMIFA_results}}. The necessary parameters must have been originally stored via \code{\link{storeControl}} in the creation of \code{res}.
}
\note{
\code{N}, \code{G}, \code{P} & \code{Q} will \strong{NOT} be inferred from the supplied parameters \code{pis}, \code{mu}, \code{psi}, \code{loadings}, \code{scores} & \code{nn} - rather, the parameters' length/dimensions must adhere to the supplied values of \code{N}, \code{G}, \code{P} & \code{Q}.

Missing values are not allowed in any of \code{pis}, \code{mu}, \code{psi}, \code{loadings}, \code{scores} & \code{nn}.
}
\examples{
# Simulate 100 observations from 3 balanced clusters with cluster-specific numbers of latent factors
# Specify isotropic uniquenesses within each cluster
# Supply cluster means directly
sim_data  <- sim_IMIFA_data(N=100, G=3, P=20, Q=c(2, 2, 5), psi=1:3,
                            mu=matrix(rnorm(60, -2 + 1:3, 1), nrow=20, ncol=3, byrow=TRUE))
names(attributes(sim_data))
labels    <- attr(sim_data, "Labels")

# Visualise the data in two-dimensions
plot(cmdscale(dist(sim_data), k=2), col=labels)

# Examine the overlap with a pairs plot of 5 randomly chosen variables
pairs(sim_data[,sample(1:20, 5)], col=labels)

\donttest{# Fit a MIFA model to this data
# tmp     <- mcmc_IMIFA(sim_data, method="MIFA", range.G=3, n.iters=5000)

# Simulate from this model
# res     <- get_IMIFA_results(tmp, zlabels=labels)
# sim_mod <- sim_IMIFA_model(res)}
}
\references{
Murphy, K., Viroli, C., and Gormley, I. C. (2020) Infinite mixtures of infinite factor analysers, \emph{Bayesian Analysis}, 15(3): 937-963. <\href{https://projecteuclid.org/euclid.ba/1570586978}{doi:10.1214/19-BA1179}>.
}
\seealso{
\code{\link{mcmc_IMIFA}} for fitting an IMIFA-related model to the simulated data set.

\code{\link{get_IMIFA_results}} for generating input for \code{sim_IMIFA_model}.

\code{\link{Ledermann}} for details on the upper bound for \code{Q}. Note that this function accounts for isotropic uniquenesses, if \code{psi} is supplied in that manner, in computing this bound.
}
\author{
Keefe Murphy - <\email{keefe.murphy@mu.ie}>
}
\keyword{utility}
