% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/JANE.R
\name{JANE}
\alias{JANE}
\title{Fit JANE}
\usage{
JANE(
  A,
  D = 2,
  K = 2,
  family = "bernoulli",
  noise_weights = FALSE,
  guess_noise_weights = NULL,
  model,
  initialization = "GNN",
  case_control = FALSE,
  DA_type = "none",
  seed = NULL,
  control = list()
)
}
\arguments{
\item{A}{A square matrix or sparse matrix of class 'dgCMatrix' representing the adjacency matrix of the network of interest.}

\item{D}{Integer (scalar or vector) specifying the dimension of the latent space (default is 2).}

\item{K}{Integer (scalar or vector) specifying the number of clusters to consider (default is 2).}

\item{family}{A character string specifying the distribution of the edge weights.
\itemize{
\item{'bernoulli': Expects an \strong{unweighted} network; utilizes a Bernoulli distribution with a logit link (default)}
\item{'lognormal': Expects a \strong{weighted} network with positive, non-zero, continuous edge weights; utilizes a log-normal distribution with an identity link}
\item{'poisson': Expects a \strong{weighted} network with edge weights representing non-zero counts; utilizes a zero-truncated Poisson distribution with a log link}
}}

\item{noise_weights}{A logical; if \code{TRUE} then a Hurdle model is used to account for noise weights, if \code{FALSE} simply utilizes the supplied network (converted to an unweighted binary network if a weighted network is supplied, i.e., (A > 0.0)*1.0) and fits a latent space cluster model (default is \code{FALSE}).}

\item{guess_noise_weights}{Only applicable if \code{noise_weights = TRUE}. A numeric value specifying the best guess for the mean of the noise weight distribution for \code{family \%in\% c('lognormal', 'poisson')} (mean is on the log-scale for lognormal) \strong{OR} proportion (i.e. in (0,1)) of all edges that are noise edges for \code{family = 'bernoulli'}. If \code{NULL} (i.e., default) and \code{noise_weights = TRUE} then the 1st percentile of the non-zero weights will be used for \code{family \%in\% c('lognormal', 'poisson')} and 1\% will be used for \code{family = 'bernoulli'}.}

\item{model}{A character string specifying the model to fit:
\itemize{
\item{'NDH': \strong{undirected} network with no degree heterogeneity (or connection strength heterogeneity if working with weighted network)}
\item{'RS': \strong{undirected} network with degree heterogeneity (and connection strength heterogeneity if working with weighted network)}
\item{'RSR': \strong{directed} network with degree heterogeneity (and connection strength heterogeneity if working with weighted network)}
}}

\item{initialization}{A character string or a list to specify the initial values for the EM algorithm:
\itemize{
\item{'GNN': uses a type of graphical neural network approach to generate initial values (default)}
\item{'random': uses random initial values}
\item{A user-supplied list of S3 \code{\link{class}} "\code{JANE.initial_values}" representing initial values for the EM algorithm. See \code{\link[JANE]{specify_initial_values}} for details on how to specify initial values}
}}

\item{case_control}{A logical; if \code{TRUE} then uses a case-control approximation approach (default is \code{FALSE}).}

\item{DA_type}{(Experimental) A character string to specify the type of deterministic annealing approach to use
\itemize{
\item{'none': does not employ a deterministic annealing approach (default)}
\item{'cooling': (Experimental) employs a traditional deterministic annealing approach where temperature decreases}
\item{'heating': (Experimental) employs a deterministic anti-annealing approach where temperature increases}
\item{'hybrid': (Experimental) employs a combination of the 'cooling' and 'heating' approach}
}}

\item{seed}{(optional) An integer value to specify the seed for reproducibility.}

\item{control}{A list of control parameters. See 'Details'.}
}
\value{
A list of S3 \code{\link{class}} "\code{JANE}" containing the following components:
\item{\code{input_params}}{ A list containing the input parameters for \code{IC_selection}, \code{case_control}, \code{DA_type}, \code{model}, \code{family}, and \code{noise_weights} used in the function call.}
\item{\code{A}}{ The square sparse adjacency matrix of class 'dgCMatrix' used in fitting the latent space cluster model. This matrix can be different than the input A matrix as isolates are removed.}
\item{\code{IC_out}}{ A matrix containing the relevant information criteria for all combinations of \code{K}, \code{D}, and \code{n_start} considered. The 'selected' column indicates the chosen optimal fit.}
\item{\code{all_convergence_ind}}{ A matrix containing the convergence information (i.e., 1 = converged, 0 = did not converge) and number of iterations for all combinations of \code{K}, \code{D}, \code{n_start}, and \code{beta_temperature} considered.}
\item{\code{optimal_res}}{ A list containing the estimated parameters of interest based on the optimal fit selected. It is recommended to use \code{summary()} to extract the parameters of interest. See \code{\link[JANE]{summary.JANE}} for more details.}
\item{\code{optimal_starting}}{ A list of S3 \code{\link{class}} "\code{JANE.initial_values}" containing the starting parameters used in the EM algorithm that resulted in the optimal fit selected. It is recommended to use \code{summary()} to extract the parameters of interest. See \code{\link[JANE]{summary.JANE}} for more details.}
}
\description{
Fit a latent space cluster model, with or without noise edges, using an EM algorithm.
}
\details{
Isolates are removed from the adjacency matrix A. If an unsymmetric adjacency matrix A is supplied for \code{model \%in\% c('NDH', 'RS')} the user will be asked if they would like to proceed with converting A to a symmetric matrix (i.e., \code{A <- 1.0 * ( (A + t(A)) > 0.0 )}); only able to do so if \code{family = 'bernoulli'}. Additionally, if a weighted network is supplied and \code{noise_weights = FALSE}, then the network will be converted to an unweighted binary network (i.e., (A > 0.0)*1.0) and a latent space cluster model is fit.

\strong{\code{control}:}

The \code{control} argument is a named list that the user can supply containing the following components:
\describe{
\item{\code{verbose}}{A logical; if \code{TRUE} causes additional information to be printed out about the progress of the EM algorithm (default is \code{FALSE}).}
\item{\code{max_its}}{An integer specifying the maximum number of iterations for the EM algorithm (default is \code{1e3}).}
\item{\code{min_its}}{An integer specifying the minimum number of iterations for the EM algorithm (default is \code{10}).}
\item{\code{priors}}{A list of S3 \code{\link{class}} "\code{JANE.priors}" representing prior hyperparameter specifications (default is \code{NULL}). See \code{\link[JANE]{specify_priors}} for details on how to specify the hyperparameters.}
\item{\code{n_interior_knots}}{(only relevant for \code{model \%in\% c('RS', 'RSR')}) An integer specifying the number of interior knots used in fitting a natural cubic spline for degree heterogeneity (and connection strength heterogeneity if working with weighted network) models (default is \code{5}).}
\item{\code{termination_rule}}{A character string to specify the termination rule to determine the convergence of the EM algorithm: \itemize{
\item{\code{'prob_mat'}: uses change in the absolute difference in \eqn{\hat{Z}^{U}} (i.e., the \eqn{N \times K} cluster membership probability matrix) between subsequent iterations (default)}
\item{\code{'Q'}: uses change in the absolute difference in the objective function of the E-step evaluated using parameters from subsequent iterations}
\item{\code{'ARI'}: comparing the classifications between subsequent iterations using adjusted Rand index}
\item{\code{'NMI'}: comparing the classifications between subsequent iterations using normalized mutual information}
\item{\code{'CER'}: comparing the classifications between subsequent iterations using classification error rate}
}}
\item{\code{tolerance}}{A numeric specifying the tolerance used for \code{termination_rule \%in\% c('Q', 'prob_mat')} (default is \code{1e-3}).}
\item{\code{tolerance_ARI}}{A numeric specifying the tolerance used for \code{termination_rule = 'ARI'} (default is \code{0.999}).}
\item{\code{tolerance_NMI}}{A numeric specifying the tolerance used for \code{termination_rule = 'NMI'} (default is \code{0.999}).}
\item{\code{tolerance_CER}}{A numeric specifying the tolerance used for \code{termination_rule = 'CER'} (default is \code{0.01}).}
\item{\code{n_its_start_CA}}{An integer specifying what iteration to start computing the change in cumulative averages (note: the change in the cumulative average of \eqn{\hat{U}}, the latent position matrix, is not tracked when \code{termination_rule = 'Q'}) (default is \code{20}).}
\item{\code{tolerance_diff_CA}}{A numeric specifying the tolerance used for the change in cumulative average of \code{termination_rule} metric and \eqn{\hat{U}} (note: the change in the cumulative average of \eqn{\hat{U}} is not tracked when \code{termination_rule = 'Q'}) (default is \code{1e-3}).}
\item{\code{consecutive_diff_CA}}{An integer specifying the tolerance for the number of consecutive instances where the change in cumulative average is less than \code{tolerance_diff_CA} (default is \code{5}).}
\item{\code{quantile_diff}}{A numeric in \code{[0,1]} specifying the quantile used in computing the change in the absolute difference of \eqn{\hat{Z}^{U}} and \eqn{\hat{U}} between subsequent iterations (default is \code{1}, i.e., max).}
\item{\code{beta_temp_schedule}}{(Experimental) A numeric vector specifying the temperature schedule for deterministic annealing (default is \code{1}, i.e., deterministic annealing not utilized).}
\item{\code{n_control}}{An integer specifying the fixed number of controls (i.e., non-links) sampled for each actor; only relevant when \code{case_control = TRUE} (default is \code{100} when \code{case_control = TRUE} and \code{NULL} when \code{case_control = FALSE}).}
\item{\code{n_start}}{An integer specifying the maximum number of starts for the EM algorithm (default is \code{5}).}
\item{\code{max_retry}}{An integer specifying the maximum number of re-attempts if starting values cause issues with EM algorithm (default is \code{5}).}
\item{\code{IC_selection}}{A character string to specify the information criteria used to select the optimal fit based on the combinations of \code{K}, \code{D}, and \code{n_start} considered: \itemize{
\item{\code{'BIC_model'}: BIC computed from logistic regression or Hurdle model component}
\item{\code{'BIC_mbc'}: BIC computed from model based clustering component}
\item{\code{'ICL_mbc'}: ICL computed from model based clustering component}
\item{\code{'Total_BIC'}: sum of \code{'BIC_model'} and \code{'BIC_mbc'}}
\item{\code{'Total_ICL'}: sum of \code{'BIC_model'} and \code{'ICL_mbc'} (default)}
}}
\item{\code{sd_random_U_GNN}}{(only relevant when \code{initialization = 'GNN'}) A positive numeric value specifying the standard deviation for the random draws from a normal distribution to initialize \eqn{U} (default is \code{1}).}
\item{\code{max_retry_GNN}}{(only relevant when \code{initialization = 'GNN'}) An integer specifying the maximum number of re-attempts for the \code{GNN} approach before switching to random starting values (default is \code{10}).}
\item{\code{n_its_GNN}}{(only relevant when \code{initialization = 'GNN'}) An integer specifying the maximum number of iterations for the \code{GNN} approach (default is \code{10}).}
\item{\code{downsampling_GNN}}{(only relevant when \code{initialization = 'GNN'}) A logical; if \code{TRUE} employs downsampling s.t. the number of links and non-links are balanced for the \code{GNN} approach (default is \code{TRUE}).}
}

\strong{Running \code{JANE} in parallel:}

\code{JANE} integrates the \pkg{future} and \pkg{future.apply} packages to fit the various combinations of \code{K}, \code{D}, and \code{n_start} in parallel. The 'Examples' section below provides an example of how to run \code{JANE} in parallel. See \code{\link[future]{plan}} and \code{\link[future.apply]{future.apply}} for more details.

\strong{Choosing the number of clusters:}

\code{JANE} allows for the following model selection criteria to choose the number of clusters (smaller values are favored):
\itemize{
\item{\code{'BIC_model'}: BIC computed from logistic regression or Hurdle model component}
\item{\code{'BIC_mbc'}: BIC computed from model based clustering component}
\item{\code{'ICL_mbc'}: ICL (Biernacki et al. (2000)) computed from model based clustering component}
\item{\code{'Total_BIC'}: Sum of \code{'BIC_model'} and \code{'BIC_mbc'}, this is the model selection criterion proposed by Handcock et al. (2007)}
\item{\code{'Total_ICL'}: (default) sum of \code{'BIC_model'} and \code{'ICL_mbc'}, this criterion is similar to \code{'Total_BIC'}, but uses ICL for the model based clustering component, which tends to favor more well-separated clusters.}
}

Based on simulation studies, Biernacki et al. (2000) recommends that when the interest in mixture modeling is cluster analysis, instead of density estimation, the \eqn{ICL_{mbc}} criterion should be favored over the \eqn{BIC_{mbc}} criterion, as the \eqn{BIC_{mbc}} criterion tends to overestimate the number of clusters. The \eqn{BIC_{mbc}} criterion is designed to choose the number of components in a mixture model rather than the number of clusters.

\emph{Warning}: It is not certain whether it is appropriate to use the model selection criterion above to select \code{D}.
}
\examples{
\donttest{
# Simulate network
mus <- matrix(c(-1,-1,1,-1,1,1), 
              nrow = 3,
              ncol = 2, 
              byrow = TRUE)
omegas <- array(c(diag(rep(7,2)),
                  diag(rep(7,2)), 
                  diag(rep(7,2))), 
                  dim = c(2,2,3))
p <- rep(1/3, 3)
beta0 <- 1.0
sim_data <- JANE::sim_A(N = 100L, 
                        model = "NDH",
                        mus = mus, 
                        omegas = omegas, 
                        p = p, 
                        params_LR = list(beta0 = beta0),
                        remove_isolates = TRUE)
                        
# Run JANE on simulated data
res <- JANE::JANE(A = sim_data$A,
                  D = 2L,
                  K = 3L,
                  initialization = "GNN", 
                  model = "NDH",
                  case_control = FALSE,
                  DA_type = "none")

# Run JANE on simulated data - consider multiple D and K
res <- JANE::JANE(A = sim_data$A,
                  D = 2:5,
                  K = 2:10,
                  initialization = "GNN", 
                  model = "NDH",
                  case_control = FALSE,
                  DA_type = "none")
                  
# Run JANE on simulated data - parallel with 5 cores (NOT RUN)
# future::plan(future::multisession, workers = 5)
# res <- JANE::JANE(A = sim_data$A,
#                   D = 2L,
#                   K = 3L,
#                   initialization = "GNN", 
#                   model = "NDH",
#                   case_control = FALSE,
#                   DA_type = "none")
# future::plan(future::sequential)

# Run JANE on simulated data - case/control approach with 20 controls sampled for each actor
res <- JANE::JANE(A = sim_data$A,
                  D = 2L,
                  K = 3L,
                  initialization = "GNN", 
                  model = "NDH",
                  case_control = TRUE,
                  DA_type = "none",
                  control = list(n_control = 20))
                   
# Reproducibility
res1 <- JANE::JANE(A = sim_data$A,
                   D = 2L,
                   K = 3L,
                   initialization = "GNN", 
                   seed = 1234,
                   model = "NDH",
                   case_control = FALSE,
                   DA_type = "none")

res2 <- JANE::JANE(A = sim_data$A,
                   D = 2L,
                   K = 3L,
                   initialization = "GNN", 
                   seed = 1234,
                   model = "NDH",
                   case_control = FALSE,
                   DA_type = "none")  

## Check if results match
all.equal(res1, res2)    

# Another reproducibility example where the seed was not set. 
# It is possible to replicate the results using the starting values due to 
# the nature of EM algorithms
res3 <- JANE::JANE(A = sim_data$A,
                   D = 2L,
                   K = 3L,
                   initialization = "GNN", 
                   model = "NDH",
                   case_control = FALSE,
                   DA_type = "none")
## Extract starting values                    
start_vals <- res3$optimal_start

## Run JANE using extracted starting values, no need to specify D and K 
## below as function will determine those values from start_vals
res4 <- JANE::JANE(A = sim_data$A,
                   initialization = start_vals, 
                   model = "NDH",
                   case_control = FALSE,
                   DA_type = "none")
                   
## Check if optimal_res are identical
all.equal(res3$optimal_res, res4$optimal_res)                   
}                            
}
\references{
Biernacki, C., Celeux, G., Govaert, G., 2000. Assessing a mixture model for clustering with the integrated completed likelihood. IEEE Transactions on Pattern Analysis and Machine Intelligence 22, 719–725.

Handcock, M.S., Raftery, A.E., Tantrum, J.M., 2007. Model-based clustering for social networks. Journal of the Royal Statistical Society Series A: Statistics in Society 170, 301–354.
}
