% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fit_poisson_nmf.R, R/init_poisson_nmf.R
\name{fit_poisson_nmf}
\alias{fit_poisson_nmf}
\alias{fit_poisson_nmf_control_default}
\alias{init_poisson_nmf}
\alias{init_poisson_nmf_from_clustering}
\title{Fit Non-negative Matrix Factorization to Count Data}
\usage{
fit_poisson_nmf(
  X,
  k,
  fit0,
  numiter = 100,
  update.factors = seq(1, ncol(X)),
  update.loadings = seq(1, nrow(X)),
  method = c("scd", "em", "mu", "ccd"),
  init.method = c("topicscore", "random"),
  control = list(),
  verbose = c("progressbar", "detailed", "none")
)

fit_poisson_nmf_control_default()

init_poisson_nmf(
  X,
  F,
  L,
  k,
  init.method = c("topicscore", "random"),
  beta = 0.5,
  betamax = 0.99,
  control = list(),
  verbose = c("detailed", "none")
)

init_poisson_nmf_from_clustering(X, clusters, ...)
}
\arguments{
\item{X}{The n x m matrix of counts; all entries of X should be
non-negative. It can be a sparse matrix (class \code{"dgCMatrix"})
or dense matrix (class \code{"matrix"}), with some exceptions (see
\sQuote{Details}).}

\item{k}{An integer 2 or greater giving the matrix rank. This
argument should only be specified if the initial fit (\code{fit0}
or \code{F, L}) is not provided.}

\item{fit0}{The initial model fit. It should be an object of class
\dQuote{poisson_nmf_fit}, such as an output from
\code{init_poisson_nmf}, or from a previous call to
\code{fit_poisson_nmf}.}

\item{numiter}{The maximum number of updates of the factors and
loadings to perform.}

\item{update.factors}{A numeric vector specifying which factors
(rows of \code{F}) to update. By default, all factors are
updated. Note that the rows that are not updated may still change
by rescaling. When \code{NULL}, all factors are fixed. This option
is only implemented for \code{method = "em"} and \code{method =
"scd"}. If another method is selected, the default setting of
\code{update.factors} must be used.}

\item{update.loadings}{A numeric vector specifying which loadings
(rows of \code{L}) to update. By default, all loadings are
updated. Note that the rows that are not updated may still change
by rescaling. When \code{NULL}, all loadings are fixed. This option
is only implemented for \code{method = "em"} and \code{method =
"scd"}. If another method is selected, the default setting of
\code{update.loadings} must be used.}

\item{method}{The method to use for updating the factors and
loadings. Four methods are implemented: multiplicative updates,
\code{method = "mu"}; expectation maximization (EM), \code{method =
"em"}; sequential co-ordinate descent (SCD), \code{method = "scd"};
and cyclic co-ordinate descent (CCD), \code{method = "ccd"}. See
\sQuote{Details} for a detailed description of these methods.}

\item{init.method}{The method used to initialize the factors and
loadings. When \code{init.method = "random"}, the factors and
loadings are initialized uniformly at random; when
\code{init.method = "topicscore"}, the factors are initialized
using the (very fast) Topic SCORE algorithm (Ke & Wang, 2017), and
the loadings are initialized by running a small number of SCD
updates. This input argument is ignored if initial estimates of the
factors and loadings are already provided via input \code{fit0}, or
inputs \code{F} and \code{L}.}

\item{control}{A list of parameters controlling the behaviour of
the optimization algorithm (and the Topic SCORE algorithm if it
is used to initialize the model parameters). See \sQuote{Details}.}

\item{verbose}{When \code{verbose = "detailed"}, information about
the algorithm's progress is printed to the console at each
iteration; when \code{verbose = "progressbar"}, a progress bar is
shown; and when \code{verbose = "none"}, no progress information is
printed. See the description of the \dQuote{progress} return value
for an explanation of \code{verbose = "detailed"} console
output. (Note that some columns of the \dQuote{progress} data frame
are not shown in the console output.)}

\item{F}{An optional argument giving the initial estimate of the
factors (also known as \dQuote{basis vectors}). It should be an m x
k matrix, where m is the number of columns in the counts matrix
\code{X}, and k > 1 is the rank of the matrix factorization
(equivalently, the number of \dQuote{topics}). All entries of
\code{F} should be non-negative. When \code{F} and \code{L} are not
provided, input argument \code{k} should be specified instead.}

\item{L}{An optional argument giving the initial estimate of the
loadings (also known as \dQuote{activations}). It should be an n x k
matrix, where n is the number of rows in the counts matrix
\code{X}, and k > 1 is the rank of the matrix factorization
(equivalently, the number of \dQuote{topics}). All entries of
\code{L} should be non-negative. When \code{F} and \code{L} are not
provided, input argument \code{k} should be specified instead.}

\item{beta}{Initial setting of the extrapolation parameter. This is
\eqn{\beta} in Algorithm 3 of Ang & Gillis (2019).}

\item{betamax}{Initial setting for the upper bound on the
extrapolation parameter. This is \eqn{\bar{\gamma}} in Algorithm 3
of Ang & Gillis (2019).}

\item{clusters}{A factor specifying a grouping, or clustering, of
the rows of \code{X}.}

\item{\dots}{Additional arguments passed to \code{init_poisson_nmf}.}
}
\value{
\code{init_poisson_nmf} and \code{fit_poisson_nmf} both
return an object capturing the optimization algorithm state (for
\code{init_poisson_nmf}, this is the initial state). It is a list
with the following elements:

\item{F}{A matrix containing the current best estimates of the
  factors.}

\item{L}{A matrix containing the current best estimates of the
  loadings.}

\item{Fn}{A matrix containing the non-extrapolated factor estimates.
  If extrapolation is not used, \code{Fn} and \code{F} will be the
  same.}

\item{Ln}{A matrix containing the non-extrapolated estimates of the
  loadings. If extrapolation is not used, \code{Ln} and \code{L} will
  be the same.}

\item{Fy}{A matrix containing the extrapolated factor estimates. If
  the extrapolation scheme is not used, \code{Fy} and \code{F} will
  be the same.}

\item{Ly}{A matrix containing the extrapolated estimates of the
  loadings. If extrapolation is not used, \code{Ly} and \code{L} will
  be the same.}

\item{loss}{Value of the objective (\dQuote{loss}) function
  computed at the current best estimates of the factors and
  loadings.}

\item{loss.fnly}{Value of the objective (\dQuote{loss}) function
  computed at the extrapolated solution for the loadings (\code{Ly})
  and the non-extrapolated solution for the factors (\code{Fn}). This
  is used internally to implement the extrapolated updates.}

\item{iter}{The number of the most recently completed iteration.}

\item{beta}{The extrapolation parameter, \eqn{\beta} in Algorithm 3
  of Ang & Gillis (2019).}

\item{betamax}{Upper bound on the extrapolation parameter. This is
  \eqn{\bar{\gamma}} in Algorithm 3 of Ang & Gillis (2019).}

\item{beta0}{The setting of the extrapolation parameter at the
  last iteration that improved the solution.}

\item{progress}{A data frame containing detailed information about
  the algorithm's progress. The data frame should have at most
  \code{numiter}
  rows. The columns of the data frame are: \dQuote{iter}, the
  iteration number; \dQuote{loglik}, the Poisson NMF log-likelihood
  at the current best factor and loading estimates;
  \dQuote{loglik.multinom}, the multinomial topic model
  log-likelihood at the current best factor and loading estimates;
  \dQuote{dev}, the deviance at the current best factor and loading
  estimates; \dQuote{res}, the maximum residual of the
  Karush-Kuhn-Tucker (KKT) first-order optimality conditions at the
  current best factor and loading estimates; \dQuote{delta.f}, the
  largest change in the factors matrix; \dQuote{delta.l}, the largest
  change in the loadings matrix; \dQuote{nonzeros.f}, the proportion
  of entries in the factors matrix that are nonzero;
  \dQuote{nonzeros.l}, the proportion of entries in the loadings
  matrix that are nonzero; \dQuote{extrapolate}, which is 1 if
  extrapolation is used, otherwise it is 0; \dQuote{beta}, the
  setting of the extrapolation parameter; \dQuote{betamax}, the
  setting of the extrapolation parameter upper bound; and
  \dQuote{timing}, the elapsed time in seconds (recorded using
  \code{\link{proc.time}}).}
}
\description{
Approximate the input matrix \code{X} by the
  non-negative matrix factorization \code{tcrossprod(L,F)}, in which
  the quality of the approximation is measured by a
  \dQuote{divergence} criterion; equivalently, optimize the
  likelihood under a Poisson model of the count data, \code{X}, in
  which the Poisson rates are given by \code{tcrossprod(L,F)}.
  Function \code{fit_poisson_nmf} runs a specified number of
  coordinate-wise updates to fit the L and F matrices.
}
\details{
In Poisson non-negative matrix factorization (Lee & Seung,
2001), counts \eqn{x_{ij}} in the \eqn{n \times m} matrix, \eqn{X},
are modeled by the Poisson distribution: \deqn{x_{ij} \sim
\mathrm{Poisson}(\lambda_{ij}).} Each Poisson rate,
\eqn{\lambda_{ij}}, is a linear combination of parameters
\eqn{f_{jk} \geq 0, l_{ik} \geq 0} to be fitted to the data:
\deqn{\lambda_{ij} = \sum_{k=1}^K l_{ik} f_{jk},} in which \eqn{K}
is a user-specified tuning parameter specifying the rank of the
matrix factorization. Function \code{fit_poisson_nmf} computes
maximum-likelihood estimates (MLEs) of the parameters. For
additional mathematical background, and an explanation of how
Poisson NMF is connected to topic modeling, see the vignette:
\code{vignette(topic = "relationship",package = "fastTopics")}.

Using this function requires some care; only minimal argument
checking is performed, and error messages may not be helpful.

The EM and multiplicative updates are simple and fast, but can be
slow to converge to a stationary point. When \code{control$numiter
= 1}, the EM updates are mathematically equivalent to the
multiplicative updates, and therefore share the
same convergence properties. However, the implementation of the EM
updates is quite different; in particular, the EM updates are more
suitable for sparse counts matrices. The implementation of the
multiplicative updates is adapted from the MATLAB code by Daichi
Kitamura \url{http://d-kitamura.net}.

Since the multiplicative updates are implemented using standard
matrix operations, the speed is heavily dependent on the
BLAS/LAPACK numerical libraries used. In particular, using
optimized implementations such as OpenBLAS or Intel MKL can result
in much improved performance of the multiplicative updates.

The cyclic co-ordinate descent (CCD) and sequential co-ordinate
descent (SCD) updates adopt the same optimization strategy, but
differ in the implementation details. In practice, we have found
that the CCD and SCD updates arrive at the same solution when
initialized \dQuote{sufficiently close} to a stationary point. The
CCD implementation is adapted from the C++ code developed by
Cho-Jui Hsieh and Inderjit Dhillon, which is available for download
at \url{https://www.cs.utexas.edu/~cjhsieh/nmf/}. The SCD
implementation is based on version 0.4-3 of the \sQuote{NNLM}
package.

An additional re-scaling step is performed after each update to
promote numerical stability.

We use three measures of progress for the model fitting: (1)
improvement in the log-likelihood (or deviance), (2) change in the
model parameters, and (3) the residuals of the Karush-Kuhn-Tucker
(KKT) first-order conditions. As the iterates approach a stationary
point of the loss function, the change in the model parameters
should be small, and the residuals of the KKT system should vanish.
Use \code{\link{plot_progress}} to plot the improvement in the
solution over time.

See \code{\link{fit_topic_model}} for additional guidance on model
fitting, particularly for large or complex data sets.

The \code{control} argument is a list in which any of the
following named components will override the default optimization
algorithm settings (as they are defined by
\code{fit_poisson_nmf_control_default}):

\describe{

\item{\code{numiter}}{Number of \dQuote{inner loop} iterations to
  run when performing an update of the factors or loadings. This
  must be set to 1 for \code{method = "mu"} and \code{method =
  "ccd"}.}

\item{\code{nc}}{Number of RcppParallel threads to use for the
  updates. When \code{nc} is \code{NA}, the number of threads is
  determined by calling
  \code{\link[RcppParallel]{defaultNumThreads}}. This setting is
  ignored for the multiplicative updates (\code{method = "mu"}).}

\item{\code{nc.blas}}{Number of threads used in the numerical
  linear algebra library (e.g., OpenBLAS), if available. For best
  performance, we recommend setting this to 1 (i.e., no
  multithreading).}

\item{\code{min.delta.loglik}}{Stop performing updates if the
  difference in the Poisson NMF log-likelihood between two successive
  updates is less than \code{min.delta.loglik}. This should not be
  kept at zero when \code{control$extrapolate = TRUE} because the
  extrapolated updates are expected to occasionally keep the
  likelihood unchanged. Ignored if \code{min.delta.loglik < 0}.}

\item{\code{min.res}}{Stop performing updates if the maximum KKT
  residual is less than \code{min.res}. Ignored if \code{min.res < 0}.}

\item{\code{minval}}{A small, positive constant used to safeguard
  the multiplicative updates. The safeguarded updates are implemented
  as \code{F <- pmax(F1,minval)} and \code{L <- pmax(L1,minval)},
  where \code{F1} and \code{L1} are the factors and loadings matrices
  obtained by applying an update. This is motivated by Theorem 1 of
  Gillis & Glineur (2012). Setting \code{minval = 0} is allowed, but
  some methods are not guaranteed to converge to a stationary point
  without this safeguard, and a warning will be given in this case.}

\item{\code{extrapolate}}{When \code{extrapolate = TRUE}, the
  extrapolation scheme of Ang & Gillis (2019) is used.}

\item{\code{extrapolate.reset}}{To promote better numerical
  stability of the extrapolated updates, they are \dQuote{reset}
  every so often. This parameter determines the number of iterations
  to wait before resetting.}

\item{\code{beta.increase}}{When the extrapolated update improves
  the solution, scale the extrapolation parameter by this amount.}

\item{\code{beta.reduce}}{When the extrapolated update does not
  improve the solution, scale the extrapolation parameter by this
  amount.}

\item{\code{betamax.increase}}{When the extrapolated update
  improves the solution, scale the upper bound on the extrapolation
  parameter by this amount.}

\item{\code{eps}}{A small, non-negative number that is added to the
  terms inside the logarithms to sidestep computing logarithms of
  zero. This prevents numerical problems at the cost of introducing a
  small inaccuracy in the solution. Increasing this number may lead
  to faster convergence but possibly a less accurate solution.}

\item{\code{zero.threshold}}{A small, non-negative number used to
  determine which entries of the solution are exactly zero. Any
  entries that are less than or equal to \code{zero.threshold} are
  considered to be exactly zero.}}

An additional setting, \code{control$init.numiter}, controls the
number of sequential co-ordinate descent (SCD) updates that are
performed to initialize the loadings matrix when \code{init.method
= "topicscore"}.
}
\examples{
# Simulate a (sparse) 80 x 100 counts matrix.
library(Matrix)
set.seed(1)
X <- simulate_count_data(80,100,k = 3,sparse = TRUE)$X

# Remove columns (words) that do not appear in any row (document).
X <- X[,colSums(X > 0) > 0]

# Run 10 EM updates to find a good initialization.
fit0 <- fit_poisson_nmf(X,k = 3,numiter = 10,method = "em")

# Fit the Poisson NMF model by running 50 EM updates.
fit_em <- fit_poisson_nmf(X,fit0 = fit0,numiter = 50,method = "em")

# Fit the Poisson NMF model by running 50 extrapolated SCD updates.
fit_scd <- fit_poisson_nmf(X,fit0 = fit0,numiter = 50,method = "scd",
                           control = list(extrapolate = TRUE))

# Compare the two fits.
fits <- list(em = fit_em,scd = fit_scd)
compare_fits(fits)
plot_progress(fits,y = "loglik")
plot_progress(fits,y = "res")

# Recover the topic model. After this step, the L matrix contains the
# mixture proportions ("loadings"), and the F matrix contains the
# word frequencies ("factors").
fit_multinom <- poisson2multinom(fit_scd)

}
\references{
Ang, A. and Gillis, N. (2019). Accelerating nonnegative matrix
  factorization algorithms using extrapolation. \emph{Neural
  Computation} \bold{31}, 417–439.

  Cichocki, A., Cruces, S. and Amari, S. (2011). Generalized
  alpha-beta divergences and their application to robust nonnegative
  matrix factorization. \emph{Entropy} \bold{13}, 134–170.

  Gillis, N. and Glineur, F. (2012). Accelerated multiplicative
  updates and hierarchical ALS algorithms for nonnegative matrix
  factorization. \emph{Neural Computation} \bold{24}, 1085–1105.

  Hsieh, C.-J. and Dhillon, I. (2011). Fast coordinate descent
  methods with variable selection for non-negative matrix
  factorization. In \emph{Proceedings of the 17th ACM SIGKDD
  international conference on Knowledge discovery and data mining},
  p. 1064–1072.

  Lee, D. D. and Seung, H. S. (2001). Algorithms for non-negative
  matrix factorization. In \emph{Advances in Neural Information
  Processing Systems} \bold{13}, 556–562.

  Lin, X. and Boutros, P. C. (2020). Optimization and expansion of
  non-negative matrix factorization. \emph{BMC Bioinformatics}
  \bold{21}, 7.

  Ke, Z. and Wang, M. (2017). A new SVD approach to optimal topic
  estimation. \emph{arXiv} \url{https://arxiv.org/abs/1704.07016}.
}
\seealso{
\code{\link{fit_topic_model}}, \code{\link{plot_progress}}
}
