\name{NMF-utils}
\Rdversion{1.1}
\docType{methods}
\alias{NMF-utils}

\alias{clusters}
\alias{clusters-methods}
\alias{clusters,NMF-method}
\alias{clusters,matrix-method}

\alias{cophcor}
\alias{cophcor-methods}
\alias{cophcor,matrix-method}

\alias{connectivity}
\alias{connectivity-methods}
\alias{connectivity,NMF-method}

\alias{dispersion}
\alias{dispersion-methods}
\alias{dispersion,matrix-method}

\alias{entropy}
\alias{entropy-methods}
\alias{entropy,factor,factor-method}
\alias{entropy,NMF,factor-method}
\alias{entropy,NMF,factor-method}
\alias{entropy,table,missing-method}

\alias{extractFeatures}
\alias{extractFeatures,NMF-method}
\alias{featureScore}
\alias{featureScore,NMF-method}
\alias{featureScore,matrix-method}

\alias{metaHeatmap}
\alias{metaHeatmap-methods}
\alias{metaHeatmap,matrix-method}
\alias{metaHeatmap,NMF-method}

\alias{purity}
\alias{purity-methods}
\alias{purity,factor,factor-method}
\alias{purity,NMF,factor-method}
\alias{purity,table,missing-method}

\alias{residuals}
\alias{residuals,NMFfit-method}
\alias{rss}
\alias{rss,NMF-method}

\alias{sparseness}
\alias{sparseness-methods}
\alias{sparseness,matrix-method}
\alias{sparseness,NMF-method}
\alias{sparseness,numeric-method}

\alias{syntheticNMF}

\title{ Class and Utility Methods for NMF objects}

\description{
Define generic interface methods for class \code{\linkS4class{NMF}}, which is 
the base -- virtual -- class of the results from any NMF algorithm implemented 
within package NMF's framework.

}

\usage{

\S4method{clusters}{NMF}(x, what = c('samples', 'features'), ...)

\S4method{connectivity}{NMF}(x, ...)

cophcor(object, ...)

dispersion(object, ...)

\S4method{entropy}{NMF,factor}(x, class, ...)

\S4method{residuals}{NMFfit}(object, track=FALSE)

rss(object, ...)
\S4method{rss}{NMF}(object, target)

\S4method{featureScore}{NMF}(x, ...)

\S4method{extractFeatures}{NMF}(x, ...)

\S4method{metaHeatmap}{NMF}(object, what=c('samples', 'features'), filter=FALSE, ...)

\S4method{purity}{NMF,factor}(x, class, ...)

\S4method{sparseness}{NMF}(x, what = c('features', 'samples'), ...)

syntheticNMF(n, r, p, offset=NULL, noise=FALSE, return.factors=FALSE)

}

\arguments{

	\item{class}{ A \code{factor} giving a known class membership for each sample.}
	
	\item{filter}{ if \code{TRUE}, only the features that are basis-specific are used. 
	Those features are those returned by function \code{extractFeatures}. }		
	
	\item{n}{ Number of rows of the synthetic target matrix. }	
	
	\item{noise}{ if \code{TRUE}, random noise is added to the target matrix. }
	
	\item{object}{ A \code{matrix} or an object that inherits from class 
	\code{\linkS4class{NMF}} or \code{\linkS4class{NMFfit}} -- depending on the method. }
	
	\item{offset}{ a vector giving the offset to add to the synthetic target matrix. 
	Its length should be equal to the number of rows \code{n}.}
	
	\item{p}{ Number of columns of the synthetic target matrix. Not used if parameter 
	\code{r} is a vector (see description of argument \code{r}).}
	
%	\item{palette}{ A character ctring that defines a color palette used 
%	internally with function \code{\link[RColorBrewer]{brewer.pal}}. }
	
	\item{r}{ Underlying factorization rank. If a single \code{numeric} is given, 
	the classes are randomly generated from a multinomial distribution. 
	If a numerical vector is given, then it should contain the counts in the different 
	classes (i.e. integers). In such a case argument \code{p} is not used and the number of columns 
	is forced to be the sum of the counts.}
	
	\item{return.factors}{ If \code{TRUE}, the underlying matrices \code{W} and 
	\code{H} are also returned.}
	
%	\item{scale.to.one}{ if \code{TRUE}, the columns are scaled to sum up to 1. }
	
	\item{target}{ the target object estimated by model \code{object}. It can be 
	a \code{matrix} or an \code{ExpressionSet}.
	}
	
	\item{track}{ if \code{TRUE}, the whole residuals track is returned. 
	Otherwise only the last residuals computed is returned.}
	
	\item{what}{ Specifies on which matrix (basis components or mixture coefficients) 
	the computation should be performed.}
	
	\item{x}{ An object that inherits from class \code{\linkS4class{NMF}}. }	
	
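	\item{...}{ Extra arguments. For \code{metaHeatmap}, these are graphical parameters 
	passed to function \code{\link[gplots]{heatmap.2}}; for \code{cophcor} and 
	\code{dispersion} they are not used. }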
}


\details{

\describe{
	
	\item{clusters}{ Computes the dominant basis component for each sample (resp. feature) 
	based on the corresponding coefficient in the mixture coefficient matrix (i.e. in \eqn{H}) 
	(resp. basis component matrix (i.e. in \eqn{W})). }
	
	\item{connectivity}{
	Computes the connectivity matrix for the samples based on their mixture coefficients.
	
 	The connectivity matrix of a clustering is a matrix \eqn{C} containing 
 	only 0 or 1 entries such that:
 	\deqn{C_{ij} = \left\{\begin{array}{l}1\mbox{ if sample }i\mbox{ belongs to the same cluster as sample }j\\0\mbox{ otherwise}\end{array}\right..}{%
 	C_{ij} = 1 if sample i belongs to the same cluster as sample j, 0 otherwise}
	
	}
	
	\item{cophcor}{
	Computes the cophenetic correlation coefficient of consensus matrix \code{object}, 
	generally obtained from multiple NMF runs.
	
	The cophenetic correlation coefficient is based on the consensus matrix 
	(i.e. the average of connectivity matrices) and was proposed by 
	\emph{Brunet et al. (2004)} to measure the stability of the clusters obtained 
	from NMF.
	
	It is defined as the Pearson correlation between the samples' distances 
	induced by the consensus matrix (seen as a similarity matrix) and their 
	cophenetic distances from a hierarchical clustering based on these very 
	distances (by default an average linkage is used).
	See \emph{Brunet et al. (2004)}.
	
	Note that argument \code{...} is not used.
	}
	
	\item{dispersion}{
	Computes the dispersion coefficient of consensus matrix \code{object}, 
	generally obtained from multiple NMF runs.
	
	The dispersion coefficient is based on the consensus matrix (i.e. the average 
	of connectivity matrices) and was proposed by \emph{Kim and Park (2007)} to 
	measure the reproducibility of the clusters obtained from NMF.
	It is defined as:
	\deqn{\rho = \frac{1}{n^2} \sum_{i,j=1}^n 4 (C_{ij} - \frac{1}{2})^2 ,}
	where \eqn{n} is the total number of samples.

 	We have \eqn{0 \leq \rho \leq 1} and \eqn{\rho = 1} only for a perfect 
 	consensus matrix, where all entries are 0 or 1. 
 	A perfect consensus matrix is obtained only when all the connectivity 
 	matrices are the same, meaning that the algorithm gave the same 
 	clusters at each run.
 	See \emph{Kim and Park (2007)}.
 	
 	Note that argument \code{...} is not used.
	}
	
	\item{entropy}{
	The entropy is a measure of performance of a clustering method, in 
	recovering the classes defined by a factor known a priori (i.e. one knows the 
	true class labels).
	Suppose we are given \eqn{l} categories, while the clustering method 
	generates \eqn{k} clusters. Entropy is given by:
 	\deqn{Entropy = - \frac{1}{n \log_2 l} \sum_{q=1}^k \sum_{j=1}^l n_q^j \log_2 \frac{n_q^j}{n_q}}
	, where:
	
 - \eqn{n} is the total number of samples;
 
 - \eqn{n_q} is the total number of samples in cluster \eqn{q};
 
 - \eqn{n_q^j} is the number of samples in cluster \eqn{q} that belong to 
 original class \eqn{j} (\eqn{1 \leq j \leq l}).

	 The smaller the entropy, the better the clustering performance.
	 
	 See \emph{Kim and Park (2007)}.
	}

	\item{extractFeatures}{
	Identifies the most basis-specific features as suggested in \emph{Kim and Park (2007)}.

	The features are first scored using the function \code{featureScore}. 
	Then only the features that fulfil both of the following criteria are retained:
 	
 	- score greater than \eqn{\hat{\mu} + 3 \hat{\sigma}}, where \eqn{\hat{\mu}} 
 	and \eqn{\hat{\sigma}} are the median and the median absolute deviation (MAD) 
 	of the scores respectively;
 	
 	- the maximum contribution to a basis component is greater than the median 
 	of all contributions (i.e. of all elements of W)

	See \emph{Kim and Park (2007)}.
	}
	
	\item{featureScore}{
	Computes the feature scores as suggested in \emph{Kim and Park (2007)}.

	The score for feature \eqn{i} is defined as:
 	\deqn{S_i = 1 + \frac{1}{\log_2 k} \sum_{q=1}^k p(i,q) \log_2 p(i,q),}
 	where \eqn{p(i,q)} is the probability that the \eqn{i}-th feature 
 	contributes to basis \eqn{q}:
 	\deqn{p(i,q) = \frac{W(i,q)}{\sum_{r=1}^k W(i,r)} }

 	The feature scores are real values within the range [0,1]. 
 	The higher the feature score the more basis-specific the corresponding feature.

	}

	\item{metaHeatmap}{ Produces a heatmap of the basis components or mixture 
	coefficients using a \code{heatmap}-like custom function, 
	with parameters tuned for displaying NMF results.
	
	The function used to draw the heatmap is a mixture of the function \code{heatmap.2} 
	from the \code{gplots} package, and the function \code{heatmap.plus}
	 from the \code{heatmap.plus} package. It allows extra annotation rows to be added 
	 using the \code{ColSideColor} argument.
	 See \code{\link[gplots]{heatmap.2}} and \code{\link[heatmap.plus]{heatmap.plus}}.
	}

	\item{purity}{
	Computes the purity of a clustering given a known factor.

	The purity is a measure of performance of a clustering method, in 
	recovering the classes defined by a factor a priori known (i.e. one knows 
	the true class labels).
 	Suppose we are given \eqn{l} categories, while the clustering method 
 	generates \eqn{k} clusters. Purity is given by:
 	\deqn{Purity = \frac{1}{n} \sum_{q=1}^k \max_{1 \leq j \leq l} n_q^j} 	
	, where:
	
 	- \eqn{n} is the total number of samples;
 	
 	- \eqn{n_q^j} is the number of samples in cluster \eqn{q} that belong to 
 	original class \eqn{j} (\eqn{1 \leq j \leq l}).

	The purity is therefore a real number in \eqn{[0,1]}. 
	The larger the purity, the better the clustering performance.
	
	See \emph{Kim and Park (2007)}.
	}
	
	\item{residuals}{ returns the -- final -- residuals between the target matrix and the 
	NMF result \code{object}. They are computed using the objective function 
	associated to the NMF algorithm that returned \code{object}.
	When called with \code{track=TRUE}, the whole residuals track is returned, 
	if available. Note that method \code{nmf} does not compute the residuals track, 
	unless explicitly required.
	
	It is an S4 method defined for the associated generic function from package 
	\code{stats} (see \code{\link[stats]{residuals}}).
	
	See \code{\link{nmf}} and \code{\linkS4class{NMFfit}}.
	}
	
	
	\item{rss}{
	returns the Residual Sum of Squares (RSS) between the target object \code{target}
	 and its estimation by the \code{object}. \emph{Hutchins et al. (2008)} used 
	 the variation of the RSS in combination with \emph{Lee and Seung}'s algorithm
    to estimate the correct number of basis vectors. The optimal rank is chosen 
    where the graph of the RSS first shows an inflexion 
    point. See references.
    
    Note that this way of estimation may not be suitable for all models. Indeed, 
    if the NMF optimization problem is not based on the Frobenius norm, the RSS is 
    not directly linked to the quality of approximation of the NMF model.
    
	}
	
	\item{sparseness}{
	Computes the sparseness of a vector or matrix, as defined in \emph{Hoyer (2004)}.

	 This sparseness measure quantifies how much energy of a vector is packed into only few components.
	 It is defined by:
	 \deqn{Sparseness(x) = \frac{\sqrt{n} - \frac{\sum |x_i|}{\sqrt{\sum x_i^2}}}{\sqrt{n}-1}}
	, where \eqn{n} is the length of \code{x}.
	
	 The sparseness is a real number in \eqn{[0,1]}. It is equal to 1 if and only if \code{x} contains 
	 a single nonzero component, and is equal to 0 if and only if all components of \code{x} are equal.
	 It interpolates smoothly between these two extreme values. 
	 The closer the sparseness is to 1, the sparser the vector.

	}
	
	\item{syntheticNMF}{
		Generate a synthetic matrix according to an underlying NMF model. 
		It can be used to quickly test NMF algorithms.	
	}

}

}

\seealso{	
	\code{\linkS4class{NMF}}  
}

\references{

 	\emph{Metagenes and molecular pattern discovery using matrix factorization}
	Brunet, J. P., Tamayo, P., Golub, T. R., and Mesirov, J. P. (2004)
	Proc Natl Acad Sci U S A
	101(12), 4164--4169.

	\emph{Sparse non-negative matrix factorizations via alternating non-negativity-constrained least squares for microarray data analysis}
	Kim, H. & Park, H. (2007)
	Bioinformatics. 
	\url{http://dx.doi.org/10.1093/bioinformatics/btm134}.
	
	Hoyer, P. O. (2004)
	Non-negative Matrix Factorization with Sparseness Constraints
	\emph{Journal of Machine Learning Research} 5 (2004) 1457--1469
}

\author{ Renaud Gaujoux \email{renaud@cbio.uct.ac.za} }

\examples{


# generate a synthetic dataset with known classes
n <- 50; counts <- c(5, 5, 8);
V <- syntheticNMF(n, counts, noise=TRUE)
\dontrun{metaHeatmap(V)}

# build the class factor
groups <- as.factor(do.call('c', lapply(seq(3), function(x) rep(x, counts[x]))))

# perform default NMF
res <- nmf(V, 2)
res

\dontrun{metaHeatmap(res, class=groups)}
\dontrun{metaHeatmap(res, 'features')}
clusters(res)
entropy(res, class=groups)
purity(res, class=groups)

# perform NMF with the right number of basis components
res <- nmf(V, 3)

\dontrun{metaHeatmap(res)}
\dontrun{metaHeatmap(res, 'features')}
entropy(res, class=groups)
purity(res, class=groups)


}

