## We assume that a list of data frames is provided (one per sample), each with the columns
## Chr; Start; N; Alt; Depth; Weight; Genotype; Number of chromosomes; Number of copies; id.
## The number of clusters and the contamination are also assumed to be known.
#' Expectation step calculation
#'
#' Evaluates the likelihood of every mutation under every clone with
#' \code{eval.fik()}, multiplies each clone column by its mixing weight and
#' normalises every row so that it sums to 1.
#'
#' @param Schrod A list of dataframes (one for each sample), generated by the Patient_schrodinger_cellularities() function.
#' @param centers Coordinates of the clones: a list of numeric vectors (1 per sample), with coordinates between 0 and 1.
#' @param weights Proportion of mutation in a clone
#' @param alpha Weight of each possibility / normalization so that the sum of all possibilities weighs equal to 1 mutation
#' @param adj.factor Factor to compute the probability: makes transition between the cellularity of the clone and the frequency observed
#' @return A matrix with the same dimensions as the output of \code{eval.fik()}
#'   (one column per clone). Each row sums to 1, or is entirely 0 when its
#'   weighted likelihoods sum to 0 or less.
#' @keywords E-Step
e.step <- function(Schrod, centers, weights, alpha, adj.factor) {
  f <- eval.fik(Schrod = Schrod, centers = centers, weights = weights,
                alpha = alpha, adj.factor = adj.factor)
  # Column k corresponds to clone k: scale it by that clone's weight.
  f <- sweep(f, MARGIN = 2, STATS = weights, FUN = "*")
  row_total <- rowSums(f)
  # The vector is recycled down the columns, so every row is divided by its
  # own total and the matrix shape is kept even with a single clone.
  resp <- f / row_total
  # Rows without any positive mass cannot be normalised: report all zeros.
  resp[which(row_total <= 0), ] <- 0
  resp
}

# Likelihood of every possibility under every clone.
#
# For each sample i and clone k, the binomial probability of observing
# Schrod[[i]]$Alt variant reads out of Schrod[[i]]$Depth is computed with
# success probability centers[(i - 1) * K + k] * adj.factor[, i], where K is
# length(weights). Probabilities outside [0, 1] yield a likelihood of 0.
# The per-sample matrices are then combined by fik.from.al().
#
# Arguments:
#   Schrod        list of data frames (one per sample) with columns Alt, Depth, id
#   centers       clone coordinates; a list is flattened with unlist()
#   weights       only its length (the number of clones) is used here
#   keep.all.poss forwarded to fik.from.al()
#   alpha         forwarded to fik.from.al()
#   adj.factor    matrix with one column per sample and one row per possibility
# Returns: whatever fik.from.al() returns for the per-sample likelihoods.
eval.fik <- function(Schrod, centers, weights, keep.all.poss = FALSE, alpha, adj.factor) {
  if (is.list(centers)) {
    centers <- unlist(centers)
  }
  n_clones <- length(weights)
  n_rows <- nrow(Schrod[[1]])
  al <- lapply(seq_along(Schrod), function(i) { # i is a sample
    alt <- Schrod[[i]]$Alt
    depth <- Schrod[[i]]$Depth
    lik <- matrix(nrow = n_rows, ncol = n_clones)
    for (k in seq_len(n_clones)) { # k is a clone
      prob <- centers[(i - 1) * n_clones + k] * adj.factor[, i]
      invalid <- prob > 1 | prob < 0
      # dbinom() needs a valid probability; the affected entries are
      # overwritten with 0 right after the call.
      prob[invalid] <- 0
      clone_lik <- stats::dbinom(x = alt, size = depth, prob = prob)
      clone_lik[invalid] <- 0
      lik[, k] <- clone_lik
    }
    lik
  })
  fik.from.al(al, Schrod[[1]]$id, keep.all.poss, alpha)
}

#' List product
#'
#' Multiplies all elements of a list together, element-wise, from left to
#' right (e.g. the result is a vector if the elements of the list are vectors).
#' A list with a single element yields that element (or its selected column).
#' @param L list used
#' @param col If it is a list of matrices or data frames and only one column
#'   should be used, name (or index) of that column.
#' @return The element-wise product of the elements of \code{L}, or of their
#'   \code{col} column when \code{col} is given.
#' @keywords List handling
list_prod <- function(L, col = NULL) {
  terms <- if (is.null(col)) {
    L
  } else {
    lapply(L, function(item) item[, col])
  }
  # Left fold seeded with the first term, so a one-element list returns it as is.
  Reduce("*", terms[-1], terms[[1]])
}

# Combine per-sample likelihood matrices into one matrix.
#
# Arguments:
#   al            list of matrices (one per sample), rows = possibilities,
#                 columns = clones
#   id            mutation identifier of every row; rows sharing an id are
#                 alternative possibilities for the same mutation
#   keep.all.poss if TRUE, return one row per possibility instead of one row
#                 per mutation
#   alpha         weight of every row (defaults to 1 for all rows)
# Returns:
#   keep.all.poss = TRUE : alpha^2 * product of the matrices in `al`
#   keep.all.poss = FALSE: matrix with one row per unique id (in order of first
#                 appearance) holding the sum of alpha * product over the rows
#                 of that id; exact zeros are replaced by .Machine$double.xmin.
fik.from.al <- function(al, id, keep.all.poss, alpha = NULL) {
  if (is.null(alpha)) {
    alpha <- rep(1, times = length(id))
  }
  joint <- alpha * Reduce("*", al)
  if (keep.all.poss) {
    # NOTE(review): alpha is applied a second time here (joint already contains
    # it). Kept as in the original implementation -- confirm this is intended.
    return(joint * alpha)
  }
  # Sum the possibilities of each mutation. rowsum() keeps the matrix shape
  # even with a single clone column; reorder = FALSE keeps ids in order of
  # first appearance.
  fik <- rowsum(joint, group = id, reorder = FALSE)
  dimnames(fik) <- NULL
  fik[fik == 0] <- .Machine$double.xmin ## replace by machine limit to avoid the log(0) issue
  fik
}

# Variant of eval.fik() for the maximization step: same likelihood matrix, but
# every exact zero is replaced by .Machine$double.xmin so that the caller can
# take its logarithm.
eval.fik.m <- function(Schrod, centers, weights, alpha, adj.factor) {
  lik <- eval.fik(Schrod = Schrod, centers = centers, weights = weights,
                  alpha = alpha, adj.factor = adj.factor)
  replace(lik, lik == 0, .Machine$double.xmin)
}

#' Maximization step
#'
#' Optimization of clone positions and proportion of mutations in each clone, based on the previously calculated expectation.
#' The new weights are the column means of \code{fik}; the new clone positions
#' minimise \code{-sum(fik * log(eval.fik.m(...)))} with L-BFGS-B, every
#' coordinate being bounded to [0, 1].
#' @param fik Matrix giving the probability of each mutation to belong to a specific clone
#' @param Schrod A list of dataframes (one for each sample), generated by the Patient_schrodinger_cellularities() function.
#' @param previous.weights Weights from the previous optimization step (used as priors for this step)
#' @param previous.centers Clone coordinates from previous optimization step (used as priors for this step)
#' @param contamination Numeric vector with the fraction of normal cells contaminating the sample (not used by the current implementation)
#' @param alpha Weight of each possibility / normalization so that the sum of all possibilities weighs equal to 1 mutation
#' @param adj.factor Factor to compute the probability: makes transition between the cellularity of the clone and the frequency observed
#' @return A list with elements \code{weights} (new clone weights),
#'   \code{centers} (optimised coordinates as a flat numeric vector) and
#'   \code{val} (value of the minimised objective); \code{NA} if the optimiser
#'   does not return a list.
#' @keywords EM Maximization
m.step <- function(fik, Schrod, previous.weights, previous.centers, contamination, alpha, adj.factor) {
  weights <- colMeans(fik)
  start <- unlist(previous.centers)
  # Negative expected log-likelihood as a function of the flattened centers.
  neg_expected_loglik <- function(x) {
    -sum(fik * log(eval.fik.m(Schrod = Schrod, centers = x, alpha = alpha,
                              adj.factor = adj.factor, weights = previous.weights)))
  }
  fit <- stats::optim(par = start, fn = neg_expected_loglik, method = "L-BFGS-B",
                      lower = rep(0, times = length(start)),
                      upper = rep(1, times = length(start)))
  if (!is.list(fit)) {
    return(NA)
  }
  # optim() names the objective `value`; use the full name instead of relying
  # on partial matching of `$val`.
  list(weights = weights, centers = fit$par, val = fit$value)
}

# Factor used to compute the probability of the binomial distribution.
#
# For sample s the factor of every row is NC * (1 - contamination[s]) / NCh,
# read from the NC and NCh columns of Schrod[[s]] (presumably the number of
# copies and the number of chromosomes -- confirm against the data producer).
# Returns a matrix with one row per row of Schrod[[1]] and one column per sample.
Compute.adj.fact <- function(Schrod, contamination) {
  sample_count <- length(Schrod)
  row_count <- nrow(Schrod[[1]])
  factors <- matrix(nrow = row_count, ncol = sample_count)
  for (s in seq_len(sample_count)) {
    sample_df <- Schrod[[s]]
    tumour_fraction <- 1 - contamination[s]
    factors[, s] <- sample_df$NC * tumour_fraction / sample_df$NCh
  }
  factors
}
#' Expectation Maximization algorithm
#'
#' Optimization of clone positions and proportion of mutations in each clone.
#' Alternates \code{e.step()} and \code{m.step()} until the largest absolute
#' change in any weight or clone coordinate is at most \code{epsilon}.
#' @param Schrod A list of dataframes (one for each sample), generated by the Patient_schrodinger_cellularities() function.
#' @param nclust Number of clones to look for (mandatory if prior_center or prior_weight are null)
#' @param prior_center Clone coordinates (from another analysis) to be used: a list of numeric vectors, one per sample.
#'   When NULL, starting coordinates are drawn with \code{create_priors()}.
#' @param prior_weight Prior on the fraction of mutation in each clone (uniform when NULL)
#' @param contamination Numeric vector with the fraction of normal cells contaminating the sample
#' @param epsilon Stopping condition for the algorithm: what is the minimal tolerated difference of position or weighted between two steps
#' @return A list with elements \code{fik} (likelihood of every possibility
#'   under every clone, from \code{eval.fik(keep.all.poss = TRUE)}),
#'   \code{weights}, \code{centers} (list of numeric vectors, one per sample)
#'   and \code{val} (objective value of the last maximization step, NULL if no
#'   step completed).
#' @keywords EM
EM.algo <- function(Schrod, nclust = NULL, prior_center = NULL, prior_weight = NULL, contamination, epsilon = 10**(-2)) {
  if (is.null(prior_weight)) {
    prior_weight <- rep(1 / nclust, times = nclust)
  }
  cur.weight <- prior_weight
  if (is.null(prior_center)) {
    # One vector per sample: nclust - 1 random coordinates plus a clone at 1.
    prior_center <- create_priors(nclust = nclust, nsample = length(Schrod))
  }
  cur.center <- prior_center
  cur.val <- NULL

  adj.factor <- Compute.adj.fact(Schrod = Schrod, contamination = contamination)
  alpha <- list_prod(L = Schrod, col = "alpha")

  delta <- 1
  while (delta > epsilon) {
    tik <- e.step(Schrod = Schrod, centers = cur.center, weights = cur.weight,
                  alpha = alpha, adj.factor = adj.factor)
    m <- m.step(fik = tik, Schrod = Schrod, previous.weights = cur.weight,
                previous.centers = cur.center, contamination = contamination,
                alpha = alpha, adj.factor = adj.factor)
    if (!is.list(m)) {
      # The maximization step failed: keep the last valid estimate.
      break
    }
    n.weights <- unlist(m$weights)
    # m$centers is flat; cut it back into one vector per sample.
    per_sample <- length(cur.center[[1]])
    n.centers <- lapply(seq_along(cur.center), function(i) {
      m$centers[((i - 1) * per_sample + 1):(i * per_sample)]
    })

    delta <- max(abs(c(n.weights, unlist(n.centers)) - c(cur.weight, unlist(cur.center))))
    cur.weight <- n.weights
    cur.center <- n.centers
    cur.val <- m$val
  }
  fik <- eval.fik(Schrod = Schrod, centers = cur.center, weights = cur.weight,
                  keep.all.poss = TRUE, alpha = alpha, adj.factor = adj.factor)
  list(fik = fik, weights = cur.weight, centers = cur.center, val = cur.val)
}

#' Data filter
#'
#' Keep one possibility per position and adjust weight accordingly.
#' For every mutation id with several rows, the row holding the largest value
#' of \code{fik} is kept; when several rows hold that value, the one with the
#' largest row sum is kept.
#' @param Schrod A list of dataframes (one for each sample), generated by the Patient_schrodinger_cellularities() function.
#'   The \code{id} column of the first data frame defines which rows belong together.
#' @param fik matrix of probability of each possibility to belong to a clone (one row per row of the data frames)
#' @return \code{Schrod} restricted to the kept rows, with the \code{alpha}
#'   column of every data frame set to 1.
#' @keywords filter
filter_on_fik <- function(Schrod, fik) {
  ids <- Schrod[[1]]$id
  keep <- unlist(lapply(unique(ids), function(current) {
    rows <- which(ids == current)
    if (length(rows) <= 1) {
      return(rows)
    }
    candidates <- fik[rows, , drop = FALSE]
    best <- max(candidates)
    # Compare numerically: matching the printed number as a regular expression
    # also hits unrelated values (e.g. "0.5" matches "0.015").
    hits <- which(rowSums(candidates == best) > 0)
    if (length(hits) > 1) {
      # Several possibilities reach the maximum: prefer the largest total.
      hits <- hits[which.max(rowSums(candidates[hits, , drop = FALSE]))]
    }
    rows[hits]
  }))
  result <- Schrod
  for (s in seq_along(Schrod)) {
    result[[s]] <- result[[s]][keep, , drop = FALSE]
    result[[s]]$alpha <- 1
  }
  result
}

#' Expectation Maximization algorithm
#'
#' Optimization of clone positions and proportion of mutations in each clone followed
#' by filtering on most likely possibility for each mutation and a re-optimization.
#' @param Schrod A list of dataframes (one for each sample), generated by the Patient_schrodinger_cellularities() function.
#' @param nclust Number of clones to look for
#' @param prior_center Clone coordinates (from another analysis) to be used
#' @param prior_weight Prior on the fraction of mutation in each clone. Replaced
#'   by uniform weights when NULL or when its length differs from \code{nclust}.
#' @param contamination Numeric vector with the fraction of normal cells contaminating the sample
#' @param epsilon Stopping condition for the algorithm: what is the minimal tolerated difference of position or weighted between two steps
#' @return A list with \code{EM.output} (output of the second \code{EM.algo()}
#'   run) and \code{filtered.data} (the data kept by \code{filter_on_fik()}).
#' @keywords EM
FullEM <- function(Schrod, nclust, prior_center, prior_weight = NULL, contamination, epsilon = 5 * 10**(-3)) {
  # Only fall back to uniform weights when the supplied ones are unusable.
  if (is.null(prior_weight) || length(prior_weight) != nclust) {
    prior_weight <- rep(1 / nclust, times = nclust)
  }
  E_out <- EM.algo(Schrod = Schrod, nclust = nclust,
                   prior_center = prior_center, prior_weight = prior_weight,
                   contamination = contamination, epsilon = epsilon)
  F_out <- NULL
  if (is.list(E_out)) {
    F_out <- filter_on_fik(Schrod = Schrod, fik = E_out$fik)
    # Restart from the first-pass estimate on the filtered data.
    E_out <- EM.algo(Schrod = F_out, nclust = nclust, prior_center = E_out$centers,
                     prior_weight = E_out$weights, contamination = contamination,
                     epsilon = epsilon)
  }
  list(EM.output = E_out, filtered.data = F_out)
}

#' Clonal fraction prior creation
#'
#' Semi-random generation of clonal priors. Every sample gets \code{nclust}
#' coordinates:
#' \itemize{
#'   \item without \code{prior}: \code{nclust - 1} uniform draws followed by 1;
#'   \item with fewer known clones than \code{nclust}: the known coordinates are
#'     completed with uniform draws (and a clone at 1 when no known clone has
#'     a product of coordinates across samples equal to 1);
#'   \item otherwise: \code{nclust - 1} known clones are drawn at random (the
#'     same clones for every sample) and completed with the ancestral clone,
#'     i.e. the first clone whose product of coordinates across samples
#'     exceeds \code{0.95^nsample}, or with 1 when there is none.
#' }
#' @param nclust Number of clones to look for.
#' @param nsample Number of samples
#' @param prior Possible priors known: a list with one numeric vector per sample (the position of each element in a vector corresponds to 1 clone)
#' @return A list of \code{nsample} numeric vectors.
#' @keywords EM
create_priors <- function(nclust, nsample, prior = NULL) {
  samples <- seq_len(nsample)
  if (is.null(prior)) {
    return(lapply(samples, function(i) {
      c(stats::runif(n = nclust - 1, min = 0, max = 1), 1)
    }))
  }
  # Product of the coordinates of each clone across samples.
  joint <- Reduce("*", prior)
  n_known <- length(prior[[1]])
  if (n_known < nclust) { ## Need to complete the list
    if (any(joint == 1)) { ## there is an ancestral clone in the priors given
      return(lapply(samples, function(i) {
        c(prior[[i]], stats::runif(n = nclust - length(prior[[i]])))
      }))
    }
    ## need to add ancestral clone
    return(lapply(samples, function(i) {
      c(prior[[i]], stats::runif(n = nclust - 1 - length(prior[[i]])), 1)
    }))
  }
  ## need to remove elements
  ancestral <- which(joint > 0.95**nsample)
  has_ancestral <- length(ancestral) > 0
  pool <- seq_len(n_known)
  if (has_ancestral) {
    ancestral <- ancestral[1]
    # The ancestral clone is appended separately; do not draw it twice.
    pool <- setdiff(pool, ancestral)
  }
  # Draw clone indices once so that position k designates the same clone in
  # every sample. sample.int() avoids the sample(x) pitfall for length-1 x.
  picked <- pool[sample.int(length(pool), size = nclust - 1, replace = FALSE)]
  lapply(samples, function(i) {
    last <- if (has_ancestral) prior[[i]][ancestral] else 1
    c(prior[[i]][picked], last)
  })
}

# Convert the arguments to a list with as.list() and pass the result through c().
add.to.list <- function(...) {
  converted <- as.list(...)
  c(converted)
}

#' Expectation Maximization algorithm
#'
#' Optimization of clone positions and proportion of mutations in each clone followed
#' by filtering on most likely possibility for each mutation and a re-optimization.
#' \code{FullEM()} is run \code{maxit} times from priors generated by
#' \code{create_priors()}, and the run with the smallest
#' \code{EM.output$val} (the objective minimised in the maximization step) is returned.
#' Relies on foreach when \code{ncores > 1}.
#' @param Schrod A list of dataframes (one for each sample), generated by the Patient_schrodinger_cellularities() function.
#' @param nclust Number of clones to look for
#' @param epsilon Stopping condition for the algorithm: what is the minimal tolerated difference of position or weighted between two steps
#' @param contamination Numeric vector with the fraction of normal cells contaminating the sample
#' @param prior_center Clone coordinates (from another analysis) to be used
#' @param prior_weight Prior on the fraction of mutation in each clone
#' @param maxit Maximal number of independant initial condition tests to be tried
#' @param ncores Number of CPUs to be used
#' @return The output of the best \code{FullEM()} run (the first run when no
#'   run reports an objective value).
#' @import foreach
#' @importFrom doParallel registerDoParallel
#' @importFrom parallel makeCluster
#' @importFrom parallel stopCluster
#' @keywords EM
parallelEM <- function(Schrod, nclust, epsilon, contamination, prior_center = NULL, prior_weight = NULL, maxit = 1, ncores = 2) {
  if (maxit < 1) {
    stop("maxit must be at least 1", call. = FALSE)
  }
  if (ncores > 1) {
    cl <- parallel::makeCluster(ncores)
    # Release the workers even when one of the runs fails.
    on.exit(parallel::stopCluster(cl), add = TRUE)
    doParallel::registerDoParallel(cl)

    result <- foreach::foreach(i = seq_len(maxit), .export = c("FullEM","EM.algo","create_priors",
                                                               "add.to.list","e.step","m.step","list_prod",
                                                               "Compute.adj.fact","eval.fik","eval.fik.m",
                                                               "fik.from.al","filter_on_fik")) %dopar% {
      FullEM(Schrod = Schrod, nclust = nclust, prior_weight = prior_weight,
             contamination = contamination, epsilon = epsilon,
             prior_center = create_priors(nclust = nclust, nsample = length(Schrod), prior = prior_center))
    }
  } else {
    result <- lapply(seq_len(maxit), function(i) {
      FullEM(Schrod = Schrod, nclust = nclust, prior_weight = prior_weight,
             contamination = contamination, epsilon = epsilon,
             prior_center = create_priors(nclust = nclust, nsample = length(Schrod), prior = prior_center))
    })
  }
  # Objective value of every run; NA when a run did not produce one.
  vals <- vapply(result, function(run) {
    v <- run$EM.output$val
    if (is.null(v) || length(v) != 1) NA_real_ else as.numeric(v)
  }, numeric(1))
  best <- which.min(vals)
  if (length(best) == 0) {
    best <- 1L
  }
  result[[best]]
}

#' Hard clustering based on EM output
#'
#' Attributes a mutation to its most likely clone based on the output of the EM algorithm.
#' When several clones share the maximal value of a row, one of them is drawn
#' at random with probability proportional to the clone weights (uniformly
#' when the shared maximum is not positive or the weights sum to 0).
#' @param EM_out A list with elements \code{fik} (matrix, one row per mutation
#'   and one column per clone) and \code{weights}, such as the output of EM.algo.
#' @return For every row of \code{EM_out$fik}, the index of the attributed
#'   clone (\code{NA} when the maximum of the row cannot be determined).
#' @keywords EM Hard clustering
hard.clustering <- function(EM_out) {
  weights <- EM_out$weights
  assign_clone <- function(z) {
    top <- which(z == max(z))
    if (length(top) == 0) {
      # Happens when the row contains missing values.
      return(NA_integer_)
    }
    if (length(top) == 1) {
      return(top)
    }
    ### Several clones are tied
    if (max(z) > 0) {
      # Attribute with probability proportional to the weight.
      w <- weights[top]
      prob <- if (isTRUE(sum(w) > 0)) w / sum(w) else NULL
      return(sample(x = top, size = 1, prob = prob))
    }
    # No clone has positive support: draw exactly one clone uniformly.
    sample(seq_along(z), size = 1)
  }
  apply(X = EM_out$fik, MARGIN = 1, FUN = assign_clone)
}

#' Bayesian Information Criterion
#'
#' Computes BIC from a list of outputs of EM algorithm, then returns the position with minimal BIC.
#' For every run the criterion is \code{2 * val + k * log(n)}, where
#' \code{val} is \code{EM.output$val}, \code{k} the length of the first
#' element of \code{EM.output$centers} and \code{n} the number of rows of the
#' \code{fik} matrix of the first run.
#' @param EM_out_list list of outputs from FullEM (each with an \code{EM.output} element)
#' @return Position of the run with minimal BIC; 0 when \code{EM_out_list} is
#'   empty. Runs without an objective value are ranked last.
#' @keywords EM clustering number
BIC_criterion <- function(EM_out_list) {
  if (length(EM_out_list) == 0) {
    return(0)
  }
  n_mut <- dim(EM_out_list[[1]]$EM.output$fik)[1]
  bic <- vapply(EM_out_list, function(run) {
    val <- run$EM.output$val
    if (is.null(val) || length(val) != 1) {
      return(NA_real_)
    }
    2 * val + length(run$EM.output$centers[[1]]) * log(n_mut)
  }, numeric(1))
  # order() puts NA last, so runs without a value are never preferred.
  order(bic)[1]
}
#' Expectation Maximization
#'
#' Maximization of the likelihood given a mixture of binomial distributions.
#' \code{parallelEM()} is run once for every number of clones in
#' \code{nclone_range}; the run selected by \code{BIC_criterion()} is returned
#' together with the hard clustering of its mutations.
#' @param Schrod List of dataframes, output of the Schrodinger function or the EM algorithm
#' @param contamination The fraction of normal cells in the sample
#' @param prior_weight If known a list of priors (fraction of mutations in a clone) to be used in the clustering
#' @param clone_priors If known a list of priors (cell prevalence) to be used in the clustering
#' @param maxit Maximal number of independant initial condition tests to be tried
#' @param nclone_range Number of clusters to look for
#' @param epsilon Stop value: maximal admitted value of the difference in cluster position and weights between two optimization steps.
#' @param ncores Number of CPUs to be used
#' @return The selected \code{parallelEM()} output with an additional
#'   \code{cluster} element produced by \code{hard.clustering()}.
#' @keywords EM clustering number
EM_clustering <- function(Schrod, contamination, prior_weight = NULL, clone_priors = NULL, maxit = 8, nclone_range = 2:5, epsilon = 5 * (10**(-3)), ncores = 2) {
  if (length(nclone_range) == 0) {
    stop("nclone_range must contain at least one number of clones", call. = FALSE)
  }
  list_out_EM <- lapply(seq_along(nclone_range), function(i) {
    parallelEM(Schrod = Schrod, nclust = nclone_range[i], epsilon = epsilon,
               contamination = contamination, prior_center = clone_priors,
               prior_weight = prior_weight, maxit = maxit, ncores = ncores)
  })
  result <- list_out_EM[[BIC_criterion(EM_out_list = list_out_EM)]]
  result$cluster <- hard.clustering(EM_out = result$EM.output)
  result
}
