#' QTL search by MIM with Seletive Genotyping
#'
#' Expectation-maximization algorithm for QTL multiple interval mapping.
#' Find one more QTL in the presence of some known QTLs. This funtion
#' can handle the genotype witch is seletive genotyping.
#'
#' @param QTL matrix. A q*2 matrix contains the QTL information, where
#' the row dimension q is the number of QTLs in the chromosomes. The
#' first column labels the chromosomes where the QTLs are located, and
#' the second column labels the positions of QTLs (in morgan (M) or
#' centimorgan (cM)). Note that chromosomes and positions must be divided
#' in order.
#' @param marker matrix. A k*2 matrix contains the marker information,
#' where the row dimension k is the number of markers in the chromosomes.
#' The first column labels the chromosomes where the markers are located,
#' and the second column labels the positions of QTLs (in morgan (M) or
#' centimorgan (cM)). Note that chromosomes and positions must be divided
#' in order.
#' @param geno matrix. A n*k matrix contains the k markers of the n
#' individuals. The marker genotypes of P1 homozygote (MM),
#' heterozygote (Mm), and P2 homozygote (mm) are coded as 2, 1, and 0,
#' respectively, and NA for missing value.
#' @param y vector. A vector that contains the phenotype values of
#' individuals with genotypes.
#' @param yu vector. A vector that contains the phenotype value
#' of the individuals without genotypes.
#' @param sele.g character. If sele.g="n", it will consider that the
#' data is not a selective genotyping data but the complete genotyping
#' data. If sele.g="p", it will consider that the data is a selective
#' genotyping data, and use the proposed model (Lee 2014) to analyze.
#' If sele.g="t", it will consider that the data is a selective
#' genotyping data, and use the truncated model (Lee 2014) to analyze.
#' If sele.g="f, it will consider that the data is a selective
#' genotyping data, and use the frequency-based model (Lee 2014) to
#' analyze. Note that the yu must be input when sele.g="p" of "f".
#' @param tL numeric. The lower truncation point of phenotype value
#' when sele.g="t". Note that when sele.g="t" and tL=NULL, the yu
#' must be input and the function will consider the minimum of yu
#' as the lower truncation point.
#' @param tR numeric. The upper truncation point of phenotype value
#' when sele.g="t". Note that when sele.g="t" and tR=NULL, the yu
#' must be input and the function will consider the maximum of yu
#' as the upper truncation point.
#' @param method character. method="EM" means the interval mapping method
#' by Lander and Botstein (1989) is used in the analysis, while
#' method="REG" means  the approximate regression interval mapping method
#' by Haley and Knott (1992) is considered in the analysis.
#' @param type character. The population type of the dataset. Include
#' backcross (type="BC"), advanced intercross population (type="AI"), and
#' recombinant inbred population (type="RI").
#' @param ng integer. The generation number of the population type. For
#' example, the BC1 population is type="BC" with ng=1; the AI F3
#' population is type="AI" with ng=3.
#' @param D.matrix matrix. The design matrix of QTL effects is a g*p matrix,
#' where g is the number of possible QTL genotypes, and p is the number of
#' effects considered in the MIM model. Note that the QTL number of the
#' design matrix must be the original QTL number plus one. The design
#' matrix can be easily generated by the function D.make(). If being NULL,
#' it will automatically generate a design matrix with all additive and
#' dominant effects and without any epistasis effect.
#' @param cM logical. Specify the unit of marker position. cM=TRUE for
#' centi-Morgan. Or cM=FALSE for Morgan.
#' @param speed numeric. The walking speed of the QTL search (in cM).
#' @param QTLdist numeric. The minimum distance (cM) among different
#' linked significant QTL. The position near the position of the known
#' QTLs under this distance will not be considered as the candidate position
#' in the search process.
#' @param link logical. If being False, positions on the same chromosomes
#' as the known QTLs will not be searched.
#' @param crit numeric. The convergence criterion of the EM algorithm.
#' The E and M steps will be iterated until a convergence criterion
#' is satisfied. It must be between 0 and 1.
#' @param console logical. To decide whether the process of the algorithm
#' will be shown in the R console or not.
#'
#' @return
#' \item{effect}{The estimated effects, log-likelihood value, and LRT
#' statistics of all searched positions.}
#' \item{QTL.best}{The positions of the best QTL combination.}
#' \item{effect.best}{The estimated effects and LRT statistics of the best
#' QTL combination.}
#' \item{model}{The model of selective genotyping data in this analyze.}
#'
#' @export
#'
#' @references
#'
#' KAO, C.-H. and Z.-B. ZENG 1997 General formulas for obtaining the maximum
#' likelihood estimates and the asymptotic variance-covariance matrix in QTL
#' mapping when using the EM algorithm. Biometrics 53, 653-665.
#'
#' KAO, C.-H., Z.-B. ZENG and R. D. TEASDALE 1999 Multiple interval mapping
#' for Quantitative Trait Loci. Genetics 152: 1203-1216.
#'
#' H.-I LEE, H.-A. HO and C.-H. KAO 2014 A new simple method for improving
#' QTL mapping under selective genotyping. Genetics 198: 1685-1698.
#'
#' @seealso
#' \code{\link[QTLEMM]{EM.MIM2}}
#' \code{\link[QTLEMM]{MIM.search}}
#'
#' @examples
#' # load the example data
#' load(system.file("extdata", "exampledata.RDATA", package = "QTLEMM"))
#'
#' # make the seletive genotyping data
#' ys <- y[y > quantile(y)[4] | y < quantile(y)[2]]
#' yu <- y[y >= quantile(y)[2] & y <= quantile(y)[4]]
#' geno.s <- geno[y > quantile(y)[4] | y < quantile(y)[2],]
#'
#' # run and result
#' QTL <- c(1, 23)
#' result <- MIM.search2(QTL, marker, geno.s, ys, yu, sele.g = "p",
#'  type = "RI", ng = 2, speed = 15, QTLdist = 50)
#' result$QTL.best
#' result$effect.best
MIM.search2 <- function(QTL, marker, geno, y, yu = NULL, sele.g = "n", tL = NULL, tR = NULL, method = "EM",
                        type = "RI", D.matrix = NULL, ng = 2, cM = TRUE, speed = 1, QTLdist = 15, link = TRUE,
                        crit = 10^-3, console = TRUE){

  if(is.null(QTL) | is.null(marker) | is.null(geno) |  is.null(y)){
    stop("Input data is missing, please cheak and fix.", call. = FALSE)
  }

  genotest <- table(geno)
  datatry <- try(geno*geno, silent=TRUE)
  if(class(datatry)[1] == "try-error" | FALSE %in% (names(genotest) %in% c(0, 1, 2))  | !is.matrix(geno)){
    stop("Genotype data error, please cheak your genotype data.", call. = FALSE)
  }

  marker <- as.matrix(marker)
  markertest <- c(ncol(marker) != 2, NA %in% marker, marker[,1] != sort(marker[,1]), nrow(marker) != ncol(geno))
  datatry <- try(marker*marker, silent=TRUE)
  if(class(datatry)[1] == "try-error" | TRUE %in% markertest){
    stop("Marker data error, or the number of marker does not match the genetype data.", call. = FALSE)
  }

  QTL <- as.matrix(QTL)
  if(ncol(QTL) != 2){QTL <- t(QTL)}
  datatry <- try(QTL*QTL, silent=TRUE)
  if(class(datatry)[1] == "try-error" | ncol(QTL) != 2 | NA %in% QTL | max(QTL[, 1]) > max(marker[, 1])){
    stop("QTL data error, please cheak your QTL data.", call. = FALSE)
  }

  for(i in 1:nrow(QTL)){
    ch0 <- QTL[i, 1]
    q0 <- QTL[i, 2]
    if(!ch0 %in% marker[, 1]){
      stop("The specified QTL is not in the position that can estimate by the marker data.", call. = FALSE)
    }
    if(q0 > max(marker[marker[, 1] == ch0, 2]) | q0 < min(marker[marker[, 1] == ch0, 2])){
      stop("The specified QTL is not in the position that can estimate by the marker data.", call. = FALSE)
    }
  }

  y[is.na(y)] <- mean(y,na.rm = TRUE)

  datatry <- try(y%*%geno, silent=TRUE)
  if(class(datatry)[1] == "try-error"){
    stop("Phenotype data error, or the number of individual does not match the genetype data.", call. = FALSE)
  }

  if(!is.null(yu)){
    yu <- yu[!is.na(yu)]
    datatry <- try(yu%*%yu, silent=TRUE)
    if(class(datatry)[1] == "try-error"){
      stop("yu data error, please check your yu data.", call. = FALSE)
    }
  }

  if(!sele.g[1] %in% c("n", "t", "p", "f") | length(sele.g)!=1){
    stop("Parameter sele.g error, please check and fix.", call. = FALSE)
  }

  if(sele.g == "t"){
    lrtest <- c(tL[1] < min(c(y,yu)), tR[1] > max(c(y,yu)), tR[1] < tL[1],
                length(tL) > 1, length(tR) > 1)
    datatry <- try(tL[1]*tR[1], silent=TRUE)
    if(class(datatry)[1] == "try-error" | TRUE %in% lrtest){
      stop("Parameter tL or tR error, please check and fix.", call. = FALSE)
    }
    if(is.null(tL) | is.null(tR)){
      if(is.null(yu)){
        stop("yu data error, the yu data must be input for truncated model when parameter tL or tR is not set.", call. = FALSE)
      }
    }
  }

  nq <- nrow(QTL)+1

  if(!type[1] %in% c("AI","RI","BC") | length(type) > 1){
    stop("Parameter type error, please input AI, RI, or BC.", call. = FALSE)
  }

  if(!is.null(D.matrix)){
    D.matrix <- as.matrix(D.matrix)
    datatry <- try(D.matrix*D.matrix, silent=TRUE)
    if(type == "BC"){dn0 <- 2
    } else {dn0 <- 3}
    if(class(datatry)[1] == "try-error" | NA %in% D.matrix | nrow(D.matrix) != dn0^nq){
      stop("Parameter D.matrix error, or the combination of genotypes in design matrix is error.",
           call. = FALSE)
    }
  }

  if(!is.numeric(ng) | length(ng) > 1 | min(ng) < 1){
    stop("Parameter ng error, please input a positive integer.", call. = FALSE)
  }
  ng <- round(ng)

  if(!cM[1] %in% c(0,1) | length(cM > 1)){cM <- TRUE}
  if(!link[1] %in% c(0,1) | length(link > 1)){link <- TRUE}

  if(!is.numeric(speed) | length(speed) > 1 | min(speed) < 0){
    stop("Parameter speed error, please input a positive number.", call. = FALSE)
  }

  if(!is.numeric(QTLdist) | length(QTLdist) > 1 | min(QTLdist) < speed*2){
    stop("Parameter QTLdist error, please input a bigger positive number.", call. = FALSE)
  }

  if(!is.numeric(crit) | length(crit) > 1 | min(crit) <= 0 | max(crit) >= 1){
    stop("Parameter crit error, please input a positive number between 0 and 1.", call. = FALSE)
  }

  if(!console[1] %in% c(0,1) | length(console) > 1){console <- TRUE}

  if(!cM){
    QTL <- QTL*100
    marker[, 2] <- marker[, 2]*100
  }

  if(is.null(D.matrix)){
    D.matrix <- D.make(as.numeric(nq), type = type)
  }

  if(console){cat("chr", "cM", "LRT", "log.likelihood", "known QTL", "\n", sep = "\t")}

  if(method == "EM"){
    meth <- function(QTL, marker, geno, D.matrix, y, yu, tL, tR, type, ng, sele.g, crit){
      EM <- EM.MIM2(QTL, marker, geno, D.matrix, y = y, yu = yu, tL = tL, tR = tR, type = type,
                    ng = ng, sele.g = sele.g, crit = crit, console = FALSE)
      eff <- as.numeric(EM$E.vector)
      mu0 <- as.numeric(EM$beta)
      sigma <- sqrt(as.numeric(EM$variance))
      LRT <- EM$LRT
      model <- EM$model
      R2 <- EM$R2
      like <- EM$log.likelihood
      result <- list(eff, mu0, sigma, LRT, like, R2, model)
      return(result)
    }
    if(sele.g == "p" | sele.g == "f"){
      ya <- c(y, yu)
    } else {
      ya <- y
    }
  } else if (method=="REG"){
    if(sele.g == "p" | sele.g == "f"){
      ya <- c(y, yu)
    } else {
      ya <- y
    }

    mixprop <- function(QTL, nu, marker, geno, model, cM = TRUE, type = "RI", ng = 2, cp.matrix = NULL){
      nQTL <- nrow(QTL)
      if(nQTL > 1){QTL <- QTL[order(QTL[, 1], QTL[, 2]),]}
      marker <- marker[order(marker[, 1], marker[, 2]),]

      Q3 <- 3^nQTL
      Q2 <- 2^nQTL
      if(is.null(cp.matrix)){
        cp.matrix <- Q.make(QTL, marker, geno, cM = cM, type = type, ng = ng)[[(nQTL+1)]]
      }
      QTL.freq <- cp.matrix

      if(cM){
        QTL[, 2] <- QTL[, 2]/100
        marker[, 2] <- marker[, 2]/100
      }

      if(model == 2){
        mix.prop <- QTL.freq
        popu.freq <- NULL
        freq.u <- NULL
      } else {
        N <- nrow(QTL.freq)+nu
        freq.s <- colSums(QTL.freq)/N

        if(nQTL == 1){
          if(type == "BC"){
            popu.freq <- c(1-0.5^ng, 0.5^ng)
            Qn <- 2
          } else if (type == "RI"){
            hetro <- 0.5^(ng-1)
            popu.freq <- c((1-hetro)/2, hetro, (1-hetro)/2)
            Qn <- 3
          } else {
            popu.freq <- c(0.25, 0.5, 0.25)
            Qn <- 3
          }
        } else {
          M <- c(1, 0)
          GAM <- matrix(0, Q2, nQTL)
          Pg <- c(2, 1, 0)
          PG <- matrix(0, Q3, nQTL)
          rmn <- rep(0, nQTL-1)
          for(n0 in 1:nQTL){
            GAM[, n0] <- rep(rep(M, 2^(n0-1)), each <- 2^(nQTL-n0))
            PG[, n0] <- rep(rep(Pg, 3^(n0-1)), each <- 3^(nQTL-n0))
            if(n0 == nQTL){break}
            if(QTL[n0, 1]==QTL[(n0+1), 1]){
              rmd <- QTL[(n0+1), 2]-QTL[n0, 2]
              rmn[n0] <- (1-exp(-2*rmd))/2
            }else{
              rmn[n0] <- 0.5
            }
          }

          if(type == "AI" & ng > 2){
            k0 <- seq(0, (ng-1), 2)
            rmn0 <- matrix(0, (nQTL-1), length(k0))
            for(i in 1:length(k0)){
              rmn0[, i] <- choose((ng-1), k0[i])*rmn^k0[i]*(1-rmn)^(ng-1-k0[i])
            }
            rmn <- 1-apply(rmn0, 1, sum)
          }

          gamf <- GAM.f <- matrix(0, Q2, nQTL-1)
          GAM.freq <- rep(1, Q2)
          for(nr in 1:(nQTL-1)){
            gamf[, nr] <- GAM[, nr] == GAM[, nr+1]
            for(ngam in 1:Q2){
              GAM.f[ngam, nr] <- ifelse(gamf[ngam, nr] == 1, 1-rmn[nr], rmn[nr])
            }
            GAM.freq <- GAM.freq*GAM.f[, nr]
          }

          if(type == "BC"){
            popu.freq <- GAM.freq/2
            if(ng > 1){
              G.matrix <- matrix(0, Q2, Q2)
              G.matrix[1, 1] <- 1
              G.matrix[Q2,] <- GAM.freq/2
              for(i in 2:(Q2-1)){
                p0 <- apply(matrix(t(GAM[, which(GAM[i,] != 0)]) == GAM[i, which(GAM[i,] != 0)],
                                   nrow = Q2, byrow = TRUE), 1, sum) == sum(GAM[i,] != 0)
                p1 <- popu.freq[p0]/sum(popu.freq[p0])
                G.matrix[i, p0] <- p1
              }
              for(i in 1:(ng-1)){
                popu.freq <- crossprod(G.matrix, popu.freq)
              }
            }
            Qn <- Q2
          }else{
            Gfreq <- rep(0, (Q2)*(Q2))
            Gtype <- matrix(0, (Q2)*(Q2), nQTL)
            wg <- 0
            for(ga1 in 1:Q2){
              for(ga2 in 1:Q2){
                wg <- wg+1
                Gfreq[wg] <- (GAM.freq[ga1]/2)*(GAM.freq[ga2]/2)
                Gtype[wg,] <- GAM[ga1,]+GAM[ga2,]
              }
            }
            popu.freq <- rep(0, Q3)
            ind <- rep(0, (Q2)*(Q2))
            for(pfi in 1:Q3){
              for(gti in 1:((Q2)*(Q2))){
                if(sum(Gtype[gti,] == PG[pfi,]) == nQTL){ind[gti] <- pfi}
              }
              popu.freq[pfi] <- sum(Gfreq[ind == pfi])
            }

            if(type == "RI" & ng > 2){
              G.matrix <- matrix(0, Q3, Q3)
              G.matrix[1, 1] <- 1
              G.matrix[Q3,Q3] <- 1
              for(i in 2:(Q3-1)){
                p0 <- apply(matrix(t(PG[, which(PG[i,] != 1)]) == PG[i, which(PG[i,] != 1)],
                                   nrow <- Q3, byrow = TRUE), 1, sum) == sum(PG[i,] != 1)
                p1 <- popu.freq[p0]/sum(popu.freq[p0])
                G.matrix[i, p0] <- p1
              }
              for(i in 1:(ng-1)){
                popu.freq <- crossprod(G.matrix, popu.freq)
              }
            }
            Qn <- Q3
          }
        }

        freq.u <- as.vector(popu.freq)-freq.s
        if(model == 3){
          freq.u <- as.vector(popu.freq)
          names(freq.u) <- names(freq.s)
        }
        freq.u[freq.u < 0] <- 0
        Freq.u <- matrix(rep(freq.u/sum(freq.u), each <- nu), nu, Qn)
        mix.prop <- rbind(QTL.freq, Freq.u)
      }
      re <- list(popu.freq = popu.freq, mix.prop = mix.prop, QTL = QTL, marker = marker, freq.u = freq.u)
      return(re)
    }

    meth <- function(QTL, marker, geno, D.matrix, y, yu, tL, tR, type, ng,sele.g, crit){
      mp <- switch(sele.g,
                   p = mixprop(QTL, length(yu), marker, geno, model = 1, cM = cM, type = type, ng = ng)[[2]],
                   t = mixprop(QTL, length(yu), marker, geno, model = 2, cM = cM, type = type, ng = ng)[[2]],
                   f = mixprop(QTL, length(yu), marker, geno, model = 3, cM = cM, type = type, ng = ng)[[2]],
                   n = Q.make(QTL, marker, geno, type = type, ng = ng)$cp.matrix)
      X <- mp%*%D.matrix
      fit <- stats::lm(ya~X)
      eff <- as.numeric(fit$coefficients[-1])
      mu0 <- as.numeric(fit$coefficients[1])
      ms <- stats::anova(fit)$`Mean Sq`
      sigma <- ms[2]^0.5
      R2 <- summary(fit)$r.squared

      L0 <- c()
      L1 <- c()
      for(k in 1:nrow(mp)){
        L00 <- c()
        L01 <- c()
        for(m in 1:nrow(D.matrix)){
          L00[m] <- mp[k, m]*stats::dnorm((ya[k]-mu0)/sigma)
          L01[m] <- mp[k, m]*stats::dnorm((ya[k]-(mu0+D.matrix[m,]%*%eff))/sigma)
        }
        L0[k] <- sum(L00)
        L1[k] <- sum(L01)
      }
      like0 <- sum(log(L0))
      like1 <- sum(log(L1))
      LRT <- 2*(like1-like0)

      result <- list(eff, mu0, sigma, LRT, like1, R2, model = "regression interval mapping model")
      return(result)
    }
  }

  name0 <- cbind(paste("QTL", 1:(nq-1), ".ch", sep = ""), paste("QTL", 1:(nq-1), ".cM", sep = ""))
  name0 <- rbind(name0, c("new.ch", "new.cM"))
  knownQTL <- c()
  for(i in 1:(nq-1)){
    knownQTL <- c(knownQTL, paste("[", QTL[i, 1], ",", QTL[i, 2], "]", sep = ""))
  }

  effect <- c()
  cr0 <- sort(unique(marker[, 1]))
  if(!link){
    cr0 <- cr0[!cr0 %in% QTL[, 1]]
  }
  if(length(cr0) < 1){
    stop("No searchable positions.", call. = FALSE)
  }
  for(i in cr0){
    cr <- marker[marker[, 1] == i,]
    minpos <- min(cr[, 2])+speed
    if(speed%%1 == 0){minpos = ceiling(floor(min(cr[, 2]))+speed)}
    QTLs0 <- seq(minpos, (max(cr[,2])), speed)
    QTLc0 <- QTL[QTL[, 1] == i,]
    if(ncol(as.matrix(QTLc0))==1){
      QTLc0 = t(as.matrix(QTLc0))
    }
    if(nrow(QTLc0) != 0){
      for(k in 1:nrow(QTLc0)){
        QTLs0 <- QTLs0[!(QTLs0 <= QTLc0[k,2]+QTLdist & QTLs0 >= QTLc0[k,2]-QTLdist)]
      }
    }
    for(j in QTLs0){
      QTL0 <- rbind(QTL, c(i,j))
      fit0 <- meth(QTL0, marker, geno, D.matrix, y, yu, tL, tR, type, ng, sele.g, crit)
      effect0 <- c(t(QTL0), fit0[[1]], fit0[[4]], fit0[[5]], fit0[[6]])

      LRT0 <- round(effect0[length(effect0)-2], 3)
      like <- round(effect0[length(effect0)-1], 5)
      if(console){cat(i, j, LRT0, like, knownQTL, "\n", sep = "\t")}
      effect <- rbind(effect, effect0)
    }
    model <- fit0[[7]]
  }
  colnames(effect) <-  c(t(name0), colnames(D.matrix), "LRT", "log.likelihood", "R2")
  row.names(effect) <- 1:nrow(effect)

  b0 <- effect[,c(length(name0)-1, length(name0), ncol(effect)-1)]
  bs <- c()
  for(i in cr0){
    b1 <- b0[b0[,1] == i,]
    eff1 <- effect[b0[,1] == i,]
    b2 <- nrow(b1)
    b3 <- b1[-c(1,b2),3]-b1[-c(b2-1,b2),3]
    b4 <- b1[-c(1,b2),3]-b1[-(1:2),3]
    b5 <- which(b3 > 0 & b4 > 0)
    bs <- rbind(bs, eff1[b5+1,])
  }

  best <- bs[bs[,ncol(bs)-1] == max(bs[,ncol(bs)-1]),]
  QTL.best <- matrix(best[1:(2*nq)], nq, 2, byrow = TRUE)
  colnames(QTL.best) <- c("chromosome", "position(cM)")
  row.names(QTL.best) <- c(paste("QTL", 1:(nq-1)), "QTL new")

  effect.best <- best[-(1:(2*nq))]

  result <- list(effect = effect, QTL.best = QTL.best, effect.best = effect.best, model = model)
  return(result)
}
