### ctrdata package
### utility functions

## variable definitions
#
# EUCTR definitions
countriesEUCTR <- c(
  "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
  "DE", "GR", "HU", "IE", "IT", "LV", "LT", "LU", "MT", "NL",
  "PL", "PT", "RO", "SK", "SE", "SI", "ES", "GB", "IS", "LI",
  "NO", "3RD")
#
# regexpr
# - EUCTR e.g. 2010-022945-52-3RD
regEuctr <- "[0-9]{4}-[0-9]{6}-[0-9]{2}"
# - CTGOV
regCtgov <- "NCT[0-9]{8}"


#' Check and prepare nodbi connection object for ctrdata
#'
#' @param con A \link[nodbi]{src} connection object, as obtained with
#'  nodbi::\link[nodbi]{src_mongo}() or nodbi::\link[nodbi]{src_sqlite}()
#'
#' @keywords internal
#'
#' @importFrom nodbi src_sqlite
#' @importFrom utils capture.output
#'
#' @return Connection object as list, with collection
#'  element under root
#'
ctrDb <- function(
  con = nodbi::src_sqlite(
    collection = "ctrdata_auto_generated")) {

  ## sqlite
  if (inherits(con, "src_sqlite")) {

    if (is.null(con$collection)) {
      stop("In src_sqlite(), a parameter 'collection' needs to specify ",
           "the name of a table, such as src_sqlite(collection = 'test'), ",
           "for package ctrdata to work with other nosql databases.",
           call. = FALSE)
    }

    # check
    if (!RSQLite::dbIsValid(con$con)) {
      con <- nodbi::src_sqlite(dbname = con$dbname,
                               collection = con$collection)
    }

    # add database as element under root
    con <- c(con,
             "db" = con$dbname,
             "ctrDb" = TRUE)

    # print warning from nodbi::src_sqlite()
    if (grepl(":memory:", con$dbname)) {
      warning("Database not persisting,\ncopy to persistant database like ",
              "this:\n\nRSQLite::sqliteCopyDatabase(",
              "\n  from = <your in-memory-database-object>$con,",
              "\n  to = RSQLite::dbConnect(RSQLite::SQLite(),",
              "\n                          dbname = 'local_file.db'))\n",
              call. = FALSE,
              noBreaks. = FALSE,
              immediate. = TRUE)
    }

    ## return
    return(structure(con,
                     class = c("src_sqlite", "docdb_src")))
  }

  ## mongo
  if (inherits(con, "src_mongo")) {

    # rights may be insufficient to call info(),
    # hence this workaround that should always
    # work and be stable to retrieve name of
    # collection in the mongo connection
    # suppress... for reconnect info from mongolite
    coll <- suppressMessages(utils::capture.output(con$con)[1])
    coll <- sub("^.*'(.*)'.*$", "\\1", coll)

    # add collection as element under root
    con <- c(con,
             "collection" = coll,
             "ctrDb" = TRUE)

    ## return
    return(structure(con,
                     class = c("src_mongo", "docdb_src")))
  }

  ## unprepared for other nodbi adapters so far
  stop("Please specify in parameter 'con' a database connection. ",
       "crdata supports so far only src_mongo() and src_sqlite().",
       call. = FALSE)

} # end ctrDb


#' Open advanced search pages of register(s) or execute search in browser
#'
#' @param input Show results of search for \code{queryterm} in
#'   browser. To open the browser with a previous search, (register or)
#'   queryterm can be the output of \link{ctrGetQueryUrl} or can
#'   be one row from \link{dbQueryHistory}.
#'
#' @param register Register(s) to open. Either "EUCTR" or "CTGOV" or a vector of
#'   both. Default is to open both registers' advanced search pages. To open the
#'   browser with a previous search, the output of \link{ctrGetQueryUrl}
#'   or one row from \link{dbQueryHistory} can be used.
#'
#' @param copyright (Optional) If set to \code{TRUE}, opens copyright pages of
#'   register(s).
#'
#' @param ... Any additional parameter to use with browseURL, which is called by
#'   this function.
#'
#' @export
#'
#' @return Is always true, invisibly.
#'
#' @examples
#' \dontrun{
#' ctrOpenSearchPagesInBrowser(
#'  "https://www.clinicaltrialsregister.eu/ctr-search/search?query=cancer")
#'
#' # for this example, the clipboard has to
#' # contain the URL from a search in a register
#' ctrOpenSearchPagesInBrowser(
#'  ctrGetQueryUrl())
#'
#' # open the last query that was
#' # loaded into the database
#' db <- nodbi::src_sqlite(
#'   collection = "previously_created"
#' )
#' ctrOpenSearchPagesInBrowser(
#'   dbQueryHistory(con = db))
#' }
#'
ctrOpenSearchPagesInBrowser <- function(
  input = "",
  register = "",
  copyright = FALSE,
  ...) {

  # check combination of arguments to select action
  #
  if (class(input) == "character" && is.atomic(input) && input == "") {
    #
    # if no register is specified, open both
    if (all(register == "", na.rm = TRUE)) register <- c("EUCTR", "CTGOV")
    #
    # open empty search pages
    if (any(register == "EUCTR"))
      try({
        utils::browseURL(
          "https://www.clinicaltrialsregister.eu/ctr-search/search",
          ...)}, silent = TRUE)
    #
    if (any(register == "CTGOV"))
      try({
        utils::browseURL(
          "https://clinicaltrials.gov/ct2/search/advanced",
          ...)}, silent = TRUE)
    #
    # if requested also show copyright pages
    if (copyright) {
      #
      if (any(register == "EUCTR"))
        try({
          utils::browseURL(
            "https://www.clinicaltrialsregister.eu/disclaimer.html",
            ...)}, silent = TRUE)
      #
      if (any(register == "CTGOV"))
        try({
          utils::browseURL(
            "https://clinicaltrials.gov/ct2/about-site/terms-conditions#Use",
            ...)}, silent = TRUE)
      #
    }
  } else {
    #
    # check input argument and determine action
    #
    # - is a url
    if (class(input) == "character" &&
        is.atomic(input) &&
        length(input) == 1 &&
        grepl("^https.+clinicaltrials.+", input)) {
      #
      input <- ctrGetQueryUrl(url = input)
      #
    }
    #
    # - data frame as returned from ctrQueryHistoryInDb()
    #   and ctrGetQueryUrl()
    if (is.data.frame(input) &&
        all(substr(names(input), 1, 6) == "query-")) {
      #
      nr <- nrow(input)
      #
      if (nr > 1) warning("Using last row of input.",
                          call. = FALSE, immediate. = TRUE)
      #
      register  <- input[nr, "query-register"]
      queryterm <- input[nr, "query-term"]
      #
    }
    #
    # - if input is not a complete url, but register is specified
    if (class(input) == "character" &&
        is.atomic(input) &&
        length(input) == 1 &&
        register != "") {
      #
      queryterm <- input
      #
    }
    #
    if (exists("queryterm") &&
        queryterm != "" &&
        register != "") {
      #
      message("Opening browser for search: \n\n", queryterm,
              "\n\nin register: ", register)
      #
      # sanity correction for naked terms
      if (register == "EUCTR") {
        queryterm <-
          sub("(^|&|[&]?\\w+=\\w+&)([ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
              "\\1query=\\2\\3",
              queryterm)
      }
      if (register == "CTGOV") {
        queryterm <-
          sub("(^|&|[&]?\\w+=\\w+&)(\\w+|[NCT0-9-]+)($|&\\w+=\\w+)",
              "\\1term=\\2\\3",
              queryterm)
      }
      #
      # protect against os where this does not work
      try({
        utils::browseURL(url = paste0(
          #
          switch(as.character(register),
                 "CTGOV" = ifelse(
                   grepl("^xprt=", queryterm),
                   "https://clinicaltrials.gov/ct2/results/refine?show_xprt=Y&",
                   "https://clinicaltrials.gov/ct2/results?"),
                 "EUCTR" =
                   "https://www.clinicaltrialsregister.eu/ctr-search/search?"),
          queryterm),
          encodeIfNeeded = TRUE, ...)
      },
      silent = TRUE)
    }
  }
  #
  invisible(TRUE)
}
# end ctrOpenSearchPagesInBrowser


#' Extract query parameters and register name from input or from
#' clipboard into which the URL of a register search was copied
#'
#' @param url URL such as from the browser address bar.
#' If not specified, clipboard contents will be checked for
#' a suitable URL. Can also contain a query term such as from
#' \link{dbQueryHistory}()["query-term"]
#'
#' @param register Optional name of register (i.e., "EUCTR" or
#' "CTGOV") in case url is a query term
#'
#' @return A string of query parameters that can be used to retrieve data
#' from the register.
#'
#' @export
#'
#' @return A data frame with column names query term and register name
#' that can directly be used in \link{ctrLoadQueryIntoDb} and in
#' \link{ctrOpenSearchPagesInBrowser}
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # user now copies into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' ctrLoadQueryIntoDb(
#'   ctrGetQueryUrl(),
#'   con = db
#' )
#' }
#'
#' @importFrom clipr read_clip
#'
ctrGetQueryUrl <- function(
  url = "",
  register = "") {
  #
  # check parameters expectations
  if (!is.atomic(url) || !is.atomic(register) ||
      is.null(url) || is.null(register) ||
      is.na(url) || is.na(register) ||
      !inherits(url, "character") || !inherits(register, "character") ||
      length(url) != 1L || length(register) != 1L) {
    stop("ctrGetQueryUrl(): 'url' and / or 'register' ",
         "is not a single character string, url: '",
         deparse(url), "', register: '", deparse(register), "'",
         call. = FALSE)
  }
  #
  # if no parameter specified,
  # check clipboard contents
  if (nchar(url) == 0L) {
    url <- suppressWarnings(
      clipr::read_clip(
        allow_non_interactive = TRUE)
    )
    if (is.null(url) || (length(url) != 1L) || (nchar(url) == 0L)) {
      stop("ctrGetQueryUrl(): no clinical trial register ",
           "search URL found in parameter 'url' or in clipboard.",
           call. = FALSE)
    }
    message("* Using clipboard content as register query URL: ", url)
  }
  #
  # EUCTR
  if (grepl("https://www.clinicaltrialsregister.eu/ctr-search/", url) ||
      (!grepl("https://", url) && register == "EUCTR")) {
    #
    queryterm <-
      sub("https://www.clinicaltrialsregister.eu/ctr-search/search[?](.*)",
          "\\1", url)
    #
    queryterm <-
      sub("https://www.clinicaltrialsregister.eu/ctr-search/trial/([-0-9]+)/.*",
          "\\1", queryterm)
    #
    # remove any intrapage anchor, e.g. #tableTop
    queryterm <- sub("#.+$", "", queryterm)
    #
    # sanity correction for naked terms
    # test cases:
    # queryterm = c(
    #   "cancer&age=adult",                      # add query=
    #   "cancer",                                # add query=
    #   "cancer+AND breast&age=adult&phase=0",   # add query=
    #   "cancer&age=adult&phase=0",              # add query=
    #   "cancer&age=adult&phase=1&results=true", # add query=
    #   "&age=adult&phase=1&abc=xyz&cancer&results=true", # insert query=
    #   "age=adult&cancer",                      # insert query=
    #   "2010-024264-18",                        # add query=
    #   "NCT1234567890",                         # add query=
    #   "teratoid&country=dk",                   # add query=
    #   "term=cancer&age=adult",                 # keep
    #   "age=adult&term=cancer")                 # keep
    queryterm <- sub(
      "(^|&|[&]?\\w+=\\w+&)([ a-zA-Z0-9+-]+)($|&\\w+=\\w+)",
      "\\1query=\\2\\3",
      queryterm)
    #
    # check if url was for results of single trial
    if (grepl(".*/results$", url)) {
      queryterm <- paste0(queryterm, "&resultsstatus=trials-with-results")
    }
    #
    message("* Found search query from EUCTR: ", queryterm)
    #
    df <- data.frame(cbind(queryterm, "EUCTR"), stringsAsFactors = FALSE)
    names(df) <- c("query-term", "query-register")
    #
    return(df)
  }
  #
  # CTGOV, e.g.
  # https://clinicaltrials.gov/ct2/results?term=2010-024264-18&Search=Search
  if (grepl("https://clinicaltrials.gov/ct2/results", url) ||
      (!grepl("https://", url) && register == "CTGOV")) {
    #
    queryterm <-
      sub("https://clinicaltrials.gov/ct2/results[?](.*)",
          "\\1", url)
    #
    queryterm <-
      sub("(.*)&Search[a-zA-Z]*=(Search|Find)[a-zA-Z+]*",
          "\\1", queryterm)
    #
    queryterm <- gsub("[a-z_0-9]+=&", "", queryterm)
    queryterm <- sub("&[a-z_0-9]+=$", "", queryterm)
    #
    message("* Found search query from CTGOV: ", queryterm)
    #
    df <- data.frame(cbind(queryterm, "CTGOV"),
                     stringsAsFactors = FALSE)
    names(df) <- c("query-term", "query-register")
    #
    return(df)
  }
  #
  warning("ctrGetQueryUrl(): no clinical trial register ",
          "search URL found in parameter 'url' or in clipboard.",
          call. = FALSE, immediate. = TRUE)
  #
  return(invisible(NULL))
}
# end ctrGetQueryUrl


#' Import from clipboard the URL of a search in one of the registers
#'
#' @inheritParams ctrGetQueryUrl
#'
#' @return A string of query parameters that can be used to retrieve data
#' from the register.
#'
#' @export
#'
#' @return A data frame with column names query term and register name
#' that can directly be used in \link{ctrLoadQueryIntoDb} and in
#' \link{ctrOpenSearchPagesInBrowser}
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # user now copies into the clipboard the URL from
#' # the address bar of the browser that shows results
#' # from a query in one of the trial registers
#' ctrLoadQueryIntoDb(
#'   ctrGetQueryUrlFromBrowser(),
#'   con = db
#' )
#' }
#'
#' @importFrom clipr read_clip
#'
ctrGetQueryUrlFromBrowser <- function(
  url = "",
  register = "") {

  # deprecate
  .Deprecated(new = "ctrGetQueryUrl")

  # defer call
  ctrGetQueryUrl(url = url, register = register)

}
# end ctrGetQueryUrlFromBrowser


#' Find synonyms of an active substance
#'
#' An active substance can be identified by a recommended international
#' nonproprietary name, a trade or product name, or a company code(s).
#'
#' At this time, this function uses the register ClinicalTrials.Gov to
#' detect which substances were also searched for.
#'
#' @param activesubstance An active substance, in an atomic character vector
#'
#' @return A character vector of the active substance (input parameter) and
#'  synonyms, if any were found
#'
#' @importFrom xml2 read_html
#' @importFrom rvest html_node html_table
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' ctrFindActiveSubstanceSynonyms(
#'   activesubstance = "imatinib"
#' )
#' }
#'
ctrFindActiveSubstanceSynonyms <- function(activesubstance = "") {

  # check parameters
  if ((length(activesubstance) != 1) ||
      !is.character(activesubstance) ||
      (nchar(activesubstance) == 0)) {
    stop("ctrFindActiveSubstanceSynonyms(): ",
         "activesubstance should be a single string.",
         call. = FALSE)
  }

  # initialise output variable
  as <- activesubstance

  # check and set proxy if needed to access internet
  setProxy()

  # getting synonyms
  ctgovfirstpageurl <-
    paste0("https://clinicaltrials.gov/ct2/results/details?term=",
           activesubstance)
  tmp <- xml2::read_html(x = utils::URLencode(ctgovfirstpageurl))

  # extract from table "Terms and Synonyms Searched:"
  tmp <- rvest::html_node(
    tmp, xpath =
      '//*[@id="searchdetail"]//table[1]')
  tmp <- rvest::html_table(tmp, fill = TRUE)
  asx <- tmp[[1]]
  asx <- asx[!grepl(
    paste0("(more|synonyms|terms|", as, "|",
           paste0(unlist(strsplit(as, " "), use.names = FALSE),
                  collapse = "|"), ")"), asx,
    ignore.case = TRUE)]

  # prepare and return output
  as <- c(as, asx)
  as <- unique(as)
  return(as)
}
# end ctrFindActiveSubstanceSynonyms


#' Show the history of queries that were loaded into a database
#'
#' @inheritParams ctrDb
#'
#' @return A data frame with columns: query-timestamp, query-register,
#'  query-records (note: this is the number of records loaded when last
#'  executing \link{ctrLoadQueryIntoDb}, not the total record number) and
#'  query-term, and with one row for each \link{ctrLoadQueryIntoDb}
#'  loading trial records in this collection.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbQueryHistory(
#'   con = db
#' )
#' }
#'
dbQueryHistory <- function(con,
                           verbose = FALSE) {

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # debug
  if (verbose) message("Running dbQueryHistory ...")

  tmp <- nodbi::docdb_query(
    src = con,
    key = con$collection,
    query = '{"_id": {"$eq": "meta-info"}}',
    fields = '{"queries": 1}')

  # access array of meta-info
  tmp <- tmp[["queries"]][[1]]

  # Check if meeting expectations
  if (is.null(tmp) ||
      nrow(tmp) == 0L) {
    #
    message("No history found in expected format.")
    #
    # return (class data.frame is expected)
    return(invisible(data.frame(NULL)))
    #
  }
  # else {

  # Inform user
  if (verbose) {

    message("Number of queries in history of \"",
            con$collection, "\": ", nrow(tmp))
    # }

    # total number of records in collection to inform user
    countall <- nodbi::docdb_query(
      src = con,
      key = con$collection,
      query =  '{"_id": {"$ne": "meta-info"}}',
      fields = '{"_id": 1}')[["_id"]]

    # if (verbose)
    message("Number of records in collection \"",
            con$collection, "\": ", length(countall))
  }

  # return
  return(tmp)

}
# end ctrQueryHistoryInDb


#' Find names of fields in the database collection
#'
#' Given part of the name of a field of interest to the user, this
#' function returns the full field names as found in the database.
#'
#' For fields in EUCTR (protocol- and results-related information),
#' see also the register's documentation at
#' \url{https://eudract.ema.europa.eu/}.
#'
#' For fields in CTGOV (protocol-related information), see also
#' the register's definitions at
#' \url{https://prsinfo.clinicaltrials.gov/definitions.html}.
#'
#' Note: Generating a list of fields with this function may take
#' some time, and may involve running a mapreduce function if using
#' a MongoDB server. If the user is not not authorized to run
#' such a function, random documents are sampled to generate a
#' list of fields.
#'
#' @param namepart A plain string (can include a regular expression,
#' including Perl-style) to be searched for among all field names
#' (keys) in the database.
#'
#' @param verbose If \code{TRUE}, prints additional information
#' (default \code{FALSE}).
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return Vector of names of found field(s) in alphabetical
#' order (that is, not by register or field frequency)
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbFindFields(
#'   nampepart = "date",
#'   con = db
#' )
#' }
#'
dbFindFields <- function(namepart = "",
                         con,
                         verbose = FALSE) {

  ## sanity checks
  if (!is.atomic(namepart)) stop("'namepart' should be atomic.", call. = FALSE)
  if (length(namepart) > 1) stop("'namepart' should have one element.", call. = FALSE)
  if (namepart == "")       stop("Empty 'namepart' parameter.", call. = FALSE)

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  ## check if cache for list of keys in collection exists,
  # otherwise create new environment as session cache
  if (!exists(".dbffenv")) {
    .dbffenv <- new.env(parent = emptyenv())
  }

  ## check if cache environment has entry for the database
  if (exists(x = paste0(con$db, "/", con$collection),
             envir = .dbffenv)) {

    # if true, get keys list from cache
    keyslist <- get(x = paste0(con$db, "/", con$collection),
                    envir = .dbffenv)

    # informing user
    message("Using cache of fields.")

  } else {

    ## get keys list from database
    ## warn if no method yet for backend
    if (!any(c("src_mongo",
               "src_sqlite") %in%
             class(con))) {
      stop("Function dbFindFields() cannot yet handle nodbi ",
           "database backend ", class(con)[1], call. = FALSE)
      ## TODO extended function to additional backends
    }

    # inform user
    message("Finding fields in database (may take some time)")

    ## using storage backend- specific methods, since
    ## no canonical way was found yet to retrieve
    ## field / key names

    ## - method for mongodb
    if (inherits(con, "src_mongo")) {

      # try mapreduce to get all keys
      keyslist <- try({
        con$con$mapreduce(
          map = "function() {
      obj = this;
      return searchInObj(obj, '');
      function searchInObj(obj, pth) {
         for(var key in obj) {
            if(typeof obj[key] == 'object' && obj[key] !== null) {
               if(pth != '') {pth = pth + '.'}
                  searchInObj(obj[key], pth + key);
            }else{
               key = pth + '.' + key;
               key = key.replace(/[.][0-9]+[.]/g, '.');
               key = key.replace(/[.][0-9]+$/, '');
               key = key.replace(/[.][.]+/g, '.');
               key = key.replace(/^[.]/, '');
               emit(key, 1);
      }}}}",
      reduce = "function(id, counts) {return Array.sum(counts)}"
      # extract and keep only "_id" = first column, with keys
        )[["_id"]]},
      silent = TRUE)

      # if mapreduce does not work or is not permitted, revert to guessing
      if (inherits(keyslist, "try-error")) {

        warning("Mongo server returned: ", as.character(keyslist),
                "Using alternative method (extracting keys from ",
                "sample documents, may be incomplete).",
                call. = FALSE, immediate. = TRUE)

        # get 2 random documents, one for each register EUCTR and CTGOV,
        # if in collection, and retrieve keys from documents
        keyslist <- c(
          "", # avoid empty vector
          names(con$con$find(
            query = '{"_id": { "$regex": "^NCT[0-9]{8}", "$options": ""} }',
            limit = 1L)),
          names(con$con$find(
            query = '{"_id": { "$regex": "^[0-9]{4}-[0-9]{6}", "$options": ""} }',
            limit = 1L)))

      } # end if error with mapreduce
    } # end if src_mongo

    ## - method for sqlite
    if (inherits(con, "src_sqlite")) {

      # uses special function parameter for
      # src_sqlite query method: listfields
      keyslist <- c("", # avoid empty vector
                    nodbi::docdb_query(
                      src = con,
                      key = con$collection,
                      query = "",
                      listfields = TRUE))

    }

    ## store keyslist to environment (cache)
    if (length(keyslist) > 1) {
      assign(x = paste0(con$db, "/", con$collection),
             value = keyslist,
             envir = .dbffenv)
      message("Field names cached for this session.")
    }

  } # end get cached list or generate new list

  ## inform user of unexpected situation
  if ((length(keyslist) == 0) || all(keyslist == "")) {
    warning("No keys could be extracted, please check database ",
            "and contents: ", con$db, "/", con$collection, call. = FALSE)
  }

  ## now do the actual search and find for key name parts
  fields <- keyslist[grepl(pattern = namepart, x = keyslist,
                           ignore.case = TRUE, perl = TRUE)]

  # clean
  fields <- fields[fields != ""]
  if (!length(fields)) fields <- ""

  # return the match(es)
  return(sort(fields))

} # end dbFindFields


#' Deduplicate records to provide unique clinical trial identifiers
#'
#' If records for a clinical trial are found from more than one register, the
#' record from EUCTR is returned. The function currently relies on CTGOV
#' recording other identifiers such as the EudraCT number in the field "Other
#' IDs".
#'
#' @param preferregister The abbreviation of the preferred register, in case
#' a trial is in more than one register (string, either "EUCTR" or "CTGOV").
#' If set to an empty string (""), keeps the keys for the same trial in both
#' registers in the returned vector.
#'
#' @inheritParams dfFindUniqueEuctrRecord
#'
#' @param verbose If set to \code{TRUE}, prints out information about numbers
#' of records found at subsequent steps when searching for duplicates
#'
#' @importFrom nodbi docdb_query
#'
#' @inheritParams ctrDb
#'
#' @return A vector with strings of keys ("_id" in the database) that
#'   represent non-duplicate trials.
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' dbFindIdsUniqueTrials(
#'   con = db
#' )
#' }
#'
dbFindIdsUniqueTrials <- function(
  preferregister = "EUCTR",
  prefermemberstate = "GB",
  include3rdcountrytrials = TRUE,
  con,
  verbose = TRUE) {

  # parameter checks
  if (!grepl(preferregister, "CTGOVEUCTR")) {
    stop("Register not known: ", preferregister, call. = FALSE)
  }

  # objective: create a list of mongo database record identifiers (_id)
  # that represent unique records of clinical trials, based on user's
  # preferences for selecting the preferred from any multiple records

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # get identifiers
  listofIds <- try(suppressMessages(suppressWarnings(
    dbGetFieldsIntoDf(fields = c(
      # euctr
      "a2_eudract_number",
      "a41_sponsors_protocol_code_number",
      "a51_isrctn_international_standard_randomised_controlled_trial_number",
      "a52_us_nct_clinicaltrialsgov_registry_number",
      # not yet used: "a53_who_universal_trial_reference_number_utrn",
      #
      # ctgov
      "id_info"
    ),
    con = con,
    verbose = FALSE,
    stopifnodata = FALSE)
  )),
  silent = TRUE
  )

  # error check
  if (inherits(listofIds, "try-error") ||
      !length(listofIds) || !nrow(listofIds)) {
    stop("No records found, check collection '", con$collection, "'",
         call. = FALSE)
  }

  # keep trial records
  listofIds <- listofIds[
    grepl("NCT[0-9]{8}|[0-9]{4}-[0-9]{6}-[0-9]{2}", listofIds[["_id"]]),
  ]

  # inform user
  message("Searching for duplicate trials... ")
  message("* Total of ", nrow(listofIds), " records in collection.")

  # find unique, preferred country version of euctr
  listofIds <- dfFindUniqueEuctrRecord(
    df = listofIds,
    prefermemberstate = prefermemberstate,
    include3rdcountrytrials = include3rdcountrytrials)

  # keep only euctr
  listofEUCTRids <- listofIds[
    grepl("^[0-9]{4}-[0-9]{6}-[0-9]{2}$", listofIds[["a2_eudract_number"]]),
    c(1, seq_len(ncol(listofIds))[grepl("^a[0-9]+_", names(listofIds))])
  ]

  # keep only ctgov
  listofCTGOVids <- listofIds[
    grepl("^NCT[0-9]{8}$", listofIds[["_id"]]),
    c(1, seq_len(ncol(listofIds))[grepl("^id_info", names(listofIds))][1]),
    drop = TRUE
  ]

  # find records (_id's) that are in both in euctr and ctgov
  if (!is.null(listofEUCTRids) & !is.null(listofCTGOVids)) {
    #
    # 6. select records from preferred register
    if (preferregister == "EUCTR") {
      #
      # strategy: retain all listofEUCTRids;
      # identify and remove dupes from listofCTGOVids
      #
      # b2 - ctgov in euctr (_id corresponds to index 1)
      dupesB2 <- listofCTGOVids[["_id"]] %in%
        listofEUCTRids[["a52_us_nct_clinicaltrialsgov_registry_number"]]
      #
      if (verbose) message(" - ", sum(dupesB2),
                           " CTGOV _id (nct) in EUCTR a52_us_nct_...")
      #
      # a2 - ctgov in euctr a2_...
      dupesA2 <- vapply(
        listofCTGOVids[["id_info"]], # e.g. "EUDRACT-2004-000242-20"
        function(x) any(sub(".*([0-9]{4}-[0-9]{6}-[0-9]{2}).*", "\\1", x) %in%
                          listofEUCTRids[["a2_eudract_number"]]), logical(1L))
      #
      if (verbose) {
        message(
          " - ", sum(dupesA2),
          " CTGOV secondary_id / nct_alias / org_study_id in EUCTR a2_eudract_number")
      }
      #
      # c.2 - ctgov in euctr a52_... (id_info corresponds to index 2)
      dupesC2 <- vapply(
        listofCTGOVids[["id_info"]],
        function(x) any(
          x %in% na.omit(
            listofEUCTRids[["a52_us_nct_clinicaltrialsgov_registry_number"]])), logical(1L))
      #
      if (verbose) {
        message(
          " - ", sum(dupesC2),
          " CTGOV secondary_id / nct_alias / org_study_id in",
          " EUCTR a52_us_nct_...")
      }
      #
      # d.2 - ctgov in euctr a51_... (id_info corresponds to index 2)
      dupesD2 <- vapply(
        listofCTGOVids[["id_info"]],
        function(x) any(
          x %in% na.omit(
            listofEUCTRids[[
              "a51_isrctn_international_standard_randomised_controlled_trial_number"]])), logical(1L))
      #
      if (verbose) message(" - ", sum(dupesD2),
                           " CTGOV secondary_id / nct_alias / org_study_id in",
                           " EUCTR a51_isrctn_...")
      #
      # e.2 - ctgov in euctr a41_... (id_info corresponds to index 2)
      dupesE2 <- vapply(
        listofCTGOVids[["id_info"]],
        function(x) any(
          x %in% na.omit(
            listofEUCTRids[["a41_sponsors_protocol_code_number"]])), logical(1L))
      #
      if (verbose) {
        message(" - ", sum(dupesE2),
                " CTGOV secondary_id / nct_alias / org_study_id in",
                " EUCTR a41_sponsors_protocol_...")
      }
      #
      # finalise results set
      listofEUCTRids <- listofEUCTRids[["_id"]]
      listofCTGOVids <- listofCTGOVids[[
        "_id"]] [!dupesA2 & !dupesB2 & !dupesC2 & !dupesD2 & !dupesE2]
      #
      message(
        "Concatenating ",
        length(listofEUCTRids), " records from EUCTR and ",
        length(listofCTGOVids), " from CTGOV:")
      #
      retids <- c(listofEUCTRids, listofCTGOVids)
      #
    }
    #
    if (preferregister == "CTGOV") {
      #
      # a.1 - euctr in ctgov
      dupesA1 <- listofEUCTRids[["a2_eudract_number"]] %in% sub(
        ".*([0-9]{4}-[0-9]{6}-[0-9]{2}).*", # e.g. "EUDRACT-2004-000242-20"
        "\\1", unlist(listofCTGOVids[["id_info"]], use.names = FALSE))
      #
      if (verbose) {
        message(" - ", sum(dupesA1),
                " EUCTR _id in CTGOV secondary_id / nct_alias / org_study_id")
      }
      #
      # b.1 - euctr in ctgov
      dupesB1 <- !is.na(
        listofEUCTRids[["a52_us_nct_clinicaltrialsgov_registry_number"]]) &
        listofEUCTRids[["a52_us_nct_clinicaltrialsgov_registry_number"]] %in%
        listofCTGOVids[["_id"]]
      #
      if (verbose) {
        message(" - ", sum(dupesB1),
                " EUCTR a52_us_nct_... in CTGOV _id (nct)")
      }
      #
      # c.1 - euctr in ctgov
      dupesC1 <- !is.na(
        listofEUCTRids[["a52_us_nct_clinicaltrialsgov_registry_number"]]) &
        listofEUCTRids[["a52_us_nct_clinicaltrialsgov_registry_number"]] %in%
        unlist(listofCTGOVids[["id_info"]], use.names = FALSE)
      #
      if (verbose) {
        message(
          " - ", sum(dupesC1),
          " EUCTR a52_us_nct_... in",
          " CTGOV secondary_id / nct_alias / org_study_id")
      }
      #
      # d.1 - euctr in ctgov
      dupesD1 <- !is.na(
        listofEUCTRids[[
          "a51_isrctn_international_standard_randomised_controlled_trial_number"]]) &
        listofEUCTRids[[
          "a51_isrctn_international_standard_randomised_controlled_trial_number"]] %in%
        unlist(listofCTGOVids[["id_info"]], use.names = FALSE)
      #
      if (verbose) {
        message(
          " - ", sum(dupesD1),
          " EUCTR a51_isrctn_...",
          " in CTGOV secondary_id / nct_alias / org_study_id")
      }
      #
      # e.1 - euctr in ctgov
      dupesE1 <- !is.na(
        listofEUCTRids[["a41_sponsors_protocol_code_number"]]) &
        listofEUCTRids[["a41_sponsors_protocol_code_number"]] %in%
        unlist(listofCTGOVids[["id_info"]], use.names = FALSE)
      #
      if (verbose) {
        message(
          " - ", sum(dupesD1),
          " EUCTR a41_sponsors_protocol_...",
          " in CTGOV secondary_id / nct_alias / org_study_id")
      }
      #
      # f.1 - ctgov other id in nct
      dupesF1 <- vapply(
        listofCTGOVids[["id_info"]],
        function(x) !is.null(x[["nct_alias"]]) &&
          any(unlist(x[["nct_alias"]], use.names = FALSE) %in%
                listofCTGOVids[["_id"]]), logical(1L))
      #
      if (verbose) {
        message(
          " - ", sum(dupesF1),
          " CTGOV nct_alias in CTGOV _id (nct)")
      }
      #
      # finalise results set
      listofCTGOVids <- listofCTGOVids[["_id"]] [!dupesF1]
      listofEUCTRids <- listofEUCTRids[[
        "_id"]] [!dupesA1 & !dupesB1 & !dupesC1 & !dupesD1  & !dupesE1]
      #
      message(
        "Concatenating ",
        length(listofCTGOVids), " records from CTGOV and ",
        length(listofEUCTRids), " from EUCTR:")
      #
      retids <- c(listofCTGOVids, listofEUCTRids)
      #
    }
  } else {
    #
    # fallback
    retids <- c(listofEUCTRids[["_id"]], listofCTGOVids[["_id"]])
    #
  }

  # copy attributes
  attribsids <- attributes(listofIds)
  attributes(retids) <- attribsids[grepl("^ctrdata-", names(attribsids))]

  # avoid returning list() if none found
  if (length(retids) == 0) {
    retids <- character()
  }

  # inform user
  message(
    "= Returning keys (_id) of ", length(retids),
    " out of total ", nrow(listofIds),
    " records in collection \"", con$collection, "\".")

  # return
  return(retids)

}
# end dbFindIdsUniqueTrials


#' Create data frame by extracting specified fields from database collection
#'
#' With this convenience function, fields in the database are retrieved
#' into an R data frame. Note that fields within the record of a trial
#' can be hierarchical and structured, that is, nested.
#'
#' With both src_sqlite and src_mongo, the function returns a list of data
#' for a field that includes nested content; use function
#' \link{dfTrials2Long} followed by \link{dfName2Value} to
#' extract desired nested variables.
#'
#' For more sophisticated data retrieval from the database, see vignette
#' examples and other packages to query mongodb such as mongolite.
#'
#' @param fields Vector of one or more strings, with names of the sought fields.
#' See function \link{dbFindFields} for how to find names of fields.
#' Regular expressions are possible. "item.subitem" notation is supported.
#'
#' @param stopifnodata Stops with an error (\code{TRUE}, default) or with
#' a warning (\code{FALSE}) if the sought field is empty in all,
#' or not available in any of the records in the database collection.
#'
#' @param verbose Printing additional information if set to \code{TRUE};
#' default is \code{FALSE}.
#'
#' @inheritParams ctrDb
#'
#' @return A data frame with columns corresponding to the sought fields.
#' Note: a column for the record _id will always be included.
#' Each column can be either a simple data type (numeric, character, date)
#' or a list (see example below): For complicated lists, use function
#' \link{dfTrials2Long} followed by function \link{dfName2Value} to
#' extract values for nested variables.
#' The maximum number of rows of the returned data frame is equal to,
#' or less than the number of records of trials in the database.
#'
#' @importFrom nodbi docdb_query
#' @importFrom stats na.omit
#' @importFrom DBI dbGetQuery
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#'
#' # access fields that are nested within another field
#' # and can have multiple values with the other field
#' dbGetFieldsIntoDf(
#'   "b1_sponsor.b31_and_b32_status_of_the_sponsor",
#'   con = db
#' )[1,]
#' #                 _id b1_sponsor.b31_and_b32_status_of_the_sponsor
#' # 1 2004-000015-25-GB                  Non-commercial / Commercial
#'
#' # access fields that include a list of values
#' # which are printed as comma separated values
#' dbGetFieldsIntoDf(
#'   "keyword",
#'   con = db
#' )[1,]
#'
#' #           _id                                 keyword
#' # 1 NCT00129259 T1D, type 1 diabetes, juvenile diabetes
#'
#' str(.Last.value)
#' # 'data.frame':	1 obs. of  2 variables:
#' # $ _id    : chr "NCT00129259"
#' # $ keyword:List of 1
#' # ..$ : chr  "T1D" "type 1 diabetes" "juvenile diabetes"
#'
#' }
#'
dbGetFieldsIntoDf <- function(fields = "",
                              con, verbose = FALSE,
                              stopifnodata = TRUE) {

  # check parameters
  if (!is.vector(fields) |
      class(fields) != "character") {
    stop("Input should be a vector of strings of field names.", call. = FALSE)
  }

  # remove NA, NULL if included in fields
  fields <- fields[!is.null(fields) & !is.na(fields)]

  # remove _id if included in fields
  fields <- fields["_id" != fields]

  # check if valid fields
  if (any(fields == "") | (length(fields) == 0)) {
    stop("'fields' contains empty elements; ",
         "please provide a vector of strings of field names. ",
         "Function dbFindFields() can be used to find field names. ",
         call. = FALSE)
  }

  ## check database connection
  if (is.null(con$ctrDb)) con <- ctrDb(con = con)

  # helper function for managing lists
  listDepth <- function(x) {
    if (is.null(x)) return(0L)
    if (is.atomic(x)) return(1L)
    if (is.list(x)) return(1L + max(vapply(x, listDepth, integer(1L)), 0L))
  }

  # helper function to transform values coming from
  # a database query that are still json strings
  json2list <- function(df) {

    # prepare content
    tmpi <- df[[2]]
    names(tmpi) <- df[[1]]
    dfn <- names(df)

    if (all(vapply(tmpi, is.character, logical(1L))) &&
        any(vapply(tmpi, jsonlite::validate, logical(1L)))) {

      # work on row elements
      outList <- lapply(
        tmpi,
        function(cell) {

          # get content
          cell <- unlist(cell, use.names = FALSE)

          # check if string could be json
          if (grepl("[\\[{]", cell)) {

            # add [ ] to produce json object in advance of conversion
            if (!grepl("^\\[.+\\]$", cell)) cell <- paste0("[", cell, "]")

            # convert
            out <- try(jsonlite::fromJSON(cell, flatten = FALSE), silent = TRUE)

            # output
            if (!inherits(out, "try-error")) cell <- list(out)

          } # if json string

          # value
          cell

        }) # lapply

      # bind to resemble input df
      df <- do.call(rbind, outList)
      df <- data.frame(
        row.names(df),
        df,
        row.names = NULL,
        stringsAsFactors = FALSE)
      names(df) <- dfn

    } # if all vapply

    return(df)

  } # json2list

  # initialise output
  nFields <- length(fields)

  # iterate over fields so that we can use a custom function to merge results,
  # given that mongodb clients have different approaches and complex returns
  result <- lapply(
    seq_len(nFields),
    function(i) {
      #
      item <- fields[i]
      message(". ", appendLF = FALSE)
      #
      query <- paste0('{"_id": {"$ne": "meta-info"}}')
      if (verbose) message("DEBUG: field: ", item)
      #
      tmpItem <- try({

        ## handle special case: src_* is sqlite
        # json_extract() cannot be used to retrieve all
        # items since a json path would have to include
        # an array indicator such as [1] or [#-1], see
        # https://www.sqlite.org/json1.html#jex ans
        # https://www.sqlite.org/json1.html#path_arguments
        if (inherits(con$con, "SQLiteConnection")) {

          # mangle item names into SQL e.g.,
          # "location[4].facility[#-2].name" # two arrayIndex items
          # "location.facil[a-z0-0]+.*thing" # user regexp
          # "location.facil.*"               # user regexp
          # - remove arrayIndex
          #   NOTE potential side effect: disruption of user's regexp
          item <- gsub("\\[[-#0-9]+\\][.]", ".", item)
          if (verbose) message("DEBUG: 'field' mangled into: ", item)
          # - protect "." between item and subitem using lookahead for overlapping groups
          regexpItem <- gsub("([a-zA-Z]+)[.](?=[a-zA-Z]+)", "\\1@@@\\2", item, perl = TRUE)
          # - add in regexps to match any arrayIndex in fullkey
          regexpItem <- paste0("^[$][.]", gsub("@@@", "[-#\\\\[\\\\]0-9]*[.]", regexpItem), "$")
          # - top element in item
          topElement <- sub("^(.+?)[.].*$", "\\1", item)
          # - construct statement using json_tree(json, path) as per
          #   https://www.sqlite.org/json1.html#jtree
          # - include cast() to string to avoid warnings when types
          #   of columns are changed after first records are retrieved
          # - since mongodb returns NULL for documents that do not have the
          #   sought item but sqlite does not return such documents at all, the
          #   statement is more complex to also include a row for such
          #   non-existing items
          statement <- paste0(
            "SELECT
          CAST(allRows._id AS text) AS _id,
          CAST(jsonRows.value AS text) AS '", item, "'
          FROM (", con$collection, ") AS allRows

          LEFT JOIN
               (SELECT
                  CAST(_id AS text) AS id,
                  CAST(value AS text) AS value
               FROM ", con$collection, ",
                    json_tree(", con$collection, ".json, '$.", topElement, "')
               WHERE fullkey REGEXP '", regexpItem, "') AS jsonRows

          ON jsonRows.id = allRows._id
          WHERE allRows._id <> 'meta-info'
          ;")
          if (verbose) message("DEBUG: src_sqlite, statement:\n", statement)

          # execute query, bypassing nodbi since my implementation
          # of nodbi::doc_query.sqlite() does not use json_tree()
          dfi <- DBI::dbGetQuery(
            conn = con$con,
            statement = statement,
            n = -1L)

          # dfi[, 2] could still be json strings
          dfi <- json2list(dfi)

          # dfi can be a long table, number of rows corresponding to
          # number of subitems found in the collection (possibly more
          # than one per record in the collection): aggregate by _id
          tmpById <- tapply(
            X = dfi[, 2],
            INDEX = dfi[, 1],
            function(i) {
              if (all(is.na(i))) {
                # keep NULL elements in output
                NULL
              } else {
                if (is.atomic(
                  unlist(i, recursive = FALSE, use.names = FALSE))) {
                  # e.g. for location_countries.country
                  unname(i)
                } else {
                  data.frame(
                    unname(i),
                    check.names = FALSE,
                    row.names = NULL,
                    stringsAsFactors = FALSE)
                }}},
            simplify = FALSE)

          # now match format for further processing
          dfi <- data.frame(
            "_id" = names(tmpById), tmpById,
            row.names = NULL,
            check.names = FALSE,
            stringsAsFactors = FALSE
          )

        } else {

          # src_mongo

          # execute query
          dfi <- nodbi::docdb_query(
            src = con,
            key = con$collection,
            query = query,
            fields = paste0('{"_id": 1, "', item, '": 1}'))

          # dfi[, 2] could still be json strings
          if (ncol(dfi) == 2L) dfi <- json2list(dfi)

          # unboxing is not done in docdb_query
          # (for loop could not be replaced by
          # *apply and assignment to dfi[[2]])
          for (i in seq_len(nrow(dfi))) {
            if (!is.null(dfi[i, 2]) &&
                is.list(dfi[i, 2]) &&
                !identical(dfi[i, 2], list(NULL)) &&
                !is.data.frame(dfi[i, 2][[1]])) {

              dfi[i, 2][[1]] <- list(jsonlite::fromJSON(
                jsonlite::toJSON(dfi[i, 2], auto_unbox = TRUE)))

            }}

        } # if src_sqlite or src_mango

        ## mangle further

        # ensure intended column order
        tmp <- names(dfi)
        if (tmp[1L] != "_id") {
          dfi <- dfi[, c("_id", tmp[tmp != "_id"])]
        }

        ## simplify if robust

        # - if each [,2] is a list or data frame with one level
        #   e.g., mongodb: enrollment; study_design_info.allocation
        if ((ncol(dfi) == 2) &&
            (all(vapply(dfi[, 2],
                        function(x)
                          listDepth(x) <= 1L, logical(1L)))
            )) {
          # concatenate (has to remain as sapply
          # because of different content types)
          dfi[, 2] <- sapply(
            dfi[, 2],
            function(x)
              paste0(na.omit(unlist(x, use.names = FALSE)),
                     collapse = " / "))
          # inform user
          message("\r* Collapsed with '/' [1]: '", item, "'")
          # remove any extraneous columns
          dfi <- dfi[, 1:2]
        }
        #
        # - if dfi[, 2:ncol(dfi)] is from the same field
        #   e.g., mongodb: study_design_info
        if ((length(ncol(dfi[, 2])) && ncol(dfi[, 2]) > 1L) ||
            ((ncol(dfi) > 2L) &&
             all(grepl(paste0(item, "[.].+$"),
                       names(dfi)[-1])))) {

          # store names
          tmpnames <- gsub(".+?[.](.+)$", "\\1", names(dfi)[-1])
          names(dfi)[-1] <- tmpnames

          # concatenate to list
          tmpById <- split(dfi[, 2:ncol(dfi)],
                           seq_len(nrow(dfi)))

          # remove extraneous columns
          dfi <- dfi[, 1:2]

          # create items in column from list
          dfi[[2]] <- tmpById

          # inform user
          message("\r* Converted to list [2]: '", item, "'")

        }
        #
        # - if each [,2] is a list with a single and the same element
        if (all(vapply(
          dfi[, 2], function(i) is.null(i) | is.list(i), logical(1L))) &&
          length(unique(unlist(sapply(
            dfi[, 2], function(i) unique(gsub("[0-9]+$", "", names(unlist(i))))
              )))) <= 1L) {
          #
          dfi[, 2] <- vapply(
            dfi[, 2], function(i) paste0(
              na.omit(unlist(i, use.names = FALSE)),
              collapse = " / "), character(1L))
          # inform user
          message("\r* Simplified or collapsed with '/' [3]: '", item, "'")
        }

      },
      silent = TRUE) # tmpItem try

      # inform user
      if (inherits(tmpItem, "try-error") ||
          !nrow(dfi)) {

        # try-error occurred or no data retrieved
        if (stopifnodata) {
          stop("No data could be extracted for '", item,
               "'. \nUse dbGetFieldsIntoDf(stopifnodata = ",
               "FALSE) to ignore this. ",
               call. = FALSE)
        } else {
          message("* No data: '", item, "'")
          # create empty data set
          dfi <- data.frame("_id" = NA, NA,
                            check.names = FALSE,
                            stringsAsFactors = FALSE)
        } # stopifnodata
      } # if

      # name result set
      names(dfi) <- c("_id", item)

      # type item field - note this introduces NAs
      # for fields with no values for a trial
      dfi <- typeField(dfi)

      # add to result
      dfi

    }) # end lapply

  # bring lists into data frame by trial id
  result <- Reduce(function(...) merge(..., all = TRUE, by = "_id"), result)

  # prune rows that do not have any results
  result <- result[
    !is.na(result[["_id"]]) &
      apply(
        X = result[, -1, drop = FALSE],
        MARGIN = 1,
        function(r) {
          r <- unlist(r, use.names = FALSE)
          r <- na.omit(r)
          r <- nchar(r)
          sum(r)
        }), ]

  # finalise output
  if (is.null(result) || !nrow(result)) {
    stop("No records with values for any specified field. ",
         call. = FALSE)
  }

  # add metadata
  result <- addMetaData(result,
                        con = con)

  # return
  return(result)
}
# end dbGetFieldsIntoDf


#' Extract information of interest (e.g., endpoint)
#' from long data frame of protocol- or result-related
#' trial information as returned by \link{dfTrials2Long}
#'
#' @param df A data frame with four columns (_id,
#'  identifier, name, value) as returned by
#'  \link{dfTrials2Long}
#'
#' @param valuename A character string for the name of the variable
#'  from which to extract information for the variable of interest
#'
#' @param wherename A character string to identify the variable
#'  of interest
#'
#' @param wherevalue A character string with the value of interest
#'  for the variable of interest
#'
#' @return A data frame with columns _id, identifier,
#'  name, value that only includes the values of interest,
#'  where value are strings unless all value elements
#'  are numbers.
#'
#' @export
#' @examples
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- ctrdata::dbGetFieldsIntoDf(
#' fields = c(
#'   # ctgov - typical results fields
#'   "clinical_results.baseline.analyzed_list.analyzed.count_list.count",
#'   "clinical_results.baseline.group_list.group",
#'   "clinical_results.baseline.analyzed_list.analyzed.units",
#'   "clinical_results.outcome_list.outcome",
#'   "study_design_info.allocation",
#'   # euctr - typical results fields
#'   "trialInformation.fullTitle",
#'   "subjectDisposition.recruitmentDetails",
#'   "baselineCharacteristics.baselineReportingGroups.baselineReportingGroup",
#'   "endPoints.endPoint",
#'   "trialChanges.hasGlobalInterruptions",
#'   "subjectAnalysisSets",
#'   "adverseEvents.seriousAdverseEvents.seriousAdverseEvent"
#'   ), con = dbc
#' )
#' # convert to long
#' reslong <- ctrdata::dfTrials2Long(
#'   df = df
#' )
#' # get values for endpoint of interest, duration of response
#' ctrdata::dfValue2Name(
#'   df = df,
#'   valuename = paste0(
#'     "endPoints.endPoint.*armReportingGroup.tendencyValues.tendencyValue.value|",
#'     "clinical_results.*category.measurement_list.measurement.value|",
#'     "clinical_results.*outcome.measure.units|endPoints.endPoint.unit"
#'   ),
#'   wherename = "clinical_results.*outcome.measure.title|endPoints.endPoint.title",
#'   wherevalue = "duration of response"
#' )
#' }
#'
dfName2Value <- function(df, valuename = "",
                         wherename = "", wherevalue = "") {

  # check parameters
  if (valuename == "") {
    stop("'valuename' must be specified.",
         call. = FALSE)
  }
  if (!identical(names(df),
                 c("_id", "identifier", "name", "value"))) {
    stop("'df' does not seem to come from dfTrials2Long()",
         call. = FALSE)
  }

  # indices of valuename
  indexVnames <- which(grepl(valuename, df[["name"]],
                             perl = TRUE, ignore.case = TRUE))
  if (!length(indexVnames)) stop("No rows found for 'valuename' = ", valuename)

  # if no where... are specified, just
  # return rows where name corresponds
  # to valuename
  if (wherename == "" & wherevalue == "") {

    # get relevant rows
    out <- df[indexVnames, ]

  } else {# if where... are specified, continue

    # get where... indices per trial
    indexRows <- which(
      grepl(wherename, df[["name"]], perl = TRUE, ignore.case = TRUE) &
      grepl(wherevalue, df[["value"]], perl = TRUE, ignore.case = TRUE))
    if (!length(indexRows)) stop("No rows found for 'wherename' and 'wherevalue'")

    # get trial ids and identifiers for where...
    indexCases <- df[indexRows, c("_id", "identifier")]

    # get output iterate over trials
    out <- apply(
      indexCases, 1,
      function(i) {
        ids <- Reduce(
          intersect, list(
            # trial id
            which(grepl(i[["_id"]], df[["_id"]], fixed = TRUE)),
            # identifier to match starting from left and
            # do not match e.g. 22 for identifier 2
            which(grepl(paste0("^", i[["identifier"]], "([.]|$)"),
                        df[["identifier"]])),
            # indices of sought valuename
            indexVnames
          ))
        # return value
        if (length(ids)) df[ids, ]
      }
    )

    # bind into data frame
    out <- do.call(
      rbind,
      c(out, stringsAsFactors = FALSE, make.row.names = FALSE))

  } # if where...

  # value column is character
  # try to convert it to numeric
  tmp <- suppressWarnings(
    as.numeric(out[["value"]])
  )
  # use if converted ok
  if (all(is.na(tmp) == is.na(out[["value"]]))) {
    out["value"] <- tmp
  }
  # remove any duplicates such as
  # from duplicate where... criteria
  out <- unique(out)

  # return
  return(out)

} # end dfName2Value


#' Extract trial information into long format
#'
#' The function works with procotol- and results-
#' related information. It converts lists and other
#' values into individual rows of a long data frame.
#' From the resulting data frame, values of interest
#' can then be selected (e.g. select an outcome
#' and its analysis by the identifier of the measure
#' which has "Hazard Ratio" in its name, see
#' \link{dfName2Value}).
#'
#' @param df Data frame with columns including
#'  the trial identifier (\code{_id}) and
#'  one or more variables as obtained from
#'  \link{dbGetFieldsIntoDf}
#'
#' @return A data frame with the four columns:
#'  _id, identifier, name, value
#'
#' @importFrom stringi stri_extract_all_charclass
#' @importFrom stringi stri_extract_first
#' @importFrom stringi stri_replace_first
#'
#' @export
#'
#' @examples
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- dbGetFieldsIntoDf(
#'   fields = c(
#'     "clinical_results.outcome_list.outcome"),
#'   con = db
#' )
#' dfTrials2Long(
#'   df = df
#' )
#' }
dfTrials2Long <- function(df) {

  # check parameters
  if (!any("_id" == names(df)) ||
      ncol(df) == 1L) stop(
        "Missing _id column or other variables in 'df'",
        call. = FALSE
      )
  if (any(c("identifier", "name", "value") %in% names(df))) stop(
        "Unexpected columns; 'df' should not come from dfTrials2Long",
        call. = FALSE
      )

  # helper function
  flattenDf <- function(x) {
    while (any(vapply(x, is.list, logical(1L)))) {
      x <- lapply(x, function(x) if (is.list(x)) x else list(x))
      x <- unlist(x, recursive = FALSE, use.names = TRUE)
    }
    x
  }

  # to add a first row in the next step,
  # columns that are not compatible with
  # adding a row are converted to character
  conv <- sapply(df, class) == "Date"
  conv <- seq_len(ncol(df))[conv]
  for (c in conv) df[, c] <- as.character(df[, c, drop = TRUE])

  # add a first row to df to hold item name
  # which otherwise is not available in apply
  df <- rbind(
    names(df),
    df)

  # iterative unnesting, by column
  out <- lapply(
    df[, -match("_id", names(df)), drop = FALSE],
    function(cc) {
      message(". ", appendLF = FALSE)
      # get item name as added in first row
      tn <- cc[[1]]
      # and by element in column
      lapply(cc[-1], function(c) {
        x <- unlist(flattenDf(c))
        if (!is.null(names(x))) tn <- names(x)
        if (is.null(x)) x <- NA
        data.frame(
          "name" = tn,
          "value" = x,
          check.names = FALSE,
          stringsAsFactors = FALSE,
          row.names = NULL)
      })})

  # add _id to list elements and
  # simplify into data frames
  tmpNames <- df[-1, "_id", drop = TRUE]
  out <- lapply(
    out, function(e) {
      message(". ", appendLF = FALSE)
      names(e) <- tmpNames
      # duplicate e to force generating
      # names in the later rbind step
      do.call(rbind, c(e, e, stringsAsFactors = FALSE))
    })

  # combine lists into data frame
  out <- do.call(rbind, c(out, stringsAsFactors = FALSE))
  message(". ", appendLF = FALSE)

  # remove rows where value is NA
  out <- out[!is.na(out[["value"]]), , drop = FALSE]

  # process row.names such as "clinical_results.NCT00082758.73"
  # to to obtain "clinical_results" as part of variable name
  names <- stringi::stri_replace_first(
    str = row.names(out), replacement = "",
    regex = c(paste0(".(", regCtgov, "|", regEuctr, "-[3A-Z]+)[.0-9]*")))

  # generate new data frame with target columns and order
  out <- data.frame(
    # process row.names to obtain trial id
    "_id" = stringi::stri_extract_first(
      str = row.names(out),
      regex = c(paste0(regCtgov, "|", regEuctr, "-[3A-Z]+"))),
    "identifier" = NA,
    "name" = out[["name"]],
    "value" = out[["value"]],
    check.names = FALSE,
    row.names = NULL,
    stringsAsFactors = FALSE)
  message(". ", appendLF = FALSE)

  # generate variable names from processed row names
  # and name unless the same is as already in name
  out[["name"]] <- ifelse(
    out[["name"]] == names,
    out[["name"]],
    paste0(names, ".0.", out[["name"]]))

  # name can have from 0 to about 6 number groups, get all
  # and concatenate to oid-like string such as "1.2.2.1.4"
  out[["identifier"]] <- vapply(
    stringi::stri_extract_all_regex(out[["name"]], "[0-9]+([.]|$)"),
    function(i) paste0(gsub("[.]", "", i), collapse = "."), character(1L))
  out[["identifier"]] [out[["identifier"]] == "NA"] <- "0"
  message(". ", appendLF = FALSE)

  # remove numbers from variable name
  out[["name"]] <- gsub(
    "[0-9]+([.])|[0-9]+$|[.]?@attributes", "\\1", out[["name"]], perl = TRUE)

  # remove any double separators
  out[["name"]] <- gsub("[.]+", ".", out[["name"]], perl = TRUE)

  # remove double rows from duplicating e above
  out <- unique(out)

  # inform
  message("\nTotal ", nrow(out), " rows, ",
          length(unique(out[["name"]])),
          " unique names of variables")

  # output
  return(out)

} # end dfTrials2Long


#' Extract named element(s) from list(s) into long-format
#' data frame
#'
#' The function uses a name (key) to extract an element
#' from a list in a data.frame such as obtained with
#' \link{dbGetFieldsIntoDf}. This helps to simplify
#' working with nested lists and with complex structures.
#'
#' @param df A data frame
#' @param list.key A list of pairs of list names and
#'  key names, where the list name corresponds to the
#'  name of a column in \code{df} that holds a list and
#'  the name of the key identifies the element to be
#'  extracted. See example.
#'
#' @return A data frame in long format with columns
#'  name (identifying the full path in the data frame,
#'  "<list>.<key>"), _id (of the trial record), value
#'  (of name per _id), item (number of value of name
#'  per _id).
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' db <- nodbi::src_sqlite(
#'   collection = "my_collection"
#' )
#' df <- dbGetFieldsIntoDf(
#'   fields = c(
#'     "endPoints.endPoint",
#'     "subjectDisposition.postAssignmentPeriods"),
#'   con = db
#' )
#' dfListExtractKey(
#'   df = df,
#'   list.key = list(
#'       c("endPoints.endPoint",
#'         "^title"),
#'       c("subjectDisposition.postAssignmentPeriods",
#'         "arms.arm.type.value")
#' ))
#' }
#'
dfListExtractKey <- function(
  df,
  list.key =
    list(c("endPoints.endPoint", "^title")
    )) {

  # deprecate
  .Deprecated(new = "dfName2Value")

  # check
  if (!any("_id" == names(df))) {
    stop("Data frame 'df' lacks '_id' column.",
         call. = FALSE)
  }

  # helper function to extract from
  # a named vector elements by name
  extractKey <- function(flattenedList, key) {

    # find element by key
    selected <- grepl(key,
                      names(flattenedList),
                      ignore.case = TRUE)


    # extract value for key
    extracted <- flattenedList[selected]

    # if key is not found, return a value
    # e.g. missing value (NA) or empty string ("")
    # please change as wanted for later processing
    if (length(extracted) == 0) extracted <- NA

    # return
    return(extracted)
  }

  # dots needs to be defined because passing
  # it in .Internal(mapply()) is not enough
  out <- lapply(
    list.key,
    function(k)
      lapply(df[[k[1]]],
             # k[1] = "endPoints.endPoint" identifies
             # the column in data frame with the list
             function(l) extractKey(
               unlist(l, recursive = TRUE, use.names = TRUE), k[2])
             # k[2] = "^title" identifies the key in the sublist
      ))

  out <- sapply(seq_along(list.key), function(li) {

    tmp <- out[[li]]

    tmp <- sapply(

      seq_along(tmp),
      function(ii) {

        data.frame(
          name = gsub("[-0-9]*$", "", # trailing number
                 gsub("[^a-zA-Z0-9_.-]", "",
                 paste0(list.key[[li]], collapse = "."))),
          "_id" = df[["_id"]][[ii]],
          value = tmp[[ii]],
          item = seq_along(tmp[[ii]]),
          row.names = NULL,
          stringsAsFactors = FALSE,
          check.names = FALSE)
      }, simplify = FALSE)

    do.call(rbind, tmp)

  }, simplify = FALSE)

  # return
  do.call(rbind, c(out, stringsAsFactors = FALSE, make.row.names = FALSE))

} # end dfListExtractKey


#' Merge two variables into one, optionally map values to new levels
#'
#' @param df A \link{data.frame} in which there are two variables (columns)
#' to be merged into one.
#' @param colnames A vector of length two with names of the two columns
#' that hold the variables to be merged. See \link{colnames} for how to
#' obtain the names of columns of a data frame.
#' @param levelslist A list with one slice each for a new value to be
#' used for a vector of old values (optional).
#' @param ... for deprecated \code{varnames} parameter (will be removed)
#'
#' @return A vector of strings
#'
#' @export
#'
#' @examples
#'
#' \dontrun{
#' statusvalues <- list(
#'   "ongoing" = c("Recruiting", "Active", "Ongoing",
#'                 "Active, not recruiting", "Enrolling by invitation"),
#'   "completed" = c("Completed", "Prematurely Ended", "Terminated"),
#'   "other" = c("Withdrawn", "Suspended",
#'               "No longer available", "Not yet recruiting"))
#'
#' dfMergeTwoVariablesRelevel(
#'   df = result,
#'   colnames = c("Recruitment", "x5_trial_status"),
#'   levelslist = statusvalues)
#' }
#'
dfMergeTwoVariablesRelevel <- function(
  df = NULL,
  colnames = "",
  levelslist = NULL,
  ...) {

  # check parameters

  # FIXME migrate from previously
  # used parameter "varnames"
  tmp <- match.call()
  tmp <- tmp["varnames"]
  tmp <- as.list(tmp)[[1]]
  if (length(tmp) == 3 && colnames == "") {
    colnames <- unlist(as.list(tmp[-1], use.names = FALSE))
    warning("Parameter varnames is deprecated, use colnames instead.",
            call. = FALSE)
  }

  # other checks
  if (class(df) != "data.frame") {
    stop("Need a data frame as input.", call. = FALSE)
  }
  if (length(colnames) != 2) {
    stop("Please provide exactly two column names.", call. = FALSE)
  }

  # find variables in data frame and merge
  tmp <- match(colnames, names(df))
  df <- df[, tmp]

  # bind as ...
  if (class(df[, 1]) == class(df[, 2]) &&
      class(df[, 1]) != "character") {
    # check
    if (nrow(na.omit(df[!vapply(df[, 1, drop = TRUE], is.null, logical(1L)) &
                        !vapply(df[, 2, drop = TRUE], is.null, logical(1L)), ,
                        drop = FALSE]))) {
      warning("Some rows had values for both columns, used first",
              noBreaks. = TRUE, immediate. = TRUE)
    }
    # values, with first having
    # priority over the second
    tmp <- ifelse(is.na(tt <- df[, 1]), df[, 2], df[, 1])
  } else {
    # check
    if (nrow(df[df[, 1] != "" &
                df[, 2] != "", , drop = FALSE])) {
      warning("Some rows had values for both columns, concatenated",
              noBreaks. = TRUE, immediate. = TRUE)
    }
    # strings, concatenated
    tmp <- paste0(
      ifelse(is.na(tt <- as.character(df[, 1])), "", tt),
      ifelse(is.na(tt <- as.character(df[, 2])), "", tt))
  }

  # type where possible
  if (class(df[, 1]) == class(df[, 2]) &&
      class(df[, 1]) != "character") {
    mode(tmp) <- mode(df[, 1])
    class(tmp) <- class(df[, 1])
  }

  # relevel if specified
  if (!is.null(levelslist)) {

    # check
    if (class(levelslist) != "list") {
      stop("Need list for parameter 'levelslist'.", call. = FALSE)
    }

    # helper function to collapse factor levels into the first
    refactor <- function(x, collapselevels, levelgroupname) {
      levels(x) [match(collapselevels, levels(x))] <- levelgroupname
      return(x)
    }

    # convert result to factor as this is needed for helper function
    tmp <- as.factor(tmp)

    # apply helperfunction to elements of the list
    for (i in seq_len(length(levelslist))) {
      tmp <- refactor(tmp, unlist(levelslist[i], use.names = FALSE),
                      attr(levelslist[i], "names"))
    }

    # convert factor back into string vector
    tmp <- as.character(tmp)

  }

  # check and inform user
  if (length(tt <- unique(tmp)) > 3L) {
    message("Unique values returned (first three): ",
            paste(tt[1L:3L], collapse = ", "))
  } else {
    message("Unique values returned: ",
            paste(tt, collapse = ", "))
  }

  # return
  return(tmp)
}
# end dfMergeTwoVariablesRelevel


#' Select single trial record from records of different EU Member States
#'
#' The EUCTR provides one record per trial per EU Member State in which the
#' trial is conducted. For all trials conducted in more than one Member State,
#' this function returns only one record per trial.
#'
#' Note: To deduplicate trials from different registers (EUCTR and CTGOV),
#' please first use function \code{\link{dbFindIdsUniqueTrials}}.
#'
#' @param df A data frame created from the database that includes the columns
#'   "_id" and "a2_eudract_number", for example created with function
#'   dbGetFieldsIntoDf(c("_id", "a2_eudract_number")).
#' @param prefermemberstate Code of single EU Member State for which records
#' should returned. If not available, a record for GB or lacking this, any
#' other record for the trial will be returned. For a list of codes of EU
#'   Member States, please see vector \code{countriesEUCTR}. Alternatively,
#'   "3RD" will lead to return a Third Country record of a trial, if available.
#' @param include3rdcountrytrials A logical value if trials should be retained
#'   that are conducted exclusively in third countries, that is, outside
#'   the European Union.
#'
#' @return A data frame as subset of \code{df} corresponding to the sought
#'   records.
#'
#' @keywords internal
#
dfFindUniqueEuctrRecord <- function(
  df = NULL,
  prefermemberstate = "GB",
  include3rdcountrytrials = TRUE) {

  # check parameters
  if (class(df) != "data.frame") {
    stop("Parameter df is not a data frame.", call. = FALSE)
  }
  #
  if (is.null(df[["_id"]]) ||
      is.null(df["a2_eudract_number"])) {
    stop('Data frame does not include "_id"',
         ' and "a2_eudract_number" columns.',
         call. = FALSE)
  }
  #
  if (nrow(df) == 0) {
    stop("Data frame does not contain records (0 rows).",
         call. = FALSE)
  }
  #
  if (!(prefermemberstate %in% countriesEUCTR)) {
    stop("Value specified for prefermemberstate does not match",
         " one of the recognised codes: ",
         paste(sort(countriesEUCTR), collapse = ", "),
         call. = FALSE)
  }

  # notify it mismatching parameters
  if (prefermemberstate == "3RD" & !include3rdcountrytrials) {
    warning("Preferred EUCTR version set to 3RD country trials, but ",
            "'include3rdcountrytrials' was FALSE, setting it to TRUE.",
            call. = FALSE,
            noBreaks. = FALSE,
            immediate. = FALSE)
    include3rdcountrytrials <- TRUE
  }

  # as a first step, handle 3rd country trials e.g. 2010-022945-52-3RD
  # if retained, these trials would count as record for a trial
  if (!include3rdcountrytrials) {
    df <- df[!grepl("-3RD", df[["_id"]]), ]
  }

  # count number of records by eudract number
  tbl <- table(df[["_id"]], df[["a2_eudract_number"]])
  tbl <- as.matrix(tbl)
  # nms has names of all records
  nms <- dimnames(tbl)[[1]]

  # nrs has eudract numbers for which is there more than 1 record
  nrs <- colSums(tbl)
  nrs <- nrs[nrs > 1]
  nrs <- names(nrs)

  # nst is a list of nrs trials of a logical vector along nms
  # that indicates if the indexed record belongs to the trial
  nms2 <- substr(nms, 1, 14)
  nst <- lapply(nrs, function(x) nms2 %in% x)

  # helper function to find the Member State version
  removeMSversions <- function(indexofrecords) {
    # given a vector of records (nnnn-nnnnnnn-nn-MS) of a single trial, this
    # returns all those _ids of records that do not correspond to the preferred
    # Member State record, based on the user's choices and defaults.
    # Function uses prefermemberstate, nms from the caller environment
    recordnames <- nms[indexofrecords]
    #
    # fnd should be only a single string, may need to be checked
    if (sum(fnd <- grepl(prefermemberstate, recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    if (sum(fnd <- grepl("GB", recordnames)) != 0) {
      result <- recordnames[!fnd]
      return(result)
    }
    #
    # default is to list all but first record
    # the listed records are the duplicates
    # 3RD country trials would be listed first
    # hence selected, which is not desirable
    # unless chosen as prefermemberstate
    return(rev(sort(recordnames))[-1])
  }

  # finds per trial the desired record;
  # uses prefermemberstate and nms
  result <- lapply(nst,
                   function(x) removeMSversions(x))
  result <- unlist(result, use.names = FALSE)

  # eleminate the unwanted EUCTR records
  df <- df[!(df[["_id"]] %in% result), ]

  # also eliminate the meta-info record
  df <- df[!(df[["_id"]] == "meta-info"), ]

  # inform user about changes to data frame
  if (length(nms) > (tmp <- length(result))) {
    message(
      " - ", tmp,
      " EUCTR _id were not preferred EU Member State record of trial")
  }

  # return
  return(df)

}
# end dfFindUniqueEuctrRecord


#' Change type of field based on name of field
#'
#' @param dfi a data frame of columns _id, fieldname
#'
#' @keywords internal
#'
typeField <- function(dfi) {

  # check
  if (ncol(dfi) != 2) {
    stop("Expect data frame with two columns, _id and a field.",
         call. = FALSE)
  }

  # clean up anyway in input
  #
  # - just return if all is a list, such as with parent elements
  if (inherits(dfi[, 2], "list")) return(dfi)
  #
  # - if NA as string, change to empty string
  if (all(class(dfi[, 2]) == "character")) dfi[dfi[, 2] == "NA", 2] <- ""
  #
  # - give Month Year also a Day to work with as.Date
  dfi[, 2] <- sub("^([a-zA-Z]+) ([0-9]{4})$", "\\1 15, \\2", dfi[, 2])

  # for date time conversion
  lct <- Sys.getlocale("LC_TIME")
  Sys.setlocale("LC_TIME", "C")
  on.exit(Sys.setlocale("LC_TIME", lct))

  # main typing functions
  ctrDate      <- function() as.Date(dfi[, 2], format = "%Y-%m-%d")
  ctrDateUs    <- function() as.Date(dfi[, 2], format = "%b %e, %Y")
  ctrDateCtr   <- function() as.Date(dfi[, 2], format = "%Y-%m-%d %H:%M:%S")
  ctrDateTime  <- function() as.Date(dfi[, 2], format = "%Y-%m-%dT%H:%M:%S")
  ctrYesNo     <- function() vapply(dfi[, 2], FUN = function(x) switch(x, "Yes" = TRUE, "No" = FALSE, NA), logical(1L))
  ctrFalseTrue <- function() vapply(dfi[, 2], FUN = function(x) switch(x, "true" = TRUE, "false" = FALSE, NA), logical(1L))
  ctrInt       <- function() vapply(dfi[, 2], FUN = function(x) as.integer(x = x), integer(1L))

  # selective typing
  tmp <- try({
    switch(
      EXPR = names(dfi)[2],
      #
      #
      # dates
      #
      # - intern
      "record_last_import" = ctrDateCtr(),
      # - EUCTR
      "n_date_of_ethics_committee_opinion"     = ctrDate(),
      "n_date_of_competent_authority_decision" = ctrDate(),
      "p_date_of_the_global_end_of_the_trial"  = ctrDate(),
      "x6_date_on_which_this_record_was_first_entered_in_the_eudract_database" = ctrDate(),
      "x7_start_date"                          = ctrDate(),
      "firstreceived_results_date"             = ctrDate(),
      "trialInformation.primaryCompletionDate" = ctrDate(),
      "trialInformation.globalEndOfTrialDate"  = ctrDateTime(),
      "trialInformation.recruitmentStartDate"  = ctrDateTime(),
      # - CTGOV
      "start_date"              = ctrDateUs(),
      "primary_completion_date" = ctrDateUs(),
      "completion_date"         = ctrDateUs(),
      "firstreceived_date"      = ctrDateUs(),
      "resultsfirst_posted"     = ctrDateUs(),
      "lastupdate_posted"       = ctrDateUs(),
      "lastchanged_date"        = ctrDateUs(),
      #
      #
      # factors
      #
      # - EUCTR Yes / No / Information not present in EudraCT
      "d21_imp_to_be_used_in_the_trial_has_a_marketing_authorisation" = ctrYesNo(),
      "e13_condition_being_studied_is_a_rare_disease" = ctrYesNo(),
      #
      "e61_diagnosis"         = ctrYesNo(),
      "e62_prophylaxis"       = ctrYesNo(),
      "e63_therapy"           = ctrYesNo(),
      "e64_safety"            = ctrYesNo(),
      "e65_efficacy"          = ctrYesNo(),
      "e66_pharmacokinetic"   = ctrYesNo(),
      "e67_pharmacodynamic"   = ctrYesNo(),
      "e68_bioequivalence"    = ctrYesNo(),
      "e69_dose_response"     = ctrYesNo(),
      "e610_pharmacogenetic"  = ctrYesNo(),
      "e611_pharmacogenomic"  = ctrYesNo(),
      "e612_pharmacoeconomic" = ctrYesNo(),
      "e613_others"           = ctrYesNo(),
      #
      "e71_human_pharmacology_phase_i"         = ctrYesNo(),
      "e711_first_administration_to_humans"    = ctrYesNo(),
      "e712_bioequivalence_study"              = ctrYesNo(),
      "e713_other"                             = ctrYesNo(),
      "e72_therapeutic_exploratory_phase_ii"   = ctrYesNo(),
      "e73_therapeutic_confirmatory_phase_iii" = ctrYesNo(),
      "e74_therapeutic_use_phase_iv"           = ctrYesNo(),
      #
      "e81_controlled"      = ctrYesNo(),
      "e811_randomised"     = ctrYesNo(),
      "e812_open"           = ctrYesNo(),
      "e813_single_blind"   = ctrYesNo(),
      "e814_double_blind"   = ctrYesNo(),
      "e815_parallel_group" = ctrYesNo(),
      "e816_cross_over"     = ctrYesNo(),
      "e817_other"          = ctrYesNo(),
      #
      "e83_the_trial_involves_single_site_in_the_member_state_concerned"    = ctrYesNo(),
      "e84_the_trial_involves_multiple_sites_in_the_member_state_concerned" = ctrYesNo(),
      "e85_the_trial_involves_multiple_member_states"                       = ctrYesNo(),
      "e861_trial_being_conducted_both_within_and_outside_the_eea"          = ctrYesNo(),
      "e862_trial_being_conducted_completely_outside_of_the_eea"            = ctrYesNo(),
      "e87_trial_has_a_data_monitoring_committee"                           = ctrYesNo(),
      #
      "f11_trial_has_subjects_under_18"            = ctrYesNo(),
      "f111_in_utero"                              = ctrYesNo(),
      "f112_preterm_newborn_infants_up_to_gestational_age__37_weeks" = ctrYesNo(),
      "f113_newborns_027_days"                     = ctrYesNo(),
      "f114_infants_and_toddlers_28_days23_months" = ctrYesNo(),
      "f115_children_211years"                     = ctrYesNo(),
      "f116_adolescents_1217_years"                = ctrYesNo(),
      "f12_adults_1864_years"                      = ctrYesNo(),
      "f13_elderly_65_years"                       = ctrYesNo(),
      "f21_female"                                 = ctrYesNo(),
      "f22_male"                                   = ctrYesNo(),
      "f31_healthy_volunteers"                     = ctrYesNo(),
      "f32_patients"                               = ctrYesNo(),
      "f33_specific_vulnerable_populations"        = ctrYesNo(),
      "f331_women_of_childbearing_potential_not_using_contraception_" = ctrYesNo(),
      "f332_women_of_childbearing_potential_using_contraception"      = ctrYesNo(),
      "f333_pregnant_women"      = ctrYesNo(),
      "f334_nursing_women"       = ctrYesNo(),
      "f335_emergency_situation" = ctrYesNo(),
      "f336_subjects_incapable_of_giving_consent_personally" = ctrYesNo(),
      #
      # - CTGOV
      "has_expanded_access"            = ctrYesNo(),
      "oversight_info.has_dmc"         = ctrYesNo(),
      "eligibility.healthy_volunteers" = ctrYesNo(),
      #
      #
      # numbers
      #
      # - EUCTR
      "e824_number_of_treatment_arms_in_the_trial" = ctrInt(),
      "e891_in_the_member_state_concerned_years"   = ctrInt(),
      "e891_in_the_member_state_concerned_months"  = ctrInt(),
      "e891_in_the_member_state_concerned_days"    = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_years"  = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_months" = ctrInt(),
      "e892_in_all_countries_concerned_by_the_trial_days"   = ctrInt(),
      "e841_number_of_sites_anticipated_in_member_state_concerned" = ctrInt(),
      "f11_number_of_subjects_for_this_age_range"   = ctrInt(),
      "f1111_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1121_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1131_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1141_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1151_number_of_subjects_for_this_age_range" = ctrInt(),
      "f1161_number_of_subjects_for_this_age_range" = ctrInt(),
      "f121_number_of_subjects_for_this_age_range"  = ctrInt(),
      "f131_number_of_subjects_for_this_age_range"  = ctrInt(),
      "f41_in_the_member_state"          = ctrInt(),
      "f421_in_the_eea"                  = ctrInt(),
      "f422_in_the_whole_clinical_trial" = ctrInt(),
      #
      # - CTGOV
      "number_of_arms" = ctrInt(),
      "enrollment"     = ctrInt(),
      #
      # TODO: results-related variables
      "trialInformation.analysisForPrimaryCompletion" = ctrFalseTrue()
      #
    )
  },
  silent = TRUE)

  # prepare output
  if (!inherits(tmp, "try-error") &&
      !is.null(unlist(tmp, use.names = FALSE))) {

    # need to construct new data frame,
    # since replacing columns with
    # posixct did not work
    dfn <- names(dfi)
    dfi <- data.frame(dfi[["_id"]],
                      tmp,
                      stringsAsFactors = FALSE)
    names(dfi) <- dfn

  }

  # return
  return(dfi)

} # end typeField


#' Annotate ctrdata function return values
#'
#' @param x object to be annotated
#'
#' @inheritParams ctrDb
#'
#' @keywords internal
#'
addMetaData <- function(x, con) {

  # add metadata
  attr(x, "ctrdata-dbname")         <- con$db
  attr(x, "ctrdata-table")          <- con$collection
  attr(x, "ctrdata-dbqueryhistory") <- dbQueryHistory(
    con = con,
    verbose = FALSE)

  # return annotated object
  return(x)

} # end addMetaData


#' Function to set proxy
#'
#' @importFrom curl ie_proxy_info
#'
#' @keywords internal
#'
setProxy <- function() {

  # only act if environment
  # variable is not already set
  if (Sys.getenv("https_proxy") == "") {

    # works under windows only
    p <- curl::ie_proxy_info()$Proxy

    if (!is.null(p)) {

      # used by httr and curl
      Sys.setenv(https_proxy = p)

    }
  }
} # end setproxy


#' Convenience function to install a minimal cygwin environment under MS
#' Windows, including perl, sed and php
#'
#' Alternatively and in case of difficulties, download and run the cygwin
#' setup yourself as follows: \code{cygwinsetup.exe --no-admin --quiet-mode
#' --verbose --upgrade-also --root c:/cygwin --site
#' http://www.mirrorservice.org/sites/sourceware.org/pub/cygwin/ --packages
#' perl,php-jsonc,php-simplexml}
#'
#' @export
#'
#' @param force Set to \code{TRUE} to force updating and overwriting an existing
#'   installation in \code{c:\\cygwin}
#' @param proxy Specify any proxy to be used for downloading via http, e.g.
#'   "host_or_ip:port". \code{installCygwinWindowsDoInstall} may detect and use
#'   the proxy configuration used in MS Windows to use an automatic proxy
#'   configuration script. Authenticated proxies are not supported at this time.
#'
installCygwinWindowsDoInstall <- function(
  force = FALSE,
  proxy = "") {

  # checks
  if (.Platform$OS.type != "windows") {
    stop(
      "This function is only for MS Windows operating systems.",
      call. = FALSE)
  }
  #
  if (!force & dir.exists("c:\\cygwin")) {
    message("cygwin is already installed in c:\\cygwin. ",
            "To re-install, use force = TRUE.")
    # exit function after testing
    return(installCygwinWindowsTest(verbose = TRUE))
  }

  # define installation command
  installcmd <- paste0(
    "--no-admin --quiet-mode --upgrade-also --no-shortcuts --prune-install ",
    "--root c:/cygwin ",
    "--site http://www.mirrorservice.org/sites/sourceware.org/pub/cygwin/ ",
    "--packages perl,php-simplexml,php-json")

  # create R session temporary directory
  tmpfile <- paste0(tempdir(), "/cygwin_inst")
  dir.create(tmpfile)
  dstfile <- paste0(tmpfile, "/cygwinsetup.exe")

  # generate download url
  tmpurl <- ifelse(
    grepl("x64", utils::win.version()),
    "setup-x86_64.exe",
    "setup-x86.exe")
  tmpurl <- paste0("https://cygwin.org/", tmpurl)

  # inform user
  message("Attempting cygwin download using ",
          tmpurl, " ...")

  # check and set proxy if needed to access internet
  setProxy()

  # download.file uses the proxy configured in the system
  tmpdl <- try({
    utils::download.file(
      url = tmpurl,
      destfile = dstfile,
      quiet = FALSE,
      mode = "wb")
  }, silent = TRUE)

  # check
  if (!file.exists(dstfile) ||
      file.size(dstfile) < (5 * 10 ^ 5) ||
      (inherits(tmpdl, "try-error"))) {
    stop("Failed, please download manually and install with:\n",
         tmpurl, " ", installcmd,
         call. = FALSE)
  }

  # proxy handling
  if (proxy != "") {
    # manual setting overriding
    proxy <- paste0(" --proxy ", proxy)
    message("Setting cygwin proxy install argument to: ",
            proxy, ", based on provided parameter.")
  } else {
    # detect proxy
    proxy <- curl::ie_proxy_info()$Proxy
    if (!is.null(proxy)) {
      message("Setting cygwin proxy install argument to: ",
              proxy, ", based on system settings.")
      proxy <- paste0(" --proxy ", proxy)
    }
  }

  # execute cygwin setup command
  system(paste0(dstfile, " ", installcmd,
                " --local-package-dir ", tmpfile, " ", proxy))

  # return cygwin installation test
  return(installCygwinWindowsTest(verbose = TRUE))

}
# end installCygwinWindowsDoInstall


#' Convenience function to test for working cygwin installation
#'
#' @param verbose If \code{TRUE}, prints confirmatory
#'  message (default \code{FALSE})
#'
#' @return Information if cygwin can be used, \code{TRUE}
#'  or \code{FALSE}, or NULL if not under MS Windows
#'
#' @keywords internal
#
installCygwinWindowsTest <- function(verbose = FALSE) {
  #
  if (.Platform$OS.type != "windows") {
    message("Function installCygwinWindowsTest() is ",
            "only for MS Windows operating systems.")
    return(invisible(NULL))
  }
  #
  tmpcygwin <- try({
    suppressWarnings(
      system(
        paste0("cmd.exe /c ",
               rev(Sys.glob("c:\\cygw*\\bin\\bash.exe"))[1],
               " --version"),
        intern = TRUE,
        ignore.stderr = TRUE
      ))},
    silent = TRUE)
  #
  if ((class(tmpcygwin) != "try-error") &
      (length(tmpcygwin) > 5)) {
    if (verbose) message("cygwin seems to work correctly.")
    return(invisible(TRUE))
  } else {
    message("cygwin is not available for this package, ",
            "ctrLoadQueryIntoDb() will not work.\n",
            "Consider calling ",
            "ctrdata::installCygwinWindowsDoInstall() ",
            "from within R.")
    return(invisible(FALSE))
  }
}
# end installCygwinWindowsTest


#' Check availability of binaries installed locally
#'
#' @param commandtest Command to be used for testing
#' the availability of the binary, e.g. "php -v".
#' Note internal quotes need to be escaped, e.g.
#' \code{installFindBinary('php -r
#' \"simplexml_load_string(\'\');\"')}.
#' See R/onload.R for tested binaries.
#'
#' @param verbose Set to \code{TRUE} to see printed
#' return value of \code{commandtest}
#'
#' @return A logical if executing commandtest
#' returned an error or not
#'
#' @keywords internal
#
installFindBinary <- function(commandtest = NULL, verbose = FALSE) {
  #
  if (is.null(commandtest)) {
    stop("Empty argument: commandtest",
         call. = FALSE)
  }
  #
  if (.Platform$OS.type == "windows") {
    commandtest <-
      paste0(rev(Sys.glob("c:\\cygw*\\bin\\bash.exe"))[1],
             " --login -c ",
             shQuote(commandtest))
  }
  #
  if (verbose) print(commandtest)
  #
  commandresult <- try(
    suppressWarnings(
      system(commandtest,
             intern = TRUE,
             ignore.stderr =
               ifelse(.Platform$OS.type == "windows",
                      FALSE, TRUE))),
    silent = TRUE
  )
  #
  commandreturn <- ifelse(
    class(commandresult) == "try-error" ||
      grepl("error|not found", tolower(paste(commandresult, collapse = " "))) ||
      (!is.null(attr(commandresult, "status")) &&
         (attr(commandresult, "status") != 0)),
    FALSE, TRUE)
  #
  if (!commandreturn) {
    warning(commandtest, " not found.",
            call. = FALSE,
            immediate. = FALSE)
  } else {
    if (interactive()) {
      message(". ", appendLF = FALSE)
    }
  }
  #
  if (verbose) {
    print(commandresult)
  }
  #
  return(commandreturn)
  #
}
# end installFindBinary
