% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{AhoCorasickSearch}
\alias{AhoCorasickSearch}
\title{Fast searching for one or more keywords in one or more texts}
\usage{
AhoCorasickSearch(keywords, text, alphabet = "ascii",
  groupByKeyword = FALSE)
}
\arguments{
\item{keywords}{Character vector of one or more keywords}

\item{text}{Character vector of one or more texts to search}

\item{alphabet}{Alphabet to use; one of \code{ascii}, \code{aminoacid}, or \code{nucleicacid}}

\item{groupByKeyword}{If \code{TRUE}, matches are grouped by keyword (instead of by text)}
}
\value{
List of matches, grouped either by text or by keyword
}
\description{
Builds an Aho-Corasick trie from one or more keywords and uses it to
  search one or more texts. For a large number of keywords, Aho-Corasick is much faster
  than a naive approach (such as \code{lapply(keywords, gregexpr, text)}).

  Use \code{\link{AhoCorasickSearchList}} instead of \code{\link{AhoCorasickSearch}} when you want to keep the matches
  of each input text separate. If the input texts have names, the resulting list of matches will include those
  names and non-matched texts will be excluded from the results. If the input texts do
  not have names, then the resulting list of matches will be in the same order as the
  input texts, and non-matched texts will be kept to preserve that order. Thus, it is more
  efficient to use named input texts (so non-matched texts can be dropped).

  The default alphabet allows all 128 ASCII characters in the keywords and the texts.
  Characters outside this range will cause an error. A more efficient trie is possible
  if the alphabet size can be reduced. For example, DNA sequences use at most 19 distinct
  characters and usually only 4; protein sequences use at most 26 distinct characters and
  usually only 20. Set the \code{alphabet} parameter if a reduced alphabet is appropriate.

  UTF-8 (Unicode) matching is not currently supported.
}
\examples{
# Helper for the checks below: TRUE when two (possibly nested) lists hold the
# same values in the same order. Both sides are flattened with unlist, names
# are ignored, and values are compared with == (so 4 and 4L count as equal).
# Two empty or NULL inputs are equal; an empty input never equals a non-empty
# one, and inputs of different lengths are never equal (no recycling).
listEquals = function(a, b) {
  x = unlist(a, use.names = FALSE)
  y = unlist(b, use.names = FALSE)
  # unlist gives NULL for NULL and for empty lists: equal only if both are empty
  if (is.null(x) || is.null(y)) return(is.null(x) && is.null(y))
  length(x) == length(y) && all(x == y)
}

# 1. One text, several keywords
# * offsets are 1-based character positions within the text
# * the first match is "Abra" at offset 4: the keyword "is" does not match the leading "Is"
keywords = c("Abra", "cadabra", "is", "the", "Magic", "Word")
oneSearch = AhoCorasickSearch(keywords, "Is Abracadabra the Magic Word?")
textMatches = oneSearch[[1]]
stopifnot(
  listEquals(textMatches[[1]], list(keyword="Abra", offset=4)),
  listEquals(textMatches[[2]], list(keyword="cadabra", offset=8)),
  listEquals(textMatches[[3]], list(keyword="the", offset=16)),
  listEquals(textMatches[[4]], list(keyword="Magic", offset=20)),
  listEquals(textMatches[[5]], list(keyword="Word", offset=26))
)

# 2. Several named texts (a named character vector), aminoacid alphabet, keyword grouping
# * all matches to a keyword are looked up under that keyword's name
# * each expected match pairs the name of the text with the offset inside it
# * non-matched keywords (here "PEPPIE") are dropped
proteins = c(protein1="PEPTIDEPEPTIDEDADADARARARARAKEKEKEKEPEPTIDE",
             protein2="DERPADERPAPEWPEWPEEPEERAWRAWWARRAGTAGPEPTIDEKESEQUENCE")
peptides = c("PEPTIDE", "DERPA", "SEQUENCE", "KEKE", "PEPPIE")

peptideSearch = AhoCorasickSearch(peptides, proteins, alphabet="aminoacid", groupByKeyword=TRUE)
stopifnot(
  listEquals(peptideSearch$PEPTIDE,
             list(list(keyword="protein1", offset=1),
                  list(keyword="protein1", offset=8),
                  list(keyword="protein1", offset=37),
                  list(keyword="protein2", offset=38))),
  listEquals(peptideSearch$DERPA,
             list(list(keyword="protein2", offset=1),
                  list(keyword="protein2", offset=6))),
  listEquals(peptideSearch$SEQUENCE,
             list(list(keyword="protein2", offset=47))),
  listEquals(peptideSearch$KEKE,
             list(list(keyword="protein1", offset=29),
                  list(keyword="protein1", offset=31),
                  list(keyword="protein1", offset=33))),
  listEquals(peptideSearch$PEPPIE, NULL)
)

# 3. Keyword grouping when the texts are unnamed (default alphabet):
#    each match is reported as a bare offset, with no reference to its text
names(proteins) = NULL
peptideSearch = AhoCorasickSearch(peptides, proteins, groupByKeyword=TRUE)
stopifnot(
  listEquals(peptideSearch$PEPTIDE, list(1, 8, 37, 38)),
  listEquals(peptideSearch$DERPA, list(1, 6)),
  listEquals(peptideSearch$SEQUENCE, list(47)),
  listEquals(peptideSearch$KEKE, list(29, 31, 33))
)
}
\seealso{
\itemize{
\item \href{http://www.codeproject.com/Articles/12383/Aho-Corasick-string-matching-in-C}{Aho-Corasick string matching in C#} for the article this package is based on
\item \code{\link[Biostrings]{matchPDict}} and \code{\link[Starr]{match_ac}} for more memory-efficient, but DNA-only, implementations of the algorithm.
}
}

