% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/tokens.R
\name{tokens}
\alias{tokens}
\title{Construct a tokens object}
\usage{
tokens(
  x,
  what = "word",
  remove_punct = FALSE,
  remove_symbols = FALSE,
  remove_numbers = FALSE,
  remove_url = FALSE,
  remove_separators = TRUE,
  split_hyphens = FALSE,
  split_tags = FALSE,
  include_docvars = TRUE,
  padding = FALSE,
  verbose = quanteda_options("verbose"),
  ...
)
}
\arguments{
\item{x}{the input object to the tokens constructor, one of: a (uniquely)
named \strong{list} of characters; a \link{tokens} object; or a \link{corpus} or
\link{character} object that will be tokenized}

\item{what}{character; which tokenizer to use.  The default \code{what = "word"}
is the version 2 \pkg{quanteda} tokenizer.  Legacy tokenizers (version < 2)
are also supported, including the pre-version 2 default, now available as
\code{what = "word1"}. See the Details and quanteda Tokenizers sections below.}

\item{remove_punct}{logical; if \code{TRUE} remove all characters in the Unicode
"Punctuation" \verb{[P]} class, with exceptions for those used as prefixes for
valid social media tags if \code{split_tags = FALSE}}

\item{remove_symbols}{logical; if \code{TRUE} remove all characters in the Unicode
"Symbol" \verb{[S]} class}

\item{remove_numbers}{logical; if \code{TRUE} remove tokens that consist only of
numbers, but not words that start with digits, e.g. \verb{2day}}

\item{remove_url}{logical; if \code{TRUE} find and eliminate URLs beginning with
http(s)}

\item{remove_separators}{logical; if \code{TRUE} remove separators and separator
characters (Unicode "Separator" \verb{[Z]} and "Control" \verb{[C]} categories)}

\item{split_hyphens}{logical; if \code{FALSE}, do not split words that are
connected by hyphenation and hyphenation-like characters in between words;
if \code{TRUE}, split them, e.g. \code{"self-aware"} becomes
\code{c("self", "-", "aware")}}

\item{split_tags}{logical; if \code{FALSE}, do not split social media tags defined
in \code{quanteda_options()}. The default patterns are \code{pattern_hashtag = "#\\\\w+#?"} and \code{pattern_username = "@[a-zA-Z0-9_]+"}.}

\item{include_docvars}{if \code{TRUE}, pass docvars through to the tokens object.
Does not apply when the input is character data or a list of characters.}

\item{padding}{if \code{TRUE}, leave an empty string where the removed tokens
previously existed.  This is useful if a positional match is needed between
the pre- and post-selected tokens, for instance if a window of adjacency
needs to be computed.}

\item{verbose}{if \code{TRUE}, print timing messages to the console}

\item{...}{used to pass arguments among the functions}
}
\value{
\pkg{quanteda} \code{tokens} class object, by default a serialized list of
integers corresponding to a vector of types.
}
\description{
Construct a tokens object, either by importing a named list of characters
from an external tokenizer, or by calling the internal \pkg{quanteda}
tokenizer.
}
\details{
\code{tokens()} works on tokens class objects, which means that the removal rules
can be applied post-tokenization, although it should be noted that it will
not be possible to remove things that are not present.  For instance, if the
\code{tokens} object has already had punctuation removed, then \code{tokens(x, remove_punct = TRUE)} will have no additional effect.
}
\section{Details}{
 As of version 2, the choice of tokenizer is left more to
the user, and \code{tokens()} is treated more as a constructor (from a named
list) than a tokenizer. This allows users to use any other tokenizer that
returns a named list, and to use this as an input to \code{tokens()}, with
removal and splitting rules applied after this has been constructed (passed
as arguments).  These removal and splitting rules are conservative and will
not remove or split anything, however, unless the user requests it.

You usually do not want to split hyphenated words or social media tags, but
extra steps are required to preserve such special tokens. If there are many
random characters in your texts, you should set \code{split_hyphens = TRUE} and
\code{split_tags = TRUE} to avoid a slowdown in tokenization.

Using external tokenizers is best done by piping the output from these
other tokenizers into the \code{tokens()} constructor, with additional removal
and splitting options applied at the construction stage.  These will only
have an effect, however, if the tokens for which removal is specified in
the \code{tokens()} call actually exist.  For instance, it is impossible to remove
punctuation if the input list to \code{tokens()} already had its punctuation
tokens removed at the external tokenization stage.

To construct a tokens object from a list with no additional processing,
call \code{\link[=as.tokens]{as.tokens()}} instead of \code{tokens()}.

Recommended tokenizers are those from the \pkg{tokenizers} package, which
are generally faster than the default (built-in) tokenizer but always
split infix hyphens, or \pkg{spacyr}.
}

\section{quanteda Tokenizers}{
 The default word tokenizer \code{what = "word"}
splits tokens using \link[stringi:stri_split_boundaries]{stri_split_boundaries(x, type = "word")} but by default preserves infix
hyphens (e.g. "self-funding"), URLs, social media "tag" characters
(#hashtags and @usernames), and email addresses.  The rules defining a
valid "tag" can be found at
https://www.hashtags.org/featured/what-characters-can-a-hashtag-include/
for hashtags and at
https://help.twitter.com/en/managing-your-account/twitter-username-rules
for usernames.

In versions < 2, the argument \code{remove_twitter} controlled whether social
media tags were preserved or removed, even when \code{remove_punct = TRUE}. This
argument is no longer functional in versions >= 2.  If greater control
over social media tags is desired, you should use an alternative
tokenizer, including non-\pkg{quanteda} options.

For backward compatibility, the following older tokenizers are also
supported through \code{what}: \describe{ \item{\code{"word1"}}{(legacy) implements
similar behaviour to the version of \code{what = "word"} found in versions < 2.
(It preserves social media tags and infix hyphens, but splits URLs.)
"word1" is also slower than "word".} \item{\code{"fasterword"}}{(legacy) splits
on whitespace and control characters, using
\code{stringi::stri_split_charclass(x, "[\\\\p{Z}\\\\p{C}]+")}}
\item{\code{"fastestword"}}{(legacy) splits on the space character, using
\code{stringi::stri_split_fixed(x, " ")}} \item{\code{"character"}}{tokenization into
individual characters} \item{\code{"sentence"}}{sentence segmenter based on
\link[stringi:stri_split_boundaries]{stri_split_boundaries}, but with
additional rules to avoid splits on words like "Mr." that would otherwise
incorrectly be detected as sentence boundaries.  For better sentence
tokenization, consider using \pkg{spacyr}.} }
}

\examples{
txt <- c(doc1 = "A sentence, showing how tokens() works.",
         doc2 = "@quantedainit and #textanalysis https://example.com?p=123.",
         doc3 = "Self-documenting code??",
         doc4 = "£1,000,000 for 50¢ is gr8 4ever \U0001f600")
tokens(txt)
tokens(txt, what = "word1")

# removing punctuation marks but keeping tags and URLs
tokens(txt[1:2], remove_punct = TRUE)

# splitting hyphenated words
tokens(txt[3])
tokens(txt[3], split_hyphens = TRUE)

# symbols and numbers
tokens(txt[4])
tokens(txt[4], remove_numbers = TRUE)
tokens(txt[4], remove_numbers = TRUE, remove_symbols = TRUE)

\dontrun{# using other tokenizers
tokens(tokenizers::tokenize_words(txt[4]), remove_symbols = TRUE)
tokenizers::tokenize_words(txt, lowercase = FALSE, strip_punct = FALSE) \%>\%
    tokens(remove_symbols = TRUE)
tokenizers::tokenize_characters(txt[3], strip_non_alphanum = FALSE) \%>\%
    tokens(remove_punct = TRUE)
tokenizers::tokenize_sentences(
    "The quick brown fox.  It jumped over the lazy dog.") \%>\%
    tokens()
}

}
\seealso{
\code{\link[=tokens_ngrams]{tokens_ngrams()}}, \code{\link[=tokens_skipgrams]{tokens_skipgrams()}}, \code{\link[=as.list.tokens]{as.list.tokens()}},
\code{\link[=as.tokens]{as.tokens()}}
}
\keyword{tokens}
