% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_nns.R
\name{get_nns}
\alias{get_nns}
\title{Given a tokenized corpus and a set of candidate neighbors, find the top N nearest
neighbors.}
\usage{
get_nns(
  x,
  N = 10,
  groups = NULL,
  candidates = character(0),
  pre_trained,
  transform = TRUE,
  transform_matrix,
  bootstrap = TRUE,
  num_bootstraps = 100,
  confidence_level = 0.95,
  stem = FALSE,
  language = "porter",
  as_list = TRUE
)
}
\arguments{
\item{x}{a (quanteda) \code{tokens-class} object}

\item{N}{(numeric) number of nearest neighbors to return}

\item{groups}{a character or factor variable equal in length to the number of documents}

\item{candidates}{(character) vector of features to consider as candidate nearest neighbors.
You may, for example, want to only consider features that meet a certain count threshold
or exclude stop words, etc. To do so, simply identify the set of features you
want to consider and supply these as a character vector in the \code{candidates} argument.}

\item{pre_trained}{(numeric) an F x D matrix corresponding to pretrained embeddings.
F = number of features and D = embedding dimensions.
rownames(pre_trained) = set of features for which there is a pre-trained embedding.}

\item{transform}{(logical) if TRUE (default) apply the 'a la carte' transformation,
if FALSE output untransformed averaged embeddings.}

\item{transform_matrix}{(numeric) a D x D 'a la carte' transformation matrix.
D = dimensions of pretrained embeddings.}

\item{bootstrap}{(logical) if TRUE, use bootstrapping -- sample from \code{x} with replacement and
re-estimate cosine similarities for each sample. Required to get std. errors.
If \code{groups} is defined, sampling is automatically stratified.}

\item{num_bootstraps}{(integer) number of bootstraps to use.}

\item{confidence_level}{(numeric in (0,1)) confidence level e.g. 0.95}

\item{stem}{(logical) whether to stem candidates when evaluating nearest neighbors. Default is FALSE.
If TRUE, candidate stems are ranked by their average cosine similarity to the target.
We recommend you remove misspelled words from the candidate set \code{candidates}, as these can
significantly influence the average.}

\item{language}{the name of a recognized language, as returned by
     \code{\link[SnowballC]{getStemLanguages}}, or a two- or three-letter ISO-639
     code corresponding to one of these languages (see references for
     the list of codes).
  }

\item{as_list}{(logical) if FALSE all results are combined into a single data.frame.
If TRUE, a list of data.frames is returned with one data.frame per group.}
}
\value{
a \code{data.frame} or list of data.frames (one for each target)
with the following columns:
\describe{
\item{\code{target}}{ (character) rownames of \code{x},
the labels of the ALC embeddings. \code{NA} if \code{is.null(rownames(x))}.}
\item{\code{feature}}{(character) features identified as nearest neighbors.}
\item{\code{rank}}{(character) rank of feature in terms of similarity with \code{x}.}
\item{\code{value}}{(numeric) cosine similarity between \code{x}
and feature. Average over bootstrapped samples if bootstrap = TRUE.}
\item{\code{std.error}}{(numeric) std. error of the similarity value.
Column is dropped if bootstrap = FALSE.}
\item{\code{lower.ci}}{(numeric) (if bootstrap = TRUE) lower bound of the confidence interval.}
\item{\code{upper.ci}}{(numeric) (if bootstrap = TRUE) upper bound of the confidence interval.}
}
}
\description{
This is a wrapper function for \code{nns()} that allows users to go from a
tokenized corpus to results with the option to bootstrap cosine similarities
and get the corresponding std. errors.
}
\examples{

library(quanteda)

# tokenize corpus
toks <- tokens(cr_sample_corpus)

# build a tokenized corpus of contexts surrounding a target term
# (target pattern: "immigration"; context window size: 6)
immig_toks <- tokens_context(x = toks, pattern = "immigration", window = 6L)

# sample 100 instances of the target term, stratifying by party (only for example purposes)
# the seed is set first so that the sample is reproducible
set.seed(2022L)
immig_toks <- tokens_sample(immig_toks, size = 100, by = docvars(immig_toks, 'party'))

# we limit candidates to features in our corpus
# (i.e. the feature names of a dfm built from the sampled contexts)
feats <- featnames(dfm(immig_toks))

# compare nearest neighbors between groups
# the seed is set again because bootstrap = TRUE resamples from x
set.seed(2021L)
immig_party_nns <- get_nns(x = immig_toks, N = 10,
                           groups = docvars(immig_toks, 'party'),
                           candidates = feats,
                           pre_trained = cr_glove_subset,
                           transform = TRUE,
                           transform_matrix = cr_transform,
                           bootstrap = TRUE,
                           num_bootstraps = 100,
                           stem = TRUE,
                           as_list = TRUE)

# as_list = TRUE, so the result is a list with one data.frame per group
# nearest neighbors of "immigration" for Republican party
immig_party_nns[["R"]]
}
\keyword{get_nns}
