% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/preparation.R
\name{bow_pp_create_basic_text_rep}
\alias{bow_pp_create_basic_text_rep}
\title{Prepare texts for text embeddings with a bag-of-words approach.}
\usage{
bow_pp_create_basic_text_rep(
  data,
  vocab_draft,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE,
  split_hyphens = FALSE,
  split_tags = FALSE,
  language_stopwords = "de",
  use_lemmata = FALSE,
  to_lower = FALSE,
  min_termfreq = NULL,
  min_docfreq = NULL,
  max_docfreq = NULL,
  window = 5,
  weights = 1/(1:5),
  trace = TRUE
)
}
\arguments{
\item{data}{\code{vector} containing the raw texts.}

\item{vocab_draft}{Object created with \link{bow_pp_create_vocab_draft}.}

\item{remove_punct}{\code{bool} \code{TRUE} if punctuation should be removed.}

\item{remove_symbols}{\code{bool} \code{TRUE} if symbols should be removed.}

\item{remove_numbers}{\code{bool} \code{TRUE} if numbers should be removed.}

\item{remove_url}{\code{bool} \code{TRUE} if URLs should be removed.}

\item{remove_separators}{\code{bool} \code{TRUE} if separators should be removed.}

\item{split_hyphens}{\code{bool} \code{TRUE} if hyphenated words should be split into several tokens.}

\item{split_tags}{\code{bool} \code{TRUE} if tags should be split.}

\item{language_stopwords}{\code{string} Abbreviation for the language for which stopwords should be
removed.}

\item{use_lemmata}{\code{bool} \code{TRUE} if lemmas should be used instead of the original tokens.}

\item{to_lower}{\code{bool} \code{TRUE} if tokens or lemmas should be converted to lower case.}

\item{min_termfreq}{\code{int} Minimum frequency of a token to be part of the vocabulary.}

\item{min_docfreq}{\code{int} Minimum appearance of a token in documents to be part of the vocabulary.}

\item{max_docfreq}{\code{int} Maximum appearance of a token in documents to be part of the vocabulary.}

\item{window}{\code{int} Size of the window for creating the feature-co-occurrence matrix.}

\item{weights}{\code{vector} weights for the corresponding window. The vector length must be equal to the window size.}

\item{trace}{\code{bool} \code{TRUE} if information about the progress should be
printed to the console.}
}
\value{
Returns a \code{list} of class \code{basic_text_rep} with the following components.
\itemize{
\item{\code{dfm: }}{Document-Feature-Matrix. Rows correspond to the documents. Columns correspond
to the tokens; each cell contains the number of times a token occurs in the document.}

\item{\code{fcm: }}{Feature-Co-Occurrence-Matrix.}

\item{\code{information: }}{\code{list} containing information about the used vocabulary. These are:
 \itemize{
 \item{\code{n_sentence: }} {Number of sentences}
 \item{\code{n_document_segments: }} {Number of document segments/raw texts}
 \item{\code{n_token_init: }} {Number of initial tokens}
 \item{\code{n_token_final: }} {Number of final tokens}
 \item{\code{n_lemmata: }} {Number of lemmas}
  }}

\item{\code{configuration: }}{\code{list} containing information on whether the vocabulary was
created in lower case and whether the vocabulary uses original tokens or lemmas.}

\item{\code{language_model: }}{\code{list} containing information about the applied
language model. These are:
\itemize{
\item{\code{model: }} {the udpipe language model}
\item{\code{label: }} {the label of the udpipe language model}
\item{\code{upos: }} {the applied universal part-of-speech tags}
\item{\code{language: }} {the language}
\item{\code{vocab: }} {a \code{data.frame} with the original vocabulary}
}}

}
}
\description{
This function prepares raw texts for use with \link{TextEmbeddingModel}.
}
\seealso{
Other Preparation: 
\code{\link{bow_pp_create_vocab_draft}()}
}
\concept{Preparation}
