% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/string_tools.R
\name{string_split2df}
\alias{string_split2df}
\alias{string_split2dt}
\title{Splits a character vector into a data frame}
\usage{
string_split2df(
  x,
  data = NULL,
  split = NULL,
  id = NULL,
  add.pos = FALSE,
  id_unik = TRUE,
  fixed = FALSE,
  ignore.case = FALSE,
  word = FALSE,
  envir = parent.frame(),
  dt = FALSE,
  ...
)

string_split2dt(
  x,
  data = NULL,
  split = NULL,
  id = NULL,
  add.pos = FALSE,
  id_unik = TRUE,
  fixed = FALSE
)
}
\arguments{
\item{x}{A character vector or a two-sided formula. If a two-sided formula is given, the
argument \code{data} must be provided since the variables are fetched from it.
A formula is of the form \code{char_var ~ id1 + id2} where \code{char_var} on the left is a
character variable and on the right \code{id1} and \code{id2} are identifiers which will be
included in the resulting table. Alternatively, you can provide identifiers via
the argument \code{id}.}

\item{data}{Optional, only used if the argument \code{x} is a formula. It should
contain the variables of the formula.}

\item{split}{A character scalar. Used to split the character vectors. By default
this is a regular expression. You can use flags in the pattern in the form \verb{flag1, flag2/pattern}.
Available flags are \code{ignore} (ignores the case), \code{fixed} (no regex), \code{word} (add word boundaries),
\code{magic} (add interpolation with \code{"{}"}). Example:
if the pattern is "ignore/hello" and the text contains "Hello", the text will be split at "Hello".
Shortcut: use the first letters of the flags. Ex: "iw/one" will split at the word
"one" (flags 'ignore' + 'word').}

\item{id}{Optional. A character vector or a list of vectors. If provided, the
values of \code{id} are considered as identifiers that will be included in the resulting table.}

\item{add.pos}{Logical, default is \code{FALSE}. Whether to include the position of each split element.}

\item{id_unik}{Logical, default is \code{TRUE}. In the case identifiers are provided,
whether to trigger a message if the identifiers are not unique. Indeed, if
the identifiers are not unique, it is not possible to reconstruct the original texts.}

\item{fixed}{Logical, default is \code{FALSE}. Whether to consider the argument \code{split}
as fixed (and not as a regular expression).}

\item{ignore.case}{Logical scalar, default is \code{FALSE}. If \code{TRUE}, then the search is case-insensitive.}

\item{word}{Logical scalar, default is \code{FALSE}. If \code{TRUE} then a) word boundaries are added to the pattern,
and b) patterns can be chained by separating them with a comma; they are then combined with a logical OR.
Example: if \code{word = TRUE}, then the pattern "The, mountain" will match either the word
'The' or the word 'mountain'.}

\item{envir}{Environment in which to evaluate the interpolations if the flag \code{"magic"} is provided.
Default is \code{parent.frame()}.}

\item{dt}{Logical, default is \code{FALSE}. Whether to return a \code{data.table}. See also the function \code{string_split2dt}.}

\item{...}{Not currently used.}
}
\value{
It returns a \code{data.frame} or a \code{data.table} which will contain: i) \code{obs}: the observation index,
ii) \code{pos}: the position of the text element in the initial string (optional, only if \code{add.pos = TRUE}),
iii) the text element, iv) the identifier(s) (optional, only if \code{id} was provided).
}
\description{
Splits a character vector and formats the resulting substrings into a \code{data.frame}.
}
\section{Functions}{
\itemize{
\item \code{string_split2dt()}: Splits a string vector and returns a \code{data.table}

}}
\examples{

x = c("Nor rain, wind, thunder, fire are my daughters.",
      "When my information changes, I alter my conclusions.")

id = c("ws", "jmk")

# we split at each word
string_split2df(x, "[[:punct:] ]+")

# we add the 'id'
string_split2df(x, "[[:punct:] ]+", id = id)

# NOTE:
# - the second argument is `data`
# - when `data` is missing, `split` implicitly becomes the second argument
# - ex: above we did not need to write `split = "[[:punct:] ]+"`

#
# using the formula

base = data.frame(text = x, my_id = id)
string_split2df(text ~ my_id, base, "[[:punct:] ]+")

#
# with 2+ identifiers

base = within(mtcars, carname <- rownames(mtcars))

# we have a message because the identifiers are not unique
string_split2df(carname ~ am + gear + carb, base, " +")

# adding the position of the words & removing the message
string_split2df(carname ~ am + gear + carb, base, " +", id_unik = FALSE, add.pos = TRUE)


}
\seealso{
String operations: \code{\link[=string_is]{string_is()}}, \code{\link[=string_get]{string_get()}}, \code{\link[=string_clean]{string_clean()}}, \code{\link[=string_split2df]{string_split2df()}}.
Chain basic operations with \code{\link[=string_ops]{string_ops()}}. Clean character vectors efficiently
with \code{\link[=string_clean]{string_clean()}}.

Use \code{\link[=string_vec]{string_vec()}} to create simple string vectors.

String interpolation combined with operation chaining: \code{\link[=string_magic]{string_magic()}}. You can change \code{string_magic}
default values with \code{\link[=string_magic_alias]{string_magic_alias()}} and add custom operations with \code{\link[=string_magic_register_fun]{string_magic_register_fun()}}.

Display messages while benefiting from \code{string_magic} interpolation with \code{\link[=cat_magic]{cat_magic()}} and \code{\link[=message_magic]{message_magic()}}.

Other tools with aliases: 
\code{\link{cat_magic_alias}()},
\code{\link{string_magic}()},
\code{\link{string_magic_alias}()},
\code{\link{string_ops_alias}()},
\code{\link{string_vec_alias}()}
}
