% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dtjoin.R
\name{dtjoin}
\alias{dtjoin}
\title{Join data frame-like objects using an extended \code{DT[i]}-style interface
to data.table}
\usage{
dtjoin(
  .DT = NULL,
  .i = NULL,
  on,
  match.na = FALSE,
  mult = "all",
  mult.DT = "all",
  nomatch = NA,
  nomatch.DT = NULL,
  indicate = FALSE,
  select = NULL,
  select.DT = NULL,
  select.i = NULL,
  both = FALSE,
  on.first = FALSE,
  i.home = FALSE,
  i.first = i.home,
  prefix = if (i.home) "x." else "i.",
  i.class = i.home,
  do = !(is.null(.DT) && is.null(.i)),
  show = !do,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{.DT, .i}{\code{data.frame}-like objects (plain, \code{data.table}, tibble,
\code{sf}, \code{list}, etc.), or else both omitted for a mock join
statement with no data.}

\item{on}{A character vector of join predicates, e.g. \code{c("id", "col_DT
== col_i", "date < date", "cost <= budget")}, or else \code{NA} for a
natural join (an equality join on all same-named columns).}

\item{match.na}{If \code{TRUE}, allow equality matches between \code{NA}s or
\code{NaN}s. Default \code{FALSE}.}

\item{mult}{(as in \code{[.data.table}) When a row of \code{.i} has multiple
matching rows in \code{.DT}, which to accept. One of \code{"all"} (the
default), \code{"first"}, or \code{"last"}.}

\item{mult.DT}{Like \code{mult}, but with the roles of \code{.DT} and
\code{.i} reversed, i.e. when a row of \code{.DT} has multiple matching
rows in \code{.i}, which to accept (default \code{"all"}). Can be combined
with \code{mult}. See Details.}

\item{nomatch}{(as in \code{[.data.table}) Either \code{NA} (the default) to
retain rows of \code{.i} with no match in \code{.DT}, or \code{NULL} to
exclude them.}

\item{nomatch.DT}{Like \code{nomatch} but with the roles of \code{.DT} and
\code{.i} reversed, and a different default: either \code{NA} to append
rows of \code{.DT} with no match in \code{.i}, or \code{NULL} (the default)
to leave them out.}

\item{indicate}{Whether to add a column \code{".join"} at the front of the
result, with values \code{1L} if from the "home" table only, \code{2L} if
from the "foreign" table only, and \code{3L} if joined from both tables
(cf. \code{_merge} in Stata). Default \code{FALSE}.}

\item{select, select.DT, select.i}{Character vectors of columns to be selected
from either input if present (\code{select}) or specifically from one or
other (\code{select.DT}, \code{select.i}). \code{NULL} (the default)
selects all columns. Use \code{""} or \code{NA} to select no columns. Join
columns are always selected. See Details.}

\item{both}{Whether to include equality join columns from the "foreign"
table separately in the output, instead of combining them with those from
the "home" table. Default \code{FALSE}. Note that non-equality join columns
from the foreign table are always included separately.}

\item{on.first}{Whether to place the join columns from both inputs first in
the join result. Default \code{FALSE}.}

\item{i.home}{Whether to treat \code{.i} as the "home" table and \code{.DT}
as the "foreign" table for column prefixing and \code{indicate}. Default
\code{FALSE}, i.e. \code{.DT} is the "home" table, as in
\code{[.data.table}.}

\item{i.first}{Whether to place \code{.i}'s columns before \code{.DT}'s in
the join result. The default is to use the value of \code{i.home}, i.e.
bring \code{.i}'s columns to the front if \code{.i} is the "home" table.}

\item{prefix}{A prefix to attach to column names in the "foreign" table that
are the same as a column name in the "home" table. The default is
\code{"i."} if the "foreign" table is \code{.i} (\code{i.home} is
\code{FALSE}) and \code{"x."} if it is \code{.DT} (\code{i.home} is
\code{TRUE}).}

\item{i.class}{Whether the \code{class} of the output should be based on
\code{.i} instead of \code{.DT}. The default follows \code{i.home} (default
\code{FALSE}). See Details for how output \code{class} and other attributes
are set.}

\item{do}{Whether to execute the join. Default is \code{TRUE} unless
\code{.DT} and \code{.i} are both omitted/\code{NULL}, in which case a mock
join statement is produced.}

\item{show}{Whether to print the code for the join to the console. Default is
the opposite of \code{do}. If \code{.DT} and \code{.i} are both
omitted/\code{NULL}, mock join code is displayed.}

\item{verbose}{(passed to \code{[.data.table}) Whether data.table should
print information to the console during execution. Default \code{FALSE}.}

\item{...}{Further arguments (for internal use).}
}
\value{
A \code{data.frame}, \code{data.table}, (grouped) tibble, \code{sf},
or \code{sf}-tibble, or else \code{NULL} if \code{do} is \code{FALSE}. See
Details.
}
\description{
Write (and optionally run) \pkg{data.table} code for a join
using a generalisation of \code{DT[i]} syntax with extended arguments and
enhanced behaviour. Accepts any \code{data.frame}-like inputs (not only
\code{data.table}s), permits left, right, inner, and full joins, prevents
unwanted matches on \code{NA} and \code{NaN} by default, does not garble join
columns in non-equality joins, allows \code{mult} on both sides of the join,
creates an optional join indicator column, allows specifying which columns to
select from each input, and provides convenience options to control column
order and prefixing.

If run, the join returns a \code{data.frame}, \code{data.table}, tibble,
\code{sf}, or \code{sf}-tibble according to context. The generated
\pkg{data.table} code can be printed to the console instead of (or as well
as) being executed. This feature extends to \emph{mock joins}, in which no
inputs are provided and template code is produced instead.

\code{dtjoin} is the workhorse function for \code{\link{fjoin_inner}},
\code{\link{fjoin_left}}, \code{\link{fjoin_right}}, and
\code{\link{fjoin_full}}, which are wrappers providing a more conventional
interface for join operations. These functions are recommended over
\code{dtjoin} for most users and cases.
}
\details{
\subsection{Input and output class}{
Each input can be any object with class \code{data.frame}, or a plain
\code{list} of same-length vectors.

The output class is determined by \code{.DT} by default (or by \code{.i} if
\code{i.class = TRUE}) and is as follows:
\itemize{
  \item a \code{data.table} if the input is a pure \code{data.table}
  \item a tibble if it is a tibble (and a grouped tibble if it has class
  \code{grouped_df})
  \item an \code{sf} if it is an \code{sf} with its active geometry selected
  in the join
  \item a plain \code{data.frame} in all other cases
}
The following attributes are carried through and refreshed: \code{data.table}
key, tibble \code{groups}, \code{sf} \code{agr} (and \code{bbox} etc. of all
individual \code{sfc}-class columns regardless of output class). See below
for specifics. Other classes and attributes are not carried through.
}

\subsection{Specifying join conditions with \code{on}}{
\code{on} is a required argument. For a natural join (a join by equality on
all same-named column pairs), you must specify \code{on = NA}; you can't just
omit \code{on} as in other packages. This is to prevent a natural join being
specified by mistake, which may then go unnoticed.
}

\subsection{Using \code{select}, \code{select.DT}, and \code{select.i}}{
Used on its own, \code{select} keeps the join columns plus the specified
non-join columns from both inputs if present.

If \code{select.DT} is provided (and similarly for \code{select.i}) then:
\itemize{
 \item if \code{select} is also specified, non-join columns of \code{.DT}
 named in either \code{select} or \code{select.DT} are included
 \item if \code{select} is not specified, only non-join columns named in
 \code{select.DT} are included from \code{.DT}. Thus e.g.
 \code{select.DT = ""} excludes all of \code{.DT}'s non-join columns.
}
Non-existent column names are ignored without warning.
}

\subsection{Column order}{
When \code{select} is specified but \code{select.DT} and \code{select.i} are
not, the output consists of all join columns followed by the selected
non-join columns from either input in the order given in \code{select}.

In all other cases:
\itemize{
  \item columns from \code{.DT} come before columns from \code{.i} by default
  (but vice versa if \code{i.first} is \code{TRUE})
  \item within each group of columns, non-join columns are in the order
  given by \code{select.DT}/\code{select.i}, or in their original data order
  if no selection is provided
  \item if \code{on.first} is \code{TRUE}, join columns from both inputs are
  moved to the front of the overall output.
}
}

\subsection{Using \code{mult} and \code{mult.DT}}{
If both of these arguments are set to something other than the default
\code{"all"}, \code{mult} is applied first (typically by passing it directly
to \code{[.data.table}) and \code{mult.DT} is applied subsequently to
eliminate all but the first or last occurrence of each row of \code{.DT} from
the inner part of the join, producing a 1:1 result. This order of operations
can affect the identity of the rows in the inner join.
}

\subsection{Displaying code and 'mock joins'}{
The option of displaying the join code with \code{show = TRUE} or by passing
null inputs is aimed at \pkg{data.table} users wanting to use the package as
a cookbook of recipes for adaptation. If \code{.DT} and \code{.i} are both
\code{NULL}, template code is displayed based on join column names implied by
\code{on}, plus sample non-join column names. \code{select} arguments are
ignored in this case.

The code displayed is for the join operation after casting the inputs as
\code{data.table}s if necessary, and before casting the result as a tibble
and/or \code{sf} if applicable. Note that \pkg{fjoin} departs from the usual
\code{j = list()} idiom in order to avoid a deep copy of the output made by
\code{as.data.table.list}. (Likewise, internally it takes only shallow copies
of columns when casting inputs or outputs to different classes.)
}

\subsection{tibble \code{groups}}{
If the relevant input is a grouped tibble (class \code{grouped_df}), the
output is grouped by the grouping columns that are selected in the result.
}

\subsection{\pkg{data.table} \code{key}s}{
If \code{.i} is a \code{key}ed \code{data.table} and the output is also a
\code{data.table}, it inherits \code{.i}'s key provided
\code{nomatch.DT} is \code{NULL} (i.e. the non-matching rows of \code{.DT}
are not included in the result). This differs from a \pkg{data.table}
\code{DT[i]} join, in which the output inherits the key of \code{DT}
provided it remains sorted on those columns. If not all of the key columns
are selected in the result, the leading subset is used.
}

\subsection{\pkg{sf} objects and \code{sfc}-class columns}{
Joins between two \code{sf} objects are supported. The relation-to-geometry
attribute \code{agr} is inherited from the input supplying the active
geometry. All \code{sfc}-class columns in the output are refreshed after
joining (using \code{sf::st_sfc()} with \code{recompute_bbox = TRUE}); this
is true regardless of whether or not the inputs and output are \code{sf}s.
}
}
\examples{
# An illustration showing:
# - two calls to fjoin_left() (commented out), differing in the `order` argument
# - the resulting calls to dtjoin(), plus `show = TRUE`
# - the generated data.table code and output

# data frames
set.seed(1)
df_x <- data.frame(id_x = 1:3, col_x = paste0("x", 1:3), val = runif(3))
df_y <- data.frame(id_y = rep(4:2, each = 2), col_y = paste0("y", 1:6), val = runif(6))

# ---------------------------------------------------------------------------

# (1) fjoin_left(df_x, df_y, on = "id_x == id_y", mult.x = "first")
dtjoin(
  df_y,
  df_x,
  on = "id_y == id_x",
  mult = "first",
  i.home = TRUE,
  prefix = "R.",
  show = TRUE
)

# (2) fjoin_left(df_x, df_y, on = "id_x == id_y", mult.x = "first", order = "right")
dtjoin(
  df_x,
  df_y,
  on = "id_x == id_y",
  mult.DT = "first",
  nomatch = NULL,
  nomatch.DT = NA,
  prefix = "R.",
  show = TRUE
)

}
\seealso{
See the package-level documentation \code{\link{fjoin}} for related
 functions.
}
