% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/links.R
\name{links}
\alias{links}
\title{Multistage deterministic record linkage}
\usage{
links(
  criteria,
  sub_criteria = NULL,
  sn = NULL,
  strata = NULL,
  data_source = NULL,
  data_links = "ANY",
  display = "none",
  group_stats = FALSE,
  expand = TRUE,
  shrink = FALSE,
  recursive = FALSE,
  check_duplicates = FALSE,
  tie_sort = NULL
)
}
\arguments{
\item{criteria}{\code{[list|atomic]}. Attributes to compare. Each element of the list is a stage in the linkage process. See \code{Details}.}

\item{sub_criteria}{\code{[list|\link{sub_criteria}]}. Additional matching criteria for each stage of the linkage process. See \code{\link{sub_criteria}}.}

\item{sn}{\code{[integer]}. Unique record identifier. Useful for creating familiar \code{\link[=pid-class]{pid}} identifiers.}

\item{strata}{\code{[atomic]}. Subsets of the dataset. Record-groups are created separately for each \code{strata}. See \code{Details}.}

\item{data_source}{\code{[character]}. Data source identifier. Adds the list of data sources in each record-group to the \code{\link[=pid-class]{pid}}. Useful when the data is from multiple sources.}

\item{data_links}{\code{[list|character]}. A set of \code{data_sources} required in each \code{\link[=pid-class]{pid}}. A record-group without records from these \code{data_sources} will be \code{\link[=delink]{unlinked}}. See \code{Details}.}

\item{display}{\code{[character]}. Display or produce a status update. Options are: \code{"none"} (default), \code{"progress"}, \code{"stats"}, \code{"none_with_report"}, \code{"progress_with_report"} or \code{"stats_with_report"}.}

\item{group_stats}{\code{[logical]}. If \code{TRUE}, return group-specific information like record counts for each \code{\link[=pid-class]{pid}}. The default is \code{FALSE}.}

\item{expand}{\code{[logical]}. If \code{TRUE}, allows a record-group to expand with each subsequent stage of the linkage process. \emph{Not interchangeable with \code{shrink}}.}

\item{shrink}{\code{[logical]}. If \code{TRUE}, forces a record-group to shrink with each subsequent stage of the linkage process. \emph{Not interchangeable with \code{expand}}.}

\item{recursive}{\code{[logical]}. If \code{TRUE}, within each iteration of the process, a match can spawn new matches.}

\item{check_duplicates}{\code{[logical]}. If \code{TRUE}, within each iteration of the process, duplicate values of an attribute are not checked. The outcome of the logical test on the first instance of the value will be recycled for the duplicate values.}

\item{tie_sort}{\code{[atomic]}. Preferential order for breaking tied matches within a stage.}
}
\value{
\code{\link[=pid-class]{pid}}; \code{list}
}
\description{
Match records in consecutive stages with different matching criteria.
Each set of linked records is assigned a unique identifier with relevant group-level information.
}
\details{
Match priority decreases with each subsequent stage of the linkage process
i.e. earlier stages (\code{criteria}) are considered superior.
Therefore, it's important to list the \code{criteria} in order of decreasing relevance.

Records with missing \code{criteria} (\code{NA}) are skipped at each stage, while
records with missing \code{strata} (\code{NA}) are excluded from the entire linkage process.

If a record is skipped, another attempt will be made to match the record at the next stage.
If a record does not match any other record by the end of the linkage process (or it has a missing \code{strata}),
it is assigned to a unique record-group.

A \code{\link{sub_criteria}} can be used to request additional matching conditions for each stage of the linkage process.
When used, only records with a matching \code{criteria} and \code{sub_criteria} are linked.

In \bold{\code{\link{links}}}, each \code{\link{sub_criteria}} must be linked to a \code{criteria}.
This is done by adding a \code{\link{sub_criteria}} to a named element of a \code{list}.
Each element's name must correspond to a stage. See below for an example of 3 \code{sub_criteria} linked to
\code{criteria} \code{1}, \code{5} and \code{13}.

For example:

\preformatted{list("cr1" = sub_criteria(...), "cr5" = sub_criteria(...), "cr13" = sub_criteria(...))}

\code{\link{sub_criteria}} can be nested to achieve nested conditions.

A \code{\link{sub_criteria}} can be linked to different \code{criteria} but any unlinked \code{\link{sub_criteria}} will be ignored.

By default, attributes in a \code{\link{sub_criteria}} are compared for an \code{\link{exact_match}}.
However, user-defined functions are also permitted. Such functions must meet three requirements:
\enumerate{
\item It must be able to compare the attributes.
\item It must have two arguments named \code{`x`} and \code{`y`}, where \code{`y`} is the value for one observation being compared against all other observations (\code{`x`}).
\item It must return a \code{logical} object i.e. \code{TRUE} or \code{FALSE}.
}

Every element in \code{data_links} must be named \code{"l"} (links) or \code{"g"} (groups).
Unnamed elements of \code{data_links} will be assumed to be \code{"l"}.
\itemize{
\item If named \code{"l"}, only groups with records from every listed \code{data_source} will remain linked.
\item If named \code{"g"}, only groups with records from any listed \code{data_source} will remain linked.
}

See \code{vignette("links")} for more information.
}
\examples{
# Exact match
attr_1 <- c(1, 1, 1, NA, NA, NA, NA, NA)
attr_2 <- c(NA, NA, 2, 2, 2, NA, NA, NA)
links(criteria = list(attr_1, attr_2))

# User-defined tests using `sub_criteria()`
# Matching `sex` and a 20-year age range
age <- c(30, 28, 40, 25, 25, 29, 27)
sex <- c("M", "M", "M", "F", "M", "M", "F")
f1 <- function(x, y) abs(y - x) \%in\% 0:20
links(criteria = sex,
      sub_criteria = list(cr1 = sub_criteria(age, match_funcs = f1)))

# Multistage matches
# Relevance of matches: `forename` > `surname`
data(staff_records); staff_records
links(criteria = list(staff_records$forename, staff_records$surname),
      data_source = staff_records$sex)

# Relevance of matches:
# `staff_id` > `age` (AND (`initials`, `hair_colour` OR `branch_office`))
data(missing_staff_id); missing_staff_id
links(criteria = list(missing_staff_id$staff_id, missing_staff_id$age),
      sub_criteria = list(cr2 = sub_criteria(missing_staff_id$initials,
                                          missing_staff_id$hair_colour,
                                          missing_staff_id$branch_office)),
      data_source = missing_staff_id$source_1)

# Group expansion
match_cri <- list(c(1,NA,NA,1,NA,NA),
                  c(1,1,1,2,2,2),
                  c(3,3,3,2,2,2))
links(criteria = match_cri, expand = TRUE)
links(criteria = match_cri, expand = FALSE)
links(criteria = match_cri, shrink = TRUE)

}
\seealso{
\code{\link{link_records}}; \code{\link{episodes}}; \code{\link{partitions}}; \code{\link{predefined_tests}}; \code{\link{sub_criteria}}; \code{\link{schema}}
}
