% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/approx_near_optimal_survey_scheme.R
\name{approx_near_optimal_survey_scheme}
\alias{approx_near_optimal_survey_scheme}
\title{Approximately near optimal survey scheme}
\usage{
approx_near_optimal_survey_scheme(
  site_data,
  feature_data,
  site_detection_columns,
  site_n_surveys_columns,
  site_probability_columns,
  site_management_cost_column,
  site_survey_cost_column,
  feature_survey_column,
  feature_survey_sensitivity_column,
  feature_survey_specificity_column,
  feature_model_sensitivity_column,
  feature_model_specificity_column,
  feature_target_column,
  total_budget,
  survey_budget,
  site_management_locked_in_column = NULL,
  site_management_locked_out_column = NULL,
  site_survey_locked_out_column = NULL,
  prior_matrix = NULL,
  n_approx_replicates = 100,
  n_approx_outcomes_per_replicate = 10000,
  seed = 500,
  n_threads = 1,
  verbose = FALSE
)
}
\arguments{
\item{site_data}{\code{\link[sf:sf]{sf::sf()}} object with site data.}

\item{feature_data}{\code{\link[base:data.frame]{base::data.frame()}} object with feature data.}

\item{site_detection_columns}{\code{character} names of \code{numeric}
columns in the argument to \code{site_data} that contain the proportion of
surveys conducted within each site that detected each feature.
Each column should correspond to a different feature, and contain
a proportion value (between zero and one). If a site has
not previously been surveyed, a value of zero should be used.}

\item{site_n_surveys_columns}{\code{character} names of \code{numeric}
columns in the argument to \code{site_data} that contain the total
number of surveys conducted for each feature within each site.
Each column should correspond to a different feature, and contain
a non-negative integer number (e.g. 0, 1, 2, 3). If a site has
not previously been surveyed, a value of zero should be used.}

\item{site_probability_columns}{\code{character} names of \code{numeric}
columns in the argument to \code{site_data} that contain modelled
probabilities of occupancy for each feature in each site.
Each column should correspond to a different feature, and contain
probability data (values between zero and one). No missing (\code{NA})
values are permitted in these columns.}

\item{site_management_cost_column}{\code{character} name of column in the
argument to \code{site_data} that contains costs for managing each
site for conservation. This column should have \code{numeric} values that
are equal to or greater than zero. No missing (\code{NA}) values are
permitted in this column.}

\item{site_survey_cost_column}{\code{character} name of column in the
argument to \code{site_data} that contains costs for surveying each
site. This column should have \code{numeric} values that are equal to
or greater than zero. No missing (\code{NA}) values are permitted in this
column.}

\item{feature_survey_column}{\code{character} name of the column in the
argument to \code{feature_data} that contains \code{logical} (\code{TRUE} /
\code{FALSE}) values indicating if the feature will be surveyed in
the planned surveys or not. Note that considering additional features will
rapidly increase computational burden, and so it is only recommended to
consider features that are of specific conservation interest.
No missing (\code{NA}) values are permitted in this column.}

\item{feature_survey_sensitivity_column}{\code{character} name of the
column in the argument to \code{feature_data} that contains
probability of future surveys correctly detecting a presence of each
feature in a given site (i.e. the sensitivity of the survey methodology).
This column should have \code{numeric} values that are between zero and
one. No missing (\code{NA}) values are permitted in this column.}

\item{feature_survey_specificity_column}{\code{character} name of the
column in the argument to \code{feature_data} that contains
probability of future surveys correctly detecting an absence of each
feature in a given site (i.e. the specificity of the survey methodology).
This column should have \code{numeric} values that are between zero and
one. No missing (\code{NA}) values are permitted in this column.}

\item{feature_model_sensitivity_column}{\code{character} name of the
column in the argument to \code{feature_data} that contains
probability of the initial models correctly predicting a presence of each
feature in a given site (i.e. the sensitivity of the models).
This column should have \code{numeric} values that are between zero and
one. No missing (\code{NA}) values are permitted in this column.
This should ideally be calculated using
\code{\link[=fit_xgb_occupancy_models]{fit_xgb_occupancy_models()}} or
\code{\link[=fit_hglm_occupancy_models]{fit_hglm_occupancy_models()}}.}

\item{feature_model_specificity_column}{\code{character} name of the
column in the argument to \code{feature_data} that contains
probability of the initial models correctly predicting an absence of each
feature in a given site (i.e. the specificity of the models).
This column should have \code{numeric} values that are between zero and
one. No missing (\code{NA}) values are permitted in this column.
This should ideally be calculated using
\code{\link[=fit_xgb_occupancy_models]{fit_xgb_occupancy_models()}} or
\code{\link[=fit_hglm_occupancy_models]{fit_hglm_occupancy_models()}}.}

\item{feature_target_column}{\code{character} name of the column in the
argument to \code{feature_data} that contains the \eqn{target}
values used to parametrize the conservation benefit of managing each
feature.
This column should have \code{numeric} values that
are equal to or greater than zero. No missing (\code{NA}) values are
permitted in this column.}

\item{total_budget}{\code{numeric} maximum expenditure permitted
for conducting surveys and managing sites for conservation.}

\item{survey_budget}{\code{numeric} maximum expenditure permitted
for conducting surveys.}

\item{site_management_locked_in_column}{\code{character} name of the column
in the argument to \code{site_data} that contains \code{logical}
(\code{TRUE} / \code{FALSE}) values indicating which sites should
be locked in for (\code{TRUE}) being managed for conservation or
(\code{FALSE}) not. No missing (\code{NA}) values are permitted in this
column. This is useful if some sites have already been earmarked for
conservation, or if some sites are already being managed for conservation.
Defaults to \code{NULL} such that no sites are locked in.}

\item{site_management_locked_out_column}{\code{character} name of the column
in the argument to \code{site_data} that contains \code{logical}
(\code{TRUE} / \code{FALSE}) values indicating which sites should
be locked out (\code{TRUE}) from being managed for conservation or
(\code{FALSE}) not. No missing (\code{NA}) values are permitted in this
column. This is useful if some sites could potentially be surveyed
to improve model predictions even if they cannot be managed for
conservation. Defaults to \code{NULL} such that no sites are locked out.}

\item{site_survey_locked_out_column}{\code{character} name of the column
in the argument to \code{site_data} that contains \code{logical}
(\code{TRUE} / \code{FALSE}) values indicating which sites should
be locked out (\code{TRUE}) from being selected for future surveys or
(\code{FALSE}) not. No missing (\code{NA}) values are permitted in this
column. This is useful if some sites will never be considered for future
surveys (e.g. because they are too costly to survey, or have a
low chance of containing the target species).
Defaults to \code{NULL} such that no sites are locked out.}

\item{prior_matrix}{\code{numeric} \code{matrix} containing
the prior probability of each feature occupying each site.
Rows correspond to features, and columns correspond to sites.
Defaults to \code{NULL} such that prior data is calculated automatically
using \code{\link[=prior_probability_matrix]{prior_probability_matrix()}}.}

\item{n_approx_replicates}{\code{integer} number of replicates to use for
approximating the expected value calculations. Defaults to 100.}

\item{n_approx_outcomes_per_replicate}{\code{integer} number of outcomes to
use per replicate for approximation calculations. Defaults to 10000.}

\item{seed}{\code{integer} state of the random number generator for
simulating outcomes when conducting the value of information analyses.
Defaults to 500.}

\item{n_threads}{\code{integer} number of threads to use for computation.
Defaults to 1.}

\item{verbose}{\code{logical} indicating if information should be
printed during processing. Defaults to \code{FALSE}.}
}
\value{
\code{matrix} of \code{logical} (\code{TRUE} / \code{FALSE})
values indicating if a site is selected in the scheme or not. Columns
correspond to sites, and rows correspond to different schemes. If there
are no ties for the best identified solution, then the \code{matrix}
will only contain a single row.
}
\description{
Find a near optimal survey scheme that maximizes value of information.
This function uses the approximation method
for calculating the expected value of the decision given a survey scheme,
and a greedy heuristic algorithm to maximize this metric.
}
\details{
Ideally, the brute-force algorithm would be used to identify the optimal
survey scheme. Unfortunately, it is not feasible to apply the brute-force
algorithm to large problems because it can take an incredibly long time to
complete.
In such cases, it may be desirable to obtain a "relatively good" survey
scheme and the greedy heuristic algorithm is provided for such cases.
The greedy heuristic algorithm -- unlike the brute force algorithm --
is not guaranteed to identify an optimal solution -- or even a "relatively
good solution" for that matter -- though greedy heuristic algorithms tend to
deliver solutions that are within 15\% of optimality. Specifically, this
greedy algorithm is implemented as:

\enumerate{

\item Initialize an empty \emph{list of survey scheme solutions}, and an
empty \emph{list of approximate expected values}.

\item Calculate the expected value of current information.

\item Add a survey scheme with no sites selected for surveying to the
\emph{list of survey scheme solutions}, and add the expected value of current
information to the \emph{list of approximate expected values}.

\item Set the \emph{current survey solution} as the survey scheme with no
sites selected for surveying.

\item For each remaining candidate site that has not been selected for
a survey, generate a new candidate survey scheme with each candidate site
added to the current survey solution.

\item Calculate the approximate expected value of each
new candidate survey scheme. If the cost of a given candidate survey scheme
exceeds the survey budget, then store a missing (\code{NA}) value instead.
Also, if the cost of a given candidate survey scheme plus the
management costs of locked in planning units exceeds the total budget,
then store a missing (\code{NA}) value too.

\item If all of the new candidate survey schemes are associated with
missing \code{NA} values -- because they all exceed the survey budget -- then
go to step 12.

\item Calculate the cost effectiveness of each new candidate survey
scheme. This is calculated by taking the difference between the approximate
expected value of a given new candidate survey scheme and that of the
\emph{current survey solution}, and dividing this difference by the cost
of the newly selected candidate site.

\item Find the new candidate survey scheme that is associated with the
highest cost-effectiveness value, ignoring any missing \code{NA} values.
This new candidate survey scheme is now set as the
\emph{current survey solution}.

\item Store the \emph{current survey solution} in the
\emph{list of survey scheme solutions} and store its approximate expected
value in the \emph{list of approximate expected values}.

\item Go to step 5.

\item Find the solution in the \emph{list of survey scheme solutions} that
has the highest expected value in the
\emph{list of approximate expected values} and return this solution.

}
}
\examples{
# set seeds for reproducibility
library(RandomFields)
set.seed(123)
RFoptions(seed = 201)

# simulate data
site_data <- simulate_site_data(n_sites = 30, n_features = 2, prop = 0.1)
feature_data <- simulate_feature_data(n_features = 2, prop = 1)
feature_data$target <- c(10, 10)

# preview simulated data
print(site_data)
print(feature_data)

# set total budget for surveying and managing sites for conservation
# (i.e. 50\% of the cost of managing all sites)
total_budget <- sum(site_data$management_cost) * 0.5

# set budget for surveying sites
# (i.e. 10\% of the cost of surveying all sites)
survey_budget <- sum(site_data$survey_cost) * 0.1

# find survey scheme using approximate method and greedy heuristic algorithm
# (using 10 replicates, instead of the default 100, so that this example
# completes relatively quickly)
approx_near_optimal_survey <- approx_near_optimal_survey_scheme(
  site_data, feature_data,
  c("f1", "f2"), c("n1", "n2"), c("p1", "p2"),
  "management_cost", "survey_cost",
  "survey", "survey_sensitivity", "survey_specificity",
  "model_sensitivity", "model_specificity",
  "target", total_budget, survey_budget,
  n_approx_replicates = 10)

# print result
# (logical matrix: columns are sites, rows are equally good survey schemes)
print(approx_near_optimal_survey)
}
