% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/simulating_functions.R
\name{simulate_gaussian}
\alias{simulate_gaussian}
\alias{simulate_binomial}
\alias{simulate_gamma}
\alias{simulate_poisson}
\alias{simulate_inverse_gaussian}
\alias{simulate_negative_binomial}
\alias{simulate_tweedie}
\title{Create ideal data for a generalized linear model}
\usage{
simulate_gaussian(N = 10000, link = "identity", weights = 1:3,
  unrelated = 0, ancillary = 1)

simulate_binomial(N = 10000, link = "logit", weights = c(0.1, 0.2),
  unrelated = 0)

simulate_gamma(N = 10000, link = "inverse", weights = 1:3,
  unrelated = 0, ancillary = 0.05)

simulate_poisson(N = 10000, link = "log", weights = c(0.5, 1),
  unrelated = 0)

simulate_inverse_gaussian(N = 10000, link = "1/mu^2", weights = 1:3,
  unrelated = 0, ancillary = 0.3333)

simulate_negative_binomial(N = 10000, link = "log", weights = c(0.5,
  1), unrelated = 0, ancillary = 1)

simulate_tweedie(N = 10000, link = "log", weights = 0.02,
  unrelated = 0, ancillary = 1.15)
}
\arguments{
\item{N}{Sample size. (Default: 10000)}

\item{link}{Link function. See \code{\link[stats]{family}} for details.}

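\item{weights}{Betas in glm model. See details. The default depends on the function: 1:3 for simulate_gaussian, simulate_gamma and simulate_inverse_gaussian; c(0.1, 0.2) for simulate_binomial; c(0.5, 1) for simulate_poisson and simulate_negative_binomial; 0.02 for simulate_tweedie.}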

\item{unrelated}{Number of unrelated features to return. (Default: 0)}

\item{ancillary}{Ancillary parameter for continuous families and negative binomial. See details.}
}
\value{
A tibble with a response variable and predictors.
}
\description{
Create ideal data for a generalized linear model.
}
\details{
For many families, it is possible to pick weights that cause inverse link(X * weights) to be mathematically invalid.
For example, the log link for binomial regression defines P(Y=1) as exp(X * weights) which can be above one.
If this happens, the function will error with a helpful message.

The intercept in the underlying link(Y) = X * weights + intercept is always max(weights). In
simulate_gaussian(link = "inverse", weights = 1:3), the model is (1/Y) = 1*X1 + 2*X2 + 3*X3 + 3.


 Available links for each family:
 \itemize{
  \item gaussian: identity, log, inverse
  \item binomial: logit, probit, cauchit, loglog, cloglog, log, logc, identity
  \item gamma: inverse, identity, log
  \item poisson: log, identity, sqrt
  \item inverse gaussian: 1/mu^2, inverse, identity, log
  \item negative binomial: log, identity, sqrt
  \item tweedie: log, identity, sqrt, inverse
  }
 The default link is the first link listed for each family.


 Meaning of the ancillary parameter for each family:
 \itemize{
  \item gaussian: standard deviation
  \item binomial: N/A
  \item gamma: scale parameter
  \item poisson: N/A
  \item inverse gaussian: dispersion parameter
  \item negative binomial: theta
  \item tweedie: rho
  }
}
\examples{
# Packages used by the examples below.
library(GlmSimulatoR)
library(ggplot2)
library(MASS)

# Do glm and lm estimate the same weights? Yes
# Each example fixes the seed first so the simulated data are reproducible.
set.seed(1)
simdata <- simulate_gaussian()
linearModel <- lm(Y ~ X1 + X2 + X3, data = simdata)
glmModel <- glm(Y ~ X1 + X2 + X3, data = simdata, family = gaussian(link = "identity"))
summary(linearModel)
summary(glmModel)
# Remove this example's objects so the next example starts clean.
rm(linearModel, glmModel, simdata)

# If the effects are multiplicative instead of additive,
# will my response variable still be normal? Yes
set.seed(1)
simdata <- simulate_gaussian(N = 1000, link = "log", weights = c(.1, .2))

# Histogram of the simulated response to inspect its shape.
ggplot(simdata, aes(x = Y)) +
  geom_histogram(bins = 30)
rm(simdata)

# Is AIC lower for the correct link? For ten thousand data points, depends on seed!
set.seed(1)
simdata <- simulate_gaussian(N = 10000, link = "inverse", weights = 1)
# Fit the same one-predictor formula twice, changing only the link, then compare AIC.
glmCorrectLink <- glm(Y ~ X1, data = simdata, family = gaussian(link = "inverse"))
glmWrongLink <- glm(Y ~ X1, data = simdata, family = gaussian(link = "identity"))
summary(glmCorrectLink)$aic
summary(glmWrongLink)$aic
rm(simdata, glmCorrectLink, glmWrongLink)


# Does a stepwise search find the correct model for logistic regression? Yes
# 3 related variables. 3 unrelated variables.
set.seed(1)
simdata <- simulate_binomial(N = 10000, link = "logit", weights = c(.3, .4, .5), unrelated = 3)

# Search scope: from the intercept-only model up to the model with all six predictors.
scopeArg <- list(
  lower = Y ~ 1,
  upper = Y ~ X1 + X2 + X3 + Unrelated1 + Unrelated2 + Unrelated3
)

# Start the search from the intercept-only model (the lower bound of the scope).
startingModel <- glm(Y ~ 1, data = simdata, family = binomial(link = "logit"))
glmModel <- stepAIC(startingModel, scopeArg)
summary(glmModel)
rm(simdata, scopeArg, startingModel, glmModel)

# When the response is a gamma distribution, what does a scatter plot between X and Y look like?
set.seed(1)
simdata <- simulate_gamma(weights = 1)
ggplot(simdata, aes(x = X1, y = Y)) +
  geom_point()
rm(simdata)
}
