% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/create_vectorstore.R
\name{embed_openai}
\alias{embed_openai}
\alias{create_vectorstore}
\alias{insert_vectors}
\alias{build_vector_index}
\alias{search_vectors}
\title{Embed text with OpenAI}
\usage{
embed_openai(
  x,
  model = "text-embedding-ada-002",
  base_url = "https://api.openai.com/v1",
  api_key = Sys.getenv("OPENAI_API_KEY"),
  batch_size = 20L,
  embedding_dim = 1536
)

create_vectorstore(
  db_path = ":memory:",
  overwrite = FALSE,
  embedding_dim = 1536,
  load_vss = identical(Sys.getenv("_R_CHECK_PACKAGE_NAME_"), "")
)

insert_vectors(
  con,
  df,
  embed_fun = embed_openai(),
  chunk_chars = 12000,
  embedding_dim = 1536
)

build_vector_index(store, type = c("vss", "fts"))

search_vectors(
  con,
  query_text,
  top_k = 5,
  embed_fun = embed_openai(),
  embedding_dim = 1536
)
}
\arguments{
\item{x}{Character vector of texts, or a data frame with a \code{page_content} column.}

\item{model}{OpenAI embedding model name.}

\item{base_url}{Base URL for an OpenAI-compatible API.}

\item{api_key}{API key; defaults to \code{Sys.getenv("OPENAI_API_KEY")}.}

\item{batch_size}{Batch size for embedding requests.}

\item{embedding_dim}{Integer; the dimensionality of the vector embeddings to store.}

\item{db_path}{Path to the DuckDB file. Use \code{":memory:"} to create an in-memory database.}

\item{overwrite}{Logical; if \code{TRUE}, deletes any existing DuckDB file or table.}

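\item{load_vss}{Logical; whether to load the experimental \code{vss} extension.
This defaults to \code{TRUE}, except when the \code{_R_CHECK_PACKAGE_NAME_}
environment variable is set (as it is during \code{R CMD check}), in which
case the default is \code{FALSE}.}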

\item{con}{Active DuckDB DBI connection.}

\item{df}{Data frame containing \code{page_content} (or \code{content}) text.}

\item{embed_fun}{Function used to convert text into numeric embeddings.}

\item{chunk_chars}{Approximate max chunk size in bytes before splitting.}

\item{store}{Active DuckDB DBI connection or vector-store handle.}

\item{type}{Index types to build: \code{"vss"}, \code{"fts"}, or both.}

\item{query_text}{Query text to embed and search.}

\item{top_k}{Number of nearest matches to return.}
}
\value{
\code{embed_openai()}: for character input, a numeric matrix of embeddings.
  For data-frame input, the same data frame with an added \code{embedding}
  column. If \code{x} is missing, a configured embedding function is returned.

\code{create_vectorstore()}: a live DuckDB connection object. Be sure to
manually disconnect with:
\code{DBI::dbDisconnect(con, shutdown = TRUE)}
}
\description{
\code{embed_openai()} is a helper for vector-store pipelines. If called
without \code{x}, it returns a closure that can be passed directly to
\code{insert_vectors(embed_fun = ...)}.

\code{create_vectorstore()} initializes a DuckDB database connection for
storing embedded documents, with optional support for the experimental
\code{vss} extension.

\code{insert_vectors()} chunks long text rows, generates embeddings when
needed, and inserts \code{(page_content, embedding)} rows into the
\code{vectors} table.

\code{build_vector_index()} builds HNSW (\code{vss}) and/or full-text
(\code{fts}) indexes on the \code{vectors} table.

\code{search_vectors()} embeds \code{query_text}, computes vector distance
against stored embeddings, and returns the nearest matches.
}
\details{
These functions are part of the vector-store utilities for:
\itemize{
  \item Embedding text via the OpenAI API
  \item Storing and chunking documents in DuckDB
  \item Building \code{HNSW} and \code{FTS} indexes
  \item Running nearest-neighbour search over vector embeddings
}

Core helpers like \code{embed_openai()}, \code{insert_vectors()},
\code{build_vector_index()}, and \code{search_vectors()} are also exported
to support composable workflows.
}
\examples{
\dontrun{
# Create vector store
con <- create_vectorstore("tests/testthat/test-data/my_vectors.duckdb", overwrite = TRUE)

# Assume response is output from fetch_data()
docs <- data.frame(head(response))

# Insert documents with embeddings
insert_vectors(
  con = con,
  df = docs,
  embed_fun = embed_openai(),
  chunk_chars = 12000
)

# Build vector + FTS indexes
build_vector_index(con, type = c("vss", "fts"))

# Perform vector search
response <- search_vectors(con, query_text = "Tell me about R?", top_k = 5)
}

}
