% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data-analyzer.R
\name{DataAnalyzer}
\alias{DataAnalyzer}
\title{Analyzes input text files and n-gram token files}
\description{
It provides a method that returns information about text files, such as
number of lines and number of words. It also provides a method that displays
bar plots of n-gram frequencies. Additionally, it provides a method for
searching for n-grams in an n-gram token file. This file is generated using
the TokenGenerator class.
}
\details{
It provides a method that returns text file information. The text
file information includes the total number of lines, the maximum, minimum
and mean line lengths, and the file size.

It also provides a method that generates a bar plot showing the most common
n-gram tokens.

Another method is provided which returns a list of n-grams that match the
given regular expression.
}
\examples{

## ------------------------------------------------
## Method `DataAnalyzer$plot_n_gram_stats`
## ------------------------------------------------

# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL value implies tempdir will
# be used.
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("n2.RDS")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The n-gram file name
nfn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(nfn, ve = ve)
# The top features plot is checked
df <- da$plot_n_gram_stats(opts = list(
    "type" = "top_features",
    "n" = 10,
    "save_to" = NULL,
    "dir" = ed
))
# N-gram statistics are displayed
print(df)
# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()

## ------------------------------------------------
## Method `DataAnalyzer$get_file_info`
## ------------------------------------------------

# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL implies tempdir will be used
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("test.txt")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The test file name
cfn <- paste0(ed, "/test.txt")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(ve = ve)
# The file info is fetched
fi <- da$get_file_info(cfn)
# The file information is printed
print(fi)

# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()

## ------------------------------------------------
## Method `DataAnalyzer$get_ngrams`
## ------------------------------------------------

# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL implies tempdir will be used
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("n2.RDS")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The n-gram file name
nfn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(nfn, ve = ve)
# Bi-grams starting with "and" are returned. Since "_*" matches zero or
# more underscores, the pattern is not restricted to the prefix "and_"
df <- da$get_ngrams(fn = nfn, c = 10, pre = "^and_*")
# The data frame is sorted by frequency
df <- df[order(df$freq, decreasing = TRUE),]
# The data frame is printed
print(df)

# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()
}
\section{Super class}{
\code{\link[wordpredictor:Base]{wordpredictor::Base}} -> \code{DataAnalyzer}
}
\section{Methods}{
\subsection{Public methods}{
\itemize{
\item \href{#method-new}{\code{DataAnalyzer$new()}}
\item \href{#method-plot_n_gram_stats}{\code{DataAnalyzer$plot_n_gram_stats()}}
\item \href{#method-get_file_info}{\code{DataAnalyzer$get_file_info()}}
\item \href{#method-get_ngrams}{\code{DataAnalyzer$get_ngrams()}}
\item \href{#method-clone}{\code{DataAnalyzer$clone()}}
}
}
\if{html}{
\out{<details open ><summary>Inherited methods</summary>}
\itemize{
}
\out{</details>}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-new"></a>}}
\if{latex}{\out{\hypertarget{method-new}{}}}
\subsection{Method \code{new()}}{
It initializes the current object. It is used to set the file name
and verbose options.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{DataAnalyzer$new(fn = NULL, ve = 0)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{fn}}{The path to the input file.}

\item{\code{ve}}{The level of detail in the information messages.}
}
\if{html}{\out{</div>}}
}
}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-plot_n_gram_stats"></a>}}
\if{latex}{\out{\hypertarget{method-plot_n_gram_stats}{}}}
\subsection{Method \code{plot_n_gram_stats()}}{
It allows generating two types of n-gram plots. It first reads n-gram
token frequencies from an input text file. The n-gram frequencies are
displayed in a bar plot.

The type of plot is specified by the type option. The type option
can have the values 'top_features' or 'coverage'. 'top_features'
displays the top n most occurring tokens along with their
frequencies. 'coverage' displays the number of words along with their
frequencies.

The plot stats are returned as a data frame.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{DataAnalyzer$plot_n_gram_stats(opts)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{opts}}{The options for analyzing the data.
\itemize{
\item \strong{type}. The type of plot to display. The options are:
'top_features', 'coverage'.
\item \strong{n}. For 'top_features', it is the number of top most occurring
tokens. For 'coverage' it is the first n frequencies.
\item \strong{save_to}. The graphics device to save the plot to.
NULL implies the plot is printed.
\item \strong{dir}. The output directory where the plot will be saved.
}}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
A data frame containing the stats.
}
\subsection{Examples}{
\if{html}{\out{<div class="r example copy">}}
\preformatted{# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL value implies tempdir will
# be used.
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("n2.RDS")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The n-gram file name
nfn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(nfn, ve = ve)
# The top features plot is checked
df <- da$plot_n_gram_stats(opts = list(
    "type" = "top_features",
    "n" = 10,
    "save_to" = NULL,
    "dir" = ed
))
# N-gram statistics are displayed
print(df)
# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()
}
\if{html}{\out{</div>}}

}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-get_file_info"></a>}}
\if{latex}{\out{\hypertarget{method-get_file_info}{}}}
\subsection{Method \code{get_file_info()}}{
It generates information about text files. It takes as input a file
or a directory containing text files. For each file it calculates the
total number of lines, maximum, minimum and mean line lengths and the
total file size. The file information is returned as a data frame.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{DataAnalyzer$get_file_info(res)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{res}}{The name of a directory or a file name.}
}
\if{html}{\out{</div>}}
}
\subsection{Returns}{
A data frame containing the text file statistics.
}
\subsection{Examples}{
\if{html}{\out{<div class="r example copy">}}
\preformatted{# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL implies tempdir will be used
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("test.txt")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The test file name
cfn <- paste0(ed, "/test.txt")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(ve = ve)
# The file info is fetched
fi <- da$get_file_info(cfn)
# The file information is printed
print(fi)

# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()
}
\if{html}{\out{</div>}}

}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-get_ngrams"></a>}}
\if{latex}{\out{\hypertarget{method-get_ngrams}{}}}
\subsection{Method \code{get_ngrams()}}{
It extracts a given number of n-grams and their frequencies from an
n-gram token file.

The \code{pre} parameter specifies the regular expression for matching
n-grams. If this parameter is not specified, then the given number of
n-grams is randomly chosen.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{DataAnalyzer$get_ngrams(fn, c = NULL, pre = NULL)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{fn}}{The n-gram file name.}

\item{\code{c}}{The number of n-grams to return.}

\item{\code{pre}}{The n-gram prefix, given as a regular expression.}
}
\if{html}{\out{</div>}}
}
\subsection{Examples}{
\if{html}{\out{<div class="r example copy">}}
\preformatted{# Start of environment setup code
# The level of detail in the information messages
ve <- 0
# The name of the folder that will contain all the files. It will be
# created in the current directory. NULL implies tempdir will be used
fn <- NULL
# The required files. They are default files that are part of the
# package
rf <- c("n2.RDS")
# An object of class EnvManager is created
em <- EnvManager$new(ve = ve, rp = "./")
# The required files are downloaded
ed <- em$setup_env(rf, fn)
# End of environment setup code

# The n-gram file name
nfn <- paste0(ed, "/n2.RDS")
# The DataAnalyzer object is created
da <- DataAnalyzer$new(nfn, ve = ve)
# Bi-grams starting with "and" are returned. Since "_*" matches zero or
# more underscores, the pattern is not restricted to the prefix "and_"
df <- da$get_ngrams(fn = nfn, c = 10, pre = "^and_*")
# The data frame is sorted by frequency
df <- df[order(df$freq, decreasing = TRUE),]
# The data frame is printed
print(df)

# The test environment is removed. Comment out the line below so that
# the files generated by the function can be viewed
em$td_env()
}
\if{html}{\out{</div>}}

}

}
\if{html}{\out{<hr>}}
\if{html}{\out{<a id="method-clone"></a>}}
\if{latex}{\out{\hypertarget{method-clone}{}}}
\subsection{Method \code{clone()}}{
The objects of this class are cloneable with this method.
\subsection{Usage}{
\if{html}{\out{<div class="r">}}\preformatted{DataAnalyzer$clone(deep = FALSE)}\if{html}{\out{</div>}}
}

\subsection{Arguments}{
\if{html}{\out{<div class="arguments">}}
\describe{
\item{\code{deep}}{Whether to make a deep clone.}
}
\if{html}{\out{</div>}}
}
}
}
