% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/LinkExtractor.R
\name{LinkExtractor}
\alias{LinkExtractor}
\title{LinkExtractor}
\usage{
LinkExtractor(url, id, lev, IndexErrPages, Useragent, Timeout = 6,
  URLlenlimit = 255, urlExtfilter, encod, urlbotfiler, removeparams,
  ExternalLInks = FALSE)
}
\arguments{
\item{url}{character, url to fetch and extract links.}

\item{id}{numeric, an id to identify a specific web page in a website collection, it's auto-generated by default}

\item{lev}{numeric, the depth level of the web page, auto-generated by the \code{Rcrawler} function.}

\item{IndexErrPages}{character vector, the HTTP status codes of the pages to process; by default it's \code{c(200)}. E.g., to include 404 and 403 pages use \code{c(404,403)}.}

\item{Useragent}{character, the user agent string to use for the request, defaults to "Rcrawler"}

\item{Timeout}{numeric, the request timeout in seconds, defaults to 6}

\item{URLlenlimit}{integer, the url character length limit to index, defaults to 255 characters (to avoid spider traps)}

\item{urlExtfilter}{character vector, the list of file extensions to exclude from indexing; by default a large list is defined (only html pages are permitted) in order to prevent downloading large files. To define your own, use \code{c(ext1,ext2,ext3 ...)}}

\item{encod}{character, specify the encoding of the web page}

\item{urlbotfiler}{character vector, directories/files restricted by robots.txt}

\item{removeparams}{character vector, list of url parameters to be removed/ignored}

\item{ExternalLInks}{boolean, default FALSE; if set to TRUE, external links are also returned.}
}
\value{
A list of three elements: the first is a list containing the web page details (url, encoding-type, content-type, content ... etc), the second is a character vector containing the list of retrieved urls, and the third is a vector of external urls scraped from the page.
}
\description{
A function that takes a character url as input, fetches its html document, and extracts all links following a set of rules.
}
\examples{

pageinfo<-LinkExtractor(url="http://www.glofile.com", ExternalLInks = TRUE)
#pageinfo holds the page header details, as well as the content and the internal links.
#pageinfo[[1]][[10]] : page content
#pageinfo[[2]] : Internal hyperlinks
#pageinfo[[3]] : External hyperlinks

}
\author{
Salim Khalil
}
