.
*
* END LICENSE
*
* @author Chris Pollett chris@pollett.org
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009 - 2015
* @filesource
*/
// Bail out unless BASE_DIR has been defined by the including script
if (!defined('BASE_DIR')) {
    echo "BAD REQUEST";
    exit();
}
/** Register File Types We Handle*/
$add_extensions = array(
    "asp", "aspx", "cgi", "cfm", "cfml", "do", "htm",
    "html", "jsp", "php", "pl", "py", "shtml"
);
// Start from an empty list if no file types have been registered yet
$INDEXED_FILE_TYPES = array_merge(
    isset($INDEXED_FILE_TYPES) ? $INDEXED_FILE_TYPES : array(),
    $add_extensions
);
/** Content types whose pages are handled by HtmlProcessor */
$PAGE_PROCESSORS["text/html"] = "HtmlProcessor";
$PAGE_PROCESSORS["text/asp"] = "HtmlProcessor";
$PAGE_PROCESSORS["application/xhtml+xml"] = "HtmlProcessor";
/**
* Load base class, if needed.
*/
require_once BASE_DIR."/lib/processors/text_processor.php";
/**
* Load so can parse urls
*/
require_once BASE_DIR."/lib/url_parser.php";
/**
* Get the centroid summary
*/
require_once BASE_DIR."/lib/summarizers/centroid_summarizer.php";
/**
* Get the graph based summary
*/
require_once BASE_DIR."/lib/summarizers/graph_based_summarizer.php";
/**
* Get the scrape summarizer (used for the basic summary)
*/
require_once BASE_DIR."/lib/summarizers/scrape_summarizer.php";
/**
* For guessing language from charset
*/
require_once BASE_DIR."/lib/locale_functions.php";
/**
* Used to create crawl summary information
* for HTML files
*
* @author Chris Pollett
* @package seek_quarry\library\processor
*/
class HtmlProcessor extends TextProcessor
{
/**
* Maximum number of characters in a title
*/
const MAX_TITLE_LEN = 100;
/**
 * Used to extract the title, description and links from
 * a string consisting of webpage data.
 *
 * A few entities are replaced by spaces and script and style elements
 * are stripped before a DOM is built. If no DOM can be built the page
 * is handed to the parent (text) processor instead.
 *
 * @param string $page web-page contents
 * @param string $url the url where the page contents came from,
 *     used to canonicalize relative links
 *
 * @return array a summary of the contents of the page; null if $page
 *     is not a string
 *
 */
function process($page, $url)
{
    if (!is_string($page)) {
        return null;
    }
    $is_centroid = $this->summarizer_option == self::CENTROID_SUMMARIZER;
    $is_graph_based = $this->summarizer_option ==
        self::GRAPH_BASED_SUMMARIZER;
    /* preg_replace returns null on a PCRE error (e.g., backtrack limit on
       a large page); in that case keep the text we already have rather
       than discarding the whole page
     */
    $cleaned = preg_replace('/\&nbsp\;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
        ' ', $page);
    if (is_string($cleaned)) {
        $page = $cleaned;
    }
    $cleaned = preg_replace('@<script[^>]*?>.*?</script>@si', ' ', $page);
    if (is_string($cleaned)) {
        $page = $cleaned;
    }
    // styles are only removed from the copy used to build the DOM
    $dom_page = preg_replace('@<style[^>]*?>.*?</style>@si', ' ', $page);
    if (!is_string($dom_page)) {
        $dom_page = $page;
    }
    $dom = self::dom($dom_page);
    if ($dom === false) {
        // could not build a DOM, so fall back to the parent processor
        return parent::process($page, $url);
    }
    $summary = array();
    $summary[self::ROBOT_METAS] = self::getMetaRobots($dom);
    $summary[self::TITLE] = self::title($dom);
    if ($summary[self::TITLE] == "") {
        $summary[self::TITLE] = self::crudeTitle($dom_page);
    }
    $summary[self::LANG] = self::lang($dom,
        $summary[self::TITLE], $url);
    if ($is_centroid) {
        $summary_cloud = CentroidSummarizer::getCentroidSummary(
            $dom_page, $summary[self::LANG]);
        $summary[self::DESCRIPTION] = $summary_cloud[0];
        $summary[self::WORD_CLOUD] = $summary_cloud[1];
        crawlLog("..Using Centroid Summarizer");
    } elseif ($is_graph_based) {
        $summary[self::DESCRIPTION] =
            GraphBasedSummarizer::getGraphBasedSummary($dom_page,
            $summary[self::LANG]);
        crawlLog("..Using Graph Based Summarizer");
    } else {
        $summary[self::DESCRIPTION] =
            ScrapeSummarizer::getBasicSummary($dom, $dom_page);
        crawlLog("..Using Basic Summarizer");
    }
    $crude = false;
    if (trim($summary[self::DESCRIPTION]) == "") {
        $summary[self::DESCRIPTION] = self::crudeDescription(
            $dom_page);
        crawlLog("..No text extracted. ".
            "Invoked crude description fallback.");
        $crude = true;
    }
    $summary[self::LINKS] = self::links($dom, $url);
    if ($summary[self::LINKS] == array()) {
        // no links found via the DOM, so scan the raw text for urls
        $summary[self::LINKS] = parent::extractHttpHttpsUrls(
            $page);
    }
    $location = self::location($dom, $url);
    if ($location) {
        $summary[self::LINKS][$location] = "location:".$url;
        $summary[self::LOCATION] = true;
        $summary[self::DESCRIPTION] .= $url." => ".$location;
        if (!$summary[self::TITLE]) {
            $summary[self::TITLE] = $url;
        }
    }
    if (!$crude && !$location) {
        /* only consult rel canonical when real text was extracted and
           no location was found above; a canonical url replaces all
           other links
         */
        $location = self::relCanonical($dom, $url);
        if ($location) {
            $summary[self::LINKS] = array();
            $summary[self::LINKS][$location] = "location:".$url;
            $summary[self::LOCATION] = true;
            if (!$summary[self::DESCRIPTION]) {
                $summary[self::DESCRIPTION] .= $url." => ".$location;
            }
            if (!$summary[self::TITLE]) {
                $summary[self::TITLE] = $url;
            }
        }
    }
    $summary[self::PAGE] = $page;
    if (strlen($summary[self::DESCRIPTION] . $summary[self::TITLE])
        == 0 && count($summary[self::LINKS]) == 0 && !$location) {
        /* maybe not html? treat as text with messed up tags
           still try to get urls
         */
        $summary_text = parent::process(strip_tags($page), $url);
        foreach ($summary as $field => $value) {
            if (($value == "" || $value == array()) &&
                isset($summary_text[$field])) {
                $summary[$field] = $summary_text[$field];
            }
        }
    }
    return $summary;
}
/**
* Return a document object based on a string containing the contents of
* a web page
*
* @param string $page a web page
*
* @return object document object
*/
static function dom($page)
{
/*
first do a crude check to see if we have at least an tag
otherwise try to make a simplified html document from what we got
*/
if (!stristr($page, "";
$head = strip_tags($page, $head_tags);
$body_tags = "