. * * END LICENSE * * @author Charles Bocage charles.bocage@sjsu.edu * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ * @copyright 2009 - 2015 * @filesource */
if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}
/**
 * Reads in constants used as enums used for storing web sites
 */
require_once BASE_DIR."/lib/crawl_constants.php";
/**
 * Contains the max_description_length for the summary
 */
require_once BASE_DIR."/lib/processors/page_processor.php";
/**
 * Contains function getTokenizer to get the object of the language specified.
 */
require_once BASE_DIR."/lib/phrase_parser.php";
/**
 * Contains the base methods for the summarizer.
 */
require_once BASE_DIR."/lib/summarizers/summarizer.php";
/**
 * Class which may be used by TextProcessors to get a summary for a text
 * document that may later be used for indexing.
 *
 * @author Charles Bocage charles.bocage@sjsu.edu
 * @package seek_quarry\library
 */
class ScrapeSummarizer extends Summarizer
{
    /**
     * This is a basic summarizer. It simply delegates to
     * {@link description()}.
     *
     * @param object $dom a document object to extract a description from.
     * @param string $dom_page original page string to extract description
     *      from
     *
     * @return string the summary
     */
    static function getBasicSummary($dom, $dom_page)
    {
        return self::description($dom, $dom_page);
    }
    /**
     * Returns descriptive text concerning a webpage based on its document
     * object.
     *
     * The text always starts with the content of every meta tag whose name
     * contains "description". What follows depends on the configured limit:
     * when PageProcessor::$max_description_len is more than twice
     * MAX_DESCRIPTION_LEN the crude, regex based body text is appended and
     * returned as is; otherwise text is collected from a fixed list of
     * elements, longest pieces first, and runs of whitespace are collapsed.
     *
     * @param object $dom a document object to extract a description from.
     * @param string $page original page string to extract description from
     * @return string a description of the page
     */
    static function description($dom, $page)
    {
        $xpath = new DOMXPath($dom);
        $metas = $xpath->evaluate("/html//meta");
        $description = "";
        //look for a meta tag with a description
        foreach($metas as $meta) {
            if(stristr($meta->getAttribute('name'), "description")) {
                $description .= " .. \n".$meta->getAttribute('content');
            }
        }
        if(PageProcessor::$max_description_len > 2 * MAX_DESCRIPTION_LEN) {
            /* if don't need to summarize much, take meta description
               from above code, then concatenate body of doc
               after stripping tags, return result
             */
            $description .= "\n".self::crudeDescription($page);
            return $description;
        }
        /* concatenate the contents of then additional dom elements up to
           the limit of description length. Choose tags in order of likely
           importance to this doc
         */
        $page_parts = array("/html//p[1]", "/html//div[1]", "/html//p[2]",
            "/html//div[2]", "/html//p[3]", "/html//div[3]", "/html//p[4]",
            "/html//div[4]", "/html//td", "/html//li", "/html//dt",
            "/html//dd", "/html//pre", "/html//a", "/html//article",
            "/html//section", "/html//cite");
        /* for these expressions only the first matching node is used;
           built once here rather than on every pass of the inner loop
         */
        $first_match_only = array("/html//p[1]", "/html//div[1]",
            "/html//div[2]", "/html//p[2]", "/html//p[3]", "/html//div[3]",
            "/html//div[4]", "/html//p[4]");
        /* No single piece may take more than half of the allowed length.
           The cap is forced to an integer because it is used both as an
           array key and as a substring length; a fractional value (odd
           limit) would be truncated implicitly, which PHP 8.1 deprecates,
           and would make $len disagree with the characters really kept.
         */
        $max_piece_len = (int)(PageProcessor::$max_description_len / 2);
        $para_data = array();
        $len = 0;
        foreach($page_parts as $part) {
            $doc_nodes = $xpath->evaluate($part);
            foreach($doc_nodes as $node) {
                if($part == "/html//a") {
                    /* for links the target url is stored as a piece of its
                       own; its length is not added to $len
                     */
                    $content = $node->getAttribute('href')." = ";
                    $add_len = min($max_piece_len, mb_strlen($content));
                    $para_data[$add_len][] = mb_substr($content, 0, $add_len);
                }
                $node_text = self::domNodeToString($node);
                $add_len = min($max_piece_len, mb_strlen($node_text));
                $para_data[$add_len][] = mb_substr($node_text, 0, $add_len);
                $len += $add_len;
                if($len > PageProcessor::$max_description_len) {
                    break 2;
                }
                if(in_array($part, $first_match_only, true)) {
                    break;
                }
            }
        }
        // pieces are grouped by length; emit the longest groups first
        krsort($para_data);
        $first_len = null;
        foreach($para_data as $add_len => $data) {
            if($first_len === null) {
                $first_len = $add_len;
            }
            foreach($data as $datum) {
                $description .= " .. ". $datum;
            }
            /* stop after the first group that is less than a third as long
               as the longest one (that group itself is still included)
             */
            if($first_len > 3 * $add_len) {
                break;
            }
        }
        $description = preg_replace("/(\s)+/u", " ", $description);
        return $description;
    }
    /**
     * Returns summary of body of a web page based on crude regex matching
     * used as a fall back if dom parsing did not work.
*
     * Whitespace runs in the extracted text are collapsed to single spaces
     * and the result is cut to PageProcessor::$max_description_len
     * characters, the same limit description() works with.
     *
     * @param string $page to extract description from
     * @return string a description of the page
     */
    static function crudeDescription($page)
    {
        /* NOTE(review): getBetweenTags is inherited and not visible from
           here; confirm that an empty third argument and a string return
           value are what is intended.
         */
        $body = parent::getBetweenTags($page, 0, "");
        if($body == "") {
            return $body;
        }
        $body = preg_replace("/\s+/", " ", $body);
        /* The limit is read from PageProcessor, as everywhere else in this
           class, rather than from self::$max_description_len.
           NOTE(review): assumes Summarizer does not declare its own
           $max_description_len -- confirm.
         */
        return mb_substr($body, 0, PageProcessor::$max_description_len);
    }
    /**
     * This returns the text content of a node but with spaces
     * where tags were (unlike just using textContent)
     *
     * @param object $node a DOMNode
     * @return string its text content with spaces
     */
    static function domNodeToString($node)
    {
        $text = $node->ownerDocument->saveHTML($node);
        $text = html_entity_decode($text);
        $text = preg_replace('/\