.
*
* END LICENSE
*
* @author Charles Bocage charles.bocage@gmail.com
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009 - 2015
* @filesource
*/
// Refuse to run when BASE_DIR is missing; the includes below are built from it
if (!defined('BASE_DIR')) {
    echo "BAD REQUEST";
    exit();
}
/**
* Reads in constants used as enums used for storing web sites
*/
require_once BASE_DIR."/lib/crawl_constants.php";
/**
* Provides PageProcessor::$max_description_len, the length limit for the summary
*/
require_once BASE_DIR."/lib/processors/page_processor.php";
/**
* Contains function getTokenizer to get the object of the language specified.
*/
require_once BASE_DIR."/lib/phrase_parser.php";
/**
* Class which may be used by TextProcessors to get a summary for a text
* document that may later be used for indexing.
*
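* Sentences are compared pairwise with a distortion measure, ranked by
* repeatedly multiplying a rank vector by the resulting adjacency matrix,
* and the highest ranked sentences are joined to form the summary.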
*
* @author Charles Bocage charles.bocage@sjsu.edu
* @package seek_quarry\library
*/
class GraphBasedSummarizer
{
/**
 * Maximum number of distinct terms kept by getTerms; terms are sorted by
 * descending count before the list is cut to this size
 */
const MAX_DISTINCT_TERMS = 1000;
/**
 * Builds a summary of a document by ranking its sentences with a graph
 * based method and joining the best ranked ones.
 *
 * The page is cleaned with pageProcessing and split into sentences. A
 * working copy of the sentences has stop words and punctuation removed and
 * is stemmed; that copy is used to compute term frequencies, the sentence
 * adjacency matrix and the sentence ranks. The summary itself is assembled
 * from the unmodified sentences.
 *
 * @param string $doc complete raw page to generate the summary from.
 * @param string $lang language of the page; passed on to stop word removal,
 *      stemming, term extraction and the adjacency computation
 *
 * @return string the summary text assembled by getSummary
 */
static function getGraphBasedSummary($doc, $lang)
{
    $doc = self::pageProcessing($doc);
    // Split only once: this untouched copy is what ends up in the summary,
    // the normalised copy below is only used for scoring
    $sentences_with_punctuation = self::getSentences($doc . " ", true);
    $sentences = self::removeStopWords($sentences_with_punctuation, $lang);
    $sentences = self::removePunctuation($sentences);
    $sentences = PhraseParser::stemTermsK($sentences, $lang, true);
    $terms = self::getTerms($sentences, $lang);
    $term_frequencies = self::getTermFrequencies($terms, $sentences);
    $term_frequencies_normalized =
        self::normalizeTermFrequencies($term_frequencies);
    $adjacency = self::computeAdjacency($term_frequencies_normalized,
        $sentences, $lang);
    $p = self::getSentenceRanks($adjacency);
    return self::getSummary($sentences_with_punctuation, $p);
}
/**
 * Assembles the summary by taking sentences in order of decreasing rank
 * until the next one would no longer fit.
 *
 * Lengths are measured with strlen and compared against
 * PageProcessor::$max_description_len. Sentences are joined with a single
 * space, which is counted towards the limit. The loop stops at the first
 * sentence that does not fit, so lower ranked sentences are not tried.
 *
 * @param array $sentences the sentences in the doc
 * @param array $p the sentence ranks, indexed like $sentences
 * @return string the selected sentences joined by spaces, possibly empty
 */
static function getSummary($sentences, $p)
{
    $result = "";
    $result_length = 0;
    $n = count($p);
    for ($i = 0; $i < $n; $i++) {
        $index = self::findLargestIndex($p);
        // -1 marks this rank as used so the next pass finds the next best
        $p[$index] = -1;
        if (!isset($sentences[$index])) {
            // a rank without a matching sentence contributes nothing
            continue;
        }
        $sentence = $sentences[$index];
        $separator = ($result === "") ? "" : " ";
        $added_length = strlen($separator) + strlen($sentence);
        if ($result_length + $added_length >
            PageProcessor::$max_description_len) {
            break;
        }
        $result .= $separator . $sentence;
        $result_length += $added_length;
    }
    return $result;
}
/**
 * Finds the position of the largest value in an array.
 *
 * @param array $v the array to search for the largest value
 * @return int|string the key of the largest value; on ties the first such
 *      key, and 0 when the array is empty
 */
static function findLargestIndex($v)
{
    $result = 0;
    $largest = 0;
    $found = false;
    foreach ($v as $index => $value) {
        // seed with the first element rather than a fixed -1 so arrays
        // whose values are all <= -1 are handled as well
        if (!$found || $value > $largest) {
            $largest = $value;
            $result = $index;
            $found = true;
        }
    }
    return $result;
}
/**
 * Computes sentence ranks by power iteration, in the manner of PageRank:
 * starting from a uniform vector, the vector is multiplied by the
 * adjacency matrix a fixed number of times. The vector is not renormalised
 * between iterations.
 *
 * @param array $adjacency the square adjacency matrix generated for the
 *      sentences
 * @param int $iterations number of matrix-vector multiplications to perform
 * @return array the sentence ranks, one entry per row of $adjacency
 */
static function getSentenceRanks($adjacency, $iterations = 10)
{
    $n = count($adjacency);
    if ($n == 0) {
        return array();
    }
    $p = array_fill(0, $n, 1 / $n);
    for ($i = 0; $i < $iterations; $i++) {
        $p = self::multiplyMatrixVector($adjacency, $p);
    }
    return $p;
}
/**
 * Computes the sum of the squared entry-wise differences of two vectors.
 *
 * @param array $v the minuend vector; its length decides how many
 *      positions are compared
 * @param array $w the subtrahend vector, indexed like $v
 * @return int|float the sum over all positions of ($v[i] - $w[i]) squared
 */
static function squareDiff($v, $w)
{
    $total = 0;
    $length = count($v);
    $position = 0;
    while ($position < $length) {
        $delta = $v[$position] - $w[$position];
        $total += $delta * $delta;
        $position++;
    }
    return $total;
}
/**
 * Multiplies a square matrix by a vector.
 *
 * @param array $mat the matrix, indexed as $mat[row][column]
 * @param array $vec the vector; its length is used as the matrix dimension
 * @return array the product vector, with one entry per entry of $vec
 */
static function multiplyMatrixVector($mat, $vec)
{
    $size = count($vec);
    $product = array();
    for ($row = 0; $row < $size; $row++) {
        // dot product of the current matrix row with the vector
        $dot = 0;
        for ($col = 0; $col < $size; $col++) {
            $dot += $mat[$row][$col] * $vec[$col];
        }
        $product[$row] = $dot;
    }
    return $product;
}
/**
 * Computes the sentence adjacency matrix from the pairwise distortion
 * measure. The matrix is symmetric and has zeros on its diagonal.
 *
 * @param array $term_frequencies_normalized the array of term frequencies
 * @param array $sentences the sentences in the doc
 * @param string $lang locale tag for stemming
 * @return array matrix with one row and one column per sentence; an empty
 *      array when there are no sentences
 */
static function computeAdjacency($term_frequencies_normalized, $sentences,
    $lang)
{
    // start from an empty matrix: seeding it with array(array()) would
    // yield a one-row matrix even when there are no sentences at all
    $result = array();
    $n = count($sentences);
    for ($i = 0; $i < $n; $i++) {
        $result[$i][$i] = 0;
        for ($j = $i + 1; $j < $n; $j++) {
            $distortion = self::findDistortion($sentences[$i],
                $sentences[$j], $term_frequencies_normalized, $lang);
            $result[$i][$j] = $distortion;
            $result[$j][$i] = $distortion;
        }
    }
    return $result;
}
/**
 * Replaces every run of characters that are not letters, combining marks
 * or digits with a single space and trims the result.
 *
 * Unicode character properties are used so that letters outside of ASCII
 * survive; an ASCII-only class would wipe out non-English text.
 *
 * @param array $sentences the sentences in the doc
 * @return array the array of sentences with the punctuation removed
 */
static function removePunctuation($sentences)
{
    foreach ($sentences as $key => $sentence) {
        // preg_replace yields null on a PCRE error; treat it as empty text
        $sentences[$key] = trim((string) preg_replace(
            '/[^\p{L}\p{M}\p{N}]+/u', ' ', $sentence));
    }
    return $sentences;
}
/**
 * Removes the stop words from each sentence using the tokenizer of the
 * given language. Each sentence is first passed through formatDoc. When no
 * tokenizer is available, or it has no stopwordsRemover method, the
 * sentences are returned untouched.
 *
 * @param array $sentences the sentences in the doc
 * @param string $lang locale tag used to look up the tokenizer
 * @return array the array of sentences with the stop words removed
 */
static function removeStopWords($sentences, $lang)
{
    $tokenizer = PhraseParser::getTokenizer($lang);
    if (!$tokenizer || !method_exists($tokenizer, "stopwordsRemover")) {
        return $sentences;
    }
    $total = count($sentences);
    for ($k = 0; $k < $total; $k++) {
        $formatted = self::formatDoc($sentences[$k]);
        $sentences[$k] = $tokenizer->stopwordsRemover($formatted);
    }
    return $sentences;
}
/**
 * Counts, for every term, the number of sentences that contain it. A term
 * counts as contained when it occurs anywhere in the sentence as a
 * substring.
 *
 * @param array $terms the list of all terms in the doc
 * @param array $sentences the sentences in the doc
 * @return array associative array where the term is the key and the number
 *      of sentences containing it is the value; empty when there are no
 *      terms
 */
static function getTermFrequencies($terms, $sentences)
{
    // always defined, so an empty term list yields an empty array
    $term_frequencies = array();
    foreach ($terms as $term) {
        $count = 0;
        foreach ($sentences as $sentence) {
            if (strpos($sentence, $term) !== false) {
                $count++;
            }
        }
        $term_frequencies[$term] = $count;
    }
    return $term_frequencies;
}
/**
 * Collects the distinct terms of an array of sentences, most frequent
 * first, limited to MAX_DISTINCT_TERMS entries.
 *
 * @param array $sentences the sentences in the doc
 * @param string $lang locale tag passed to the term segmenter
 * @return array list of terms as strings; array("", "") when no term was
 *      found
 */
static function getTerms($sentences, $lang)
{
    $terms = array();
    foreach ($sentences as $sentence) {
        $terms = array_merge($terms,
            PhraseParser::segmentSegment($sentence, $lang));
    }
    // array_filter without a callback drops every falsy entry
    $terms = array_filter($terms);
    $terms_counts = array_count_values($terms);
    arsort($terms_counts);
    // numeric terms become integer keys in $terms_counts; without
    // preserve_keys array_slice would renumber them and lose the terms
    $terms_counts = array_slice($terms_counts, 0, self::MAX_DISTINCT_TERMS,
        true);
    // hand terms back as strings, so numeric ones are never used as ints
    $terms = array_map('strval', array_keys($terms_counts));
    if (count($terms) == 0) {
        return array("", "");
    }
    return $terms;
}
/**
* Breaks any content into sentences by splitting it on spaces or carriage
* returns
* @param string $content complete page.
* @param boolean $keep_punctuation whether to keep the punctuation or not.
* @return array array of sentences from that content.
*/
static function getSentences($content, $keep_punctuation)
{
if ($keep_punctuation) {
$sentences =
preg_split('/(? $v) {
$sum_of_squares += ($v * $v);
}
$square_root = sqrt($sum_of_squares);
foreach ($term_frequencies as $k => $v) {
$result[$k] = ($v / $square_root);
}
foreach ($result as $k => $v) {
$result_sum += $v;
}
return $result;
}
/**
 * Calculates the distortion measure between two sentences.
 * 1. Check each word in sentence1 to see if it exists in sentence2.
 *    If the word X of sentence1 does not exist in sentence2,
 *    square the score of word X, add it to the sum and increase the
 *    number of not-common words by one.
 * 2. In case the word X is common to sentence1 and sentence2, take its
 *    normalised frequency computed for sentence2 alone, subtract it from
 *    the score of word X, then square the difference and add it to the sum.
 * 3. Then check sentence2 for words that are not in sentence1; for each
 *    such word Y, square the score of word Y, add it to the sum and
 *    increase the number of not-common words by one.
 * 4. At the end, divide the sum by the number of not-common words.
 *
 * Words are matched as whole, space-delimited words. Both sentences are
 * padded with a space before matching so that their first and last words
 * can be found too. A word without an entry in $term_frequencies has
 * score 0.
 *
 * @param string $first_sentence the first sentence to compare
 * @param string $second_sentence the second sentence to compare
 * @param array $term_frequencies the normalised term frequencies of the
 *      document, keyed by term
 * @param string $lang locale tag passed on to getTerms
 * @return int|float the distortion, or 0 when the sentences have no
 *      not-common words
 */
static function findDistortion($first_sentence, $second_sentence,
    $term_frequencies, $lang)
{
    $first_words = preg_split('/ +/u', $first_sentence) ?: array();
    $second_words = preg_split('/ +/u', $second_sentence) ?: array();
    // pad with spaces so that words at either end are delimited as well
    $first_padded = " " . $first_sentence . " ";
    $second_padded = " " . $second_sentence . " ";
    // frequencies of the second sentence are only computed when needed,
    // and then only once instead of once per common word
    $second_frequencies = array();
    $second_frequencies_ready = false;
    $sum = 0;
    $non_common_words = 0;
    foreach ($first_words as $raw_word) {
        $word = trim($raw_word);
        if ($word === "") {
            continue;
        }
        $score = isset($term_frequencies[$word]) ?
            $term_frequencies[$word] : 0;
        if (strpos($second_padded, " " . $word . " ") === false) {
            $sum += $score * $score;
            $non_common_words++;
            continue;
        }
        if (!$second_frequencies_ready) {
            $terms = self::getTerms(array($second_sentence), $lang);
            $second_frequencies = self::normalizeTermFrequencies(
                self::getTermFrequencies($terms, array($second_sentence)));
            $second_frequencies_ready = true;
        }
        $local_score = isset($second_frequencies[$word]) ?
            $second_frequencies[$word] : 0;
        $difference = $score - $local_score;
        $sum += $difference * $difference;
    }
    foreach ($second_words as $raw_word) {
        $word = trim($raw_word);
        if ($word === "") {
            continue;
        }
        if (strpos($first_padded, " " . $word . " ") === false) {
            $score = isset($term_frequencies[$word]) ?
                $term_frequencies[$word] : 0;
            $sum += $score * $score;
            $non_common_words++;
        }
    }
    if ($non_common_words != 0) {
        return $sum / $non_common_words;
    }
    return 0;
}
/**
 * Lower cases a sentence, replaces every run of characters that are not
 * letters, digits or whitespace with a single space and trims the result.
 *
 * @param string $sent the sentence to format.
 * @return string the formatted sentence.
 */
static function formatSentence($sent)
{
    $lowered = mb_strtolower($sent);
    $stripped = preg_replace('/[^\p{L}\p{N}\s]+/u', ' ', $lowered);
    return trim($stripped);
}
/**
 * Lower cases the content and then applies three substitutions in order,
 * each replacing a match with a single space: runs of line breaks and
 * hyphens, runs of characters that are not letters, whitespace or periods
 * (this drops digits), and finally runs of periods.
 *
 * @param string $content the text to format.
 * @return string the formatted text.
 */
static function formatDoc($content)
{
    $lowered = mb_strtolower($content);
    $patterns = array(
        '/[\n\r\-]+/',
        '/[^\p{L}\s\.]+/u',
        '/[\.]+/u'
    );
    return preg_replace($patterns, ' ', $lowered);
}
/**
* This function does an additional processing on the page
* such as removing all the tags from the page
* @param string $page complete page.
* @return string processed page.
*/
static function pageProcessing($page)
{
$substitutions = array('@@si',
'/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
'@@si', '/[\^\(\)]/',
'/\[(.*?)\]/', '/\t\n/'
);
$page = preg_replace($substitutions, ' ', $page);
$page = preg_replace('/\s{2,}/u', ' ', $page);
$new_page = preg_replace("/\
/u", "\n", $page);
$changed = false;
if($new_page != $page) {
$changed = true;
$page = $new_page;
}
$page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
"p|address|section)\s*\>/u", "\n\n", $page);
$page = preg_replace("/\