.
*
* END LICENSE
*
* @author Chris Pollett (chris@pollett.org)
* @license http://www.gnu.org/licenses/ GPL3
* @link http://www.seekquarry.com/
* @copyright 2009 - 2015
* @filesource
*/
namespace seekquarry\yioop\library\summarizers;
use seekquarry\yioop\configs as C;
use seekquarry\yioop\library as L;
use seekquarry\yioop\library\CrawlConstants;
use seekquarry\yioop\library\PhraseParser;
use seekquarry\yioop\library\processors\PageProcessor;
/**
 * Class which may be used by the processors to get a summary for a text
 * document that may later be used for indexing. Generates a summary based
 * on the Lanczos algorithm.
 * @author Charles Bocage (charles.bocage@sjsu.edu)
 */
class LanczosSummarizer extends Summarizer
{
/**
* Number of bytes in a sentence before it is considered long
* We use strlen rather than mbstrlen. This might actually be
* a better metric of the potential of a sentence to have info.
*/
const LONG_SENTENCE_LEN = 50;
/**
* Number of sentences in a document before only consider longer
* sentences in centroid
*/
const LONG_SENTENCE_THRESHOLD = 100;
/**
* Number of distinct terms to use in generating summary
*/
const MAX_DISTINCT_TERMS = 1000;
/**
* Number of words in word cloud
*/
const WORD_CLOUD_LEN = 5;
/**
* Number of nonzero centroid components
*/
const CENTROID_COMPONENTS = 50;
/**
* whether to output the results to the disk or not
*/
const OUTPUT_TO_FILE = false;
/**
* The full disk location to save the result to
*/
const OUTPUT_FILE_PATH = "/temp/centroid_weighted_summarizer_result.txt";
/**
* Generate a summary based on it closeness to the average sentence.
* It also weights sentences based on the CMS that produced it.
* @param string $doc complete raw page to generate the summary from.
* @param string $lang language of the page to decide which stop words to
* call proper tokenizer.php of the specified language.
*
* @return array array of summary and word cloud
*/
public static function getLanczosSummary($doc, $lang)
{
$raw_doc = $doc;
$doc = self::pageProcessing($doc);
/* Format the document to remove characters other than periods and
alphanumerics.
*/
$formatted_doc = self::formatDoc($doc);
$stop_obj = PhraseParser::getTokenizer($lang);
/* Splitting into sentences */
$out_sentences = self::getSentences($doc);
$sentences = self::removeStopWords($out_sentences, $stop_obj);
$sentence_array = self::splitSentences($sentences, $lang, $raw_doc);
$terms = $sentence_array[0];
$tf_per_sentence = $sentence_array[1];
$tf_per_sentence_normalized = $sentence_array[2];
$tf_average_sentence =
self::getAverageSentence($tf_per_sentence_normalized);
$tf_dot_product_per_sentence =
self::getDotProduct($tf_per_sentence_normalized,
$tf_average_sentence);
usort($tf_dot_product_per_sentence, 'self::sortInAscendingOrder');
$summary = self::getSummary($tf_dot_product_per_sentence,
$out_sentences);
$n = count($out_sentences);
$terms = array_filter($terms);
$terms_counts = array_count_values($terms);
arsort($terms_counts);
$terms_counts = array_slice($terms_counts, 0,
self::MAX_DISTINCT_TERMS);
$terms = array_unique(array_keys($terms_counts));
$t = count($terms);
if ($t == 0) {
return ["", ""];
}
/* Initialize Nk [Number of sentences the term occurs] */
$nk = [];
$nk = array_fill(0, $t, 0);
$nt = [];
/* Count TF for each word */
for ($i = 0; $i < $n; $i++) {
for ($j = 0; $j < $t; $j++) {
if (strpos($sentences[$i], $terms[$j]) !== false) {
$nk[$j]++;
}
}
}
/* Calculate weights of each term for every sentence */
$w = [];
$idf = [];
$idf_temp = 0;
for ($k = 0; $k < $t; $k++) {
if ($nk[$k] == 0) {
$idf_temp = 0;
$tmp = 0;
} else {
$idf_temp = $n / $nk[$k];
$tmp = log($idf_temp);
}
$idf[$k] = $tmp;
}
/* Count TF for finding centroid */
$wc = [];
$max_nt = -1;
$b = "\b";
if (in_array($lang, ["zh-CN", "ja", "ko"])) {
$b = "";
}
for ($j = 0; $j < $t; $j++) {
$nt = @preg_match_all("/$b{$terms[$j]}$b/", $formatted_doc,
$matches); //$matches included for backwards compatibility
$wc[$j] = $nt * $idf[$j];
if (is_nan($wc[$j]) || is_infinite($wc[$j])) {
$wc[$j] = 0;
}
}
/* Calculate centroid */
arsort($wc);
$centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true);
/* Initializing centroid weight array by 0 */
$wc = array_fill(0, $t, 0);
/* Word cloud */
$i = 0;
$word_cloud = [];
foreach ($centroid as $key => $value) {
$wc[$key] = $value;
if ($i < self::WORD_CLOUD_LEN) {
$word_cloud[$i] = $terms[$key];
}
$i++;
}
//should not need anything below this line
// if (strlen($formatted_doc) < PageProcessor::$max_description_len
// || $n == 1) {
// //if input short only use above to get a word cloud
// $formatted_doc = substr($formatted_doc, 0,
// PageProcessor::$max_description_len);
// return [$formatted_doc, $word_cloud];
// }
// ksort($wc);
// /* Calculate similarity measure between centroid and each sentence */
// $sim = [];
// for ($i=0; $i < $n; $i++) {
// $a = $b1 = $b2 = $c1 = $c2 = $d = 0;
// for ($k = 0; $k < $t; $k++) {
// $wck = $wc[$k];
// $idfk = $idf[$k];
// $tmp = substr_count($sentences[$i], $terms[$k]);
// $wik = ($tmp > 0) ? $idfk * (1 + log($tmp)) : 0;
// $a += ($wik * $wck * $idfk);
// $b1 += ($wik * $wik);
// $c1 += ($wck * $wck);
// }
// $b2 = sqrt($b1);
// $c2 = sqrt($c1);
// $d = $b2 * $c2;
// if ($d == 0) {
// $sim[$i] = 0;
// } else {
// $sim[$i] = $a / $d;
// }
// }
// arsort($sim);
// /* Getting how many sentences should be there in summary */
// $top = self::summarySentenceCount($out_sentences, $sim);
// $sum_array = [];
// $sum_array = array_keys(array_slice($sim, 0, $top - 1, true));
// sort($sum_array);
// $summary = '';
// foreach ($sum_array as $key) {
// $summary .= $out_sentences[$key] . ". ";
// }
//
//
//
//
// if (self::OUTPUT_TO_FILE) {
// $output_file_contents = "";
// foreach ($sum_array as $key) {
// $output_file_contents .= $out_sentences[$key] . ".\n";
// }
// file_put_contents(C\WORK_DIRECTORY . self::OUTPUT_FILE_PATH,
// $output_file_contents);
// }
/* Summary of text summarization */
return [$summary, $word_cloud];
}
/**
* Calculates how many sentences to put in the summary to match the
* MAX_DESCRIPTION_LEN.
*
* @param array $sentences sentences in doc in their original order
* @param array $sim associative array of sentence-number-in-doc =>
* similarity score to centroid (sorted from highest to lowest score).
* @return int number of sentences
*/
public static function summarySentenceCount($sentences, $sim)
{
$top = null;
$count = 0;
foreach ($sim as $key => $value)
{
if ($count < PageProcessor::$max_description_len) {
$count += strlen($sentences[$key]);
$top++;
}
}
return $top;
}
/**
* Breaks any content into sentences by splitting it on spaces or carriage
* returns
* @param string $content complete page.
* @return array array of sentences from that content.
*/
public static function getSentencesOriginal($content)
{
$lines = preg_split(
'/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/',
$content, 0, PREG_SPLIT_NO_EMPTY);
$out = [];
$sentence = "";
$count = 0;
$theshold_factor = 1;
foreach ($lines as $line) {
$sentence .= " " . $line;
if (strlen($line) < 2) {
continue;
}
if ($count < self::LONG_SENTENCE_THRESHOLD ||
strlen($sentence) > $theshold_factor *
self::LONG_SENTENCE_LEN){
$sentence = preg_replace("/\s+/ui", " ", $sentence);
$out[] = trim($sentence);
$count++;
$theshold_factor =
pow(1.5, floor($count/self::LONG_SENTENCE_THRESHOLD));
}
$sentence = "";
}
if (trim($sentence) != "") {
$sentence = preg_replace("/\s+/ui", " ", $sentence);
$out[] = trim($sentence);
}
return $out;
}
/**
* Formats the sentences to remove all characters except words,
* digits and spaces
* @param string $sent complete page.
* @return string formatted sentences.
*/
public static function formatSentence($sent)
{
$sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u',
' ', mb_strtolower($sent)));
return $sent;
}
/**
* Formats the document to remove carriage returns, hyphens and digits
* as we will not be using digits in word cloud.
* The formatted document generated by this function is only used to
* compute centroid.
* @param string $content formatted page.
* @return string formatted document.
*/
public static function formatDoc($content)
{
$substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/'];
$content = preg_replace($substitute, ' ', mb_strtolower($content));
return $content;
}
/**
* This function does an additional processing on the page
* such as removing all the tags from the page
* @param string $page complete page.
* @return string processed page.
*/
public static function pageProcessing($page)
{
$substitutions = ['@@si',
'/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
'@@si', '/[\^\(\)]/',
'/\[(.*?)\]/', '/\t\n/'
];
$page = preg_replace($substitutions, ' ', $page);
$page = preg_replace('/\s{2,}/', ' ', $page);
$new_page = preg_replace("/\
/", "\n", $page);
$changed = false;
if ($new_page != $page) {
$changed = true;
$page = $new_page;
}
$page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
"p|address|section)\s*\>/", "\n\n", $page);
$page = preg_replace("/\ $v) {
$sum_of_squares += ($v * $v);
}
$square_root = sqrt($sum_of_squares);
foreach ($term_frequencies as $k => $v) {
if ($square_root == 0) {
$result[$k] = 0;
} else {
$result[$k] = ($v / $square_root);
}
}
foreach ($result as $k => $v) {
$result_sum += $v;
}
}
return $result;
}
/**
* Get the average sentence by adding up the values from each column and
* dividing it by the rows in the array.
* @param array $term_frequencies_normalized the array with the terms as
* the key and its normalized frequency as the value
* @return array array of frequencies averaged
*/
public static function getAverageSentence($term_frequencies_normalized)
{
$result = [];
if (count($term_frequencies_normalized) != 0) {
foreach ($term_frequencies_normalized as $k => $v) {
foreach ($v as $l => $w) {
if (count($result) == 0) {
$result[$l] = $w;
} else {
if (@array_key_exists($l, $result)) {
$result[$l] = $result[$l] + $w;
} else {
$result[$l] = $w;
}
}
}
}
$count = count($term_frequencies_normalized);
foreach ($result as $k => $v) {
$result[$k] = ($v / $count);
}
}
return $result;
}
/**
* Get the dot product of the normalized array and the average sentence
* @param array $term_frequencies_normalized the array with the terms as
* the key and its normalized frequency as the value
* @param array $average_sentence an array of each words average
* frequency value
* @return array array of frequencies averaged
*/
public static function getDotProduct($term_frequencies_normalized,
$average_sentence)
{
$result = [];
$count = 0;
foreach ($term_frequencies_normalized as $k => $v) {
$tempResult = 0;
foreach ($v as $l => $w) {
if (@array_key_exists($l, $average_sentence)) {
$tempResult = $tempResult +
($average_sentence[$l] * $w);
}
}
$result[$count] = $tempResult;
$count++;
}
return $result;
}
/**
* Compare the two values and return if b is greater than a
* @param string $a the first value to compare
* @param string $b the second value to compare
* @return boolean if b is greater than a
*/
public static function sortInAscendingOrder($a, $b)
{
return $b > $a ? 1 : -1;
}
/**
* Returns a new array of sentences without the stop words
* @param array $sentences the array of sentences to process
* @param object $stop_obj the class that has the stopworedRemover method
* @return array a new array of sentences without the stop words
*/
public static function removeStopWords($sentences, $stop_obj)
{
$n = count($sentences);
$result = [];
if ($stop_obj && method_exists($stop_obj, "stopwordsRemover")) {
for ($i = 0; $i < $n; $i++ ) {
$result[$i] = $stop_obj->stopwordsRemover(
self::formatDoc($sentences[$i]));
}
} else {
$result = $sentences;
}
return $result;
}
/**
* Split up the sentences and return an array with all of the needed parts
* @param array $sentences the array of sentences to process
* @param string $lang the current locale
* @param string $doc complete raw page to generate the summary from.
* @return array an array with all of the needed parts
*/
public static function splitSentences($sentences, $lang, $doc)
{
$result = [];
$terms = [];
$tf_index = 0;
$tf_per_sentence = [];
$tf_per_sentence_normalized = [];
foreach ($sentences as $sentence) {
$temp_terms = PhraseParser::segmentSegment($sentence, $lang);
$terms = array_merge($terms, $temp_terms);
$tf_per_sentence[$tf_index] =
self::getTermFrequencies($temp_terms, $sentence, $doc);
$tf_per_sentence_normalized[$tf_index] =
self::normalizeTermFrequencies($tf_per_sentence[$tf_index]);
$tf_index++;
}
$result[0] = $terms;
$result[1] = $tf_per_sentence;
$result[2] = $tf_per_sentence_normalized;
return $result;
}
/**
* Split up the sentences and return an array with all of the needed parts
* @param array $tf_dot_product_per_sentence an array that holds the dot
product of each sentence. It should be sorted from highest to
lowest when it is passed to this method.
* @param array $sentences the array of sentences to process
* @return string a string that represents the summary
*/
public static function getSummary($tf_dot_product_per_sentence,
$sentences)
{
$result = "";
$result_length = 0;
$i = 0;
foreach ($tf_dot_product_per_sentence as $k => $v) {
if ($result_length + strlen($sentences[$k]) >
PageProcessor::$max_description_len) {
break;
} else {
$result_length += strlen($sentences[$k]);
if ($i == 0) {
$i = 1;
$result = $sentences[$k] . ". ";
if (self::OUTPUT_TO_FILE) {
$output_file_contents = $sentences[$k] . ". ";
}
} else {
$result .= " " . $sentences[$k] . ". ";
if (self::OUTPUT_TO_FILE) {
$output_file_contents = $output_file_contents .
"\r\n" . $sentences[$k] . ". ";
}
}
}
}
if (self::OUTPUT_TO_FILE) {
file_put_contents(C\WORK_DIRECTORY . self::OUTPUT_FILE_PATH,
$output_file_contents);
}
return $result;
}
public static $q;
public static $lineCount = 0;
public static $matrix;
public static $senten;
public static $senMatrix;
public static $words;
public static $ignores;
public static $rr;
public static $cc;
public static function main($args){
//makeIgnorelists();//remove stop words
//makeFinalWSMatrix();
//double [][]A = copy(matrix);
//List ev = new LinkedList();
//
///*
//double[][] A
//= new double[][] {
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 1.000, 0.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000},
// {0.000, 0.000, 1.000, 0.000, 0.000, 0.000},
// {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {0.000, 0.000, 0.000, 1.000, 0.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {0.000, 0.000, 0.000, 0.000, 1.000, 0.000},
// {2.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}
//
//};
//
//
//
// double[][] A
//= new double[][] {
// {4.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {3.000, 0.000, 0.000, 0.000, 0.000, 1.000},
// {1.000, 2.000, 0.000, 0.000, 0.000 ,0.000},
// {3.000, 3.000, 0.000, 2.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 1.000, 0.000, 1.000},
// {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000},
// {1.000, 0.000, 0.000, 2.000, 0.000 ,0.000},
// {1.000, 2.000, 0.000, 1.000, 0.000, 1.000}};
//
//*/
////double [][] A = new double[][] {{1,1,1},{-1,3,1}};
//
////double [][] A = new double [][] {{0.5, 0.5,-0.5},{0,0,-2},{1.5, -0.5,4.5}};
//
//rr= A.length;
//cc = A[0].length;
//
////int n = a1.length;
////double[][] A = new double[n][n];
//
////double[][] A = { { 1, 2,3, 1,2,2}, { 4,5,6,2,1,2 }, {2,1,3,1,1,3}, {1,1,0,1,1,5}, {2,1,0,0,1,3}};
//
//int n = A.length;
//
//
//
//double[][] AT = transpose(A);
//double [][] AAT = matrixMultiplication(AT, A); // a*at
//// double[][] aat = (double[][])AAT.clone();
//printM(AAT);
//
//n = AAT.length;
//
//
//double[][] I = new double[n][n];
//q = new double[n][];
//
//
//
//double[][] o = (double[][])A.clone();
//int i = 0;
//for(i = 0; i < n; i++)
// for(int j = 0; j < n; j++)
// I[i][j] = (i == j) ? 1 : 0;
//
//System.out.println("Starting Matrix");
//printM(A);
//
//int j = 0;
//A = lanczos(AAT);
//
//double[][] l = (double[][])A.clone();
//
////dsfsdfsdfsdfsd
//System.out.println(" ");
//System.out.println("Generating a tri-diagonal matrix");
//printM(A);
//System.out.println(" ");
//
//double[][][] qrArrays;
//boolean iteration= true;
//i = 0;
//double count =0;
//
//while(iteration && count <30)
//{
// qrArrays = qRDecompose(A);
// A = matrixMultiplication(qrArrays[1],qrArrays[0]);
//
// if(checkSubDiagonal(A))
// {
// iteration = false;
// }
// count ++;
//}
//
//System.out.println(" ");
//System.out.println("QR factoriztion");
//printM(A);
////don't put .3f in calculating eigen
//double [] eigen = new double[n];
//
//for(i=0; i1; out--)
//{
// for(int in=0; in sen = getSentences(r.data,2);
//System.out.println("Summary: ");
//printSentences(sen);
//
//
////System.out.println("Multiplication of three matrices above");
////Matrix out = left.times(singular).times(r);
////out.show();
self::makeIgnorelists();
self::makeFinalWSMatrix();
$A = self::copy(self::$matrix);
$ev = array();
self::$rr = count($A);
self::$cc = count($A[0]);
$n = count($A);
$AT = self::transpose($A);
$AAT = self::matrixMultiplication($AT, $A); // a*at
self::printM($AAT);
$n = count($AAT);
$I = array();
self::$q = array();
$o = self::copy($A);
$i = 0;
for ($i = 0; $i < $n; $i++) {
for ($j = 0; $j < $n; $j++) {
$I[$i][$j] = ($i == $j) ? 1 : 0;
}
}
print("Starting Matrix\n");
self::printM($A);
$j = 0;
$A = self::lanczos($AAT);
$l = self::copy($A);
print(" \n");
print("Generating a tri-diagonal matrix\n");
self::printM($A);
print(" \n");
$qrArrays = array();
$iteration = true;
$i = 0;
$count = 0.0;
while ($iteration && $count < 30) {
$qrArrays = self::qRDecompose($A);
$A = self::matrixMultiplication($qrArrays[1], $qrArrays[0]);
if (self::checkSubDiagonal($A)) {
$iteration = false;
}
$count++;
}
print(" \n");
print("QR factoriztion\n");
self::printM($A);
//don't put .3f in calculating eigen
$eigen = array();
for ($i = 0; $i < $n; $i++) {
for ($j = 0; $j < $n; $j++) {
if ($i == $j) {
if (abs($A[$i][$j]) < floatval(0.001)) {
$A[$i][$j] = 0;
}
$eigen[$i] = $A[$i][$j];
}
}
}
//sorting
for ($out = count($eigen) - 1; $out > 1; $out--) {
for ($in = 0; $in < $out; $in++) {
if (abs($eigen[$in]) < abs($eigen[$in + 1])) {
$temp = $eigen[$in];
$eigen[$in] = $eigen[$in + 1];
$eigen[$in + 1] = $temp;
}
}
}
for ($i = 0; $i < $n; $i++) {
print(number_format($eigen[$i], 3));
}
print("Eigenvalues sorted\n");
for ($i = 0; $i < $n; $i++) {
print(sqrt($eigen[$i]) . " ");
}
$x = self::copy($l);
for ($k = 0; $k < $n; $k++) {
for ($i = 0; $i < $n; $i++) {
for ($j = 0; $j < $n; $j++) {
if ($i == $j) {
$x[$i][$j] = $x[$i][$j] - $eigen[$k];
}
}
}
print("\n");
$d = new Matrix($x);
$ff = self::inverse($d->data);
$fm = new Matrix($ff);
$fs = $fm->getNorm();
$ev[] = $fs;
$x = self::copy($l);
}
$eigenvectors = self::getV($ev);
print("eigenvvvvvv\n");
$eigenvectors->show();
// transpose so that q can be in column vector,
//right now it's horizontal
$l_temp = new Matrix(self::$q);
$lancvectors = $l_temp->transpose();
$lancvectors->getRidOfNegativeZero();
print("\n");
$lancvectors->show();
print("Vector U\n");
$left = $lancvectors->times($eigenvectors->cClone());
$left->show();
print("Vector S\n");
$singular = self::getS($eigen);
$singular->show();
print("Transpose of vector V\n");
$a = new Matrix($o);
$at = $a->transpose();
$r = $at->times($left);
$r = $r->getMultiNorm();
$r = self::getTransposeOfV($r);
$r->show();
$sen = self::getSentences($r->data, 2);
print("Summary: \n");
self::printSentences($sen);
}
public static function checkSubDiagonal($A)
{
//for(int i=0; i < A.length; i++)
//{
// for(int j=i+1; j 0.001D)
//
// //if(Double.parseDouble(String.format("%.2f",A[j][i])) !=0)
// return false;
// }
//}
//
//return true;
for ($i = 0; $i < count($A); $i++) {
for ($j= $i + 1; $j < count($A); $j++) {
if (abs($A[$j][$i]) > floatval("0.001")) {
//if(Double.parseDouble(String.format("%.2f",A[j][i])) !=0)
return false;
}
}
}
return true;
}
public static function getTransposeOfV($V)
{
//return V.transpose();
return $V->transpose();
}
public static function getS($eigen)
{
//double[][] S = new double[eigen.length][eigen.length];
//for(int i=0; i < eigen.length; i++)
//{
// double val = Math.abs(eigen[i]);
// for(int j=0; j mitr = ev.listIterator();
//
//Matrix V = new Matrix(ev.size(), ev.size());
//int j=0;
//while(mitr.hasNext())
//{
// Matrix e = mitr.next();
// for(int i=0;idata[$i][$j] = $e->data[$i][0];
}
$j++;
}
return $V;
}
public static function qRDecompose($M)
{
//double[][][] arrays = new double[2][][];
//
//int n = M.length;
//double[][] A = (double[][]) M.clone();
//double[][] Q = new double[n][];
//double[][] R = new double[n][n];
//
//int i = 0;
//int j = 1;
//while(i-1)
// {
// while(k -1) {
while ($k < $m) {
$sum = $sum + $rm[$k][$i] * $x[$k];
$k++;
}
$x[$i] = ($b[$i] - $sum) / $rm[$i][$i];
$k = $i;
$i = $i - 1;
$sum = floatval("0.0");
}
return $x;
}
public static function transpose($M)
{
//int n = M.length;
//int m = M[0].length;
//double[][] A = new double[m][n];
//
//for (int i = 0; i < n; i++)
// for (int j = 0; j < m; j++)
// A[j][i] = M[i][j];
//return A;
$n = count($M);
$m = count($M[0]);
$A = array();
for ($i = 0; $i < $n; $i++) {
for ($j = 0; $j < $m; $j++) {
$A[$j][$i] = $M[$i][$j];
}
}
return $A;
}
public static function vectorPlus($u, $v)
{
//int n = u.length;
//double[] A = new double[n];
//for(int i=0; i list = new LinkedList();
//
//int j=0;
//for(int i=0; i < num; i++)
//{
//
// max = M[i][j];
// for(int k = j+1; k < M[0].length; k++)
// {
// if(Math.abs(M[i][k]) > Math.abs(max))
// {
// j=k;
// max = M[i][k];
//
// }
// }
// list.add(new Integer(j));
// j=0;
//}
//
//return list;
$max = 0.0;
$list = array();
$j = 0;
for ($i = 0; $i < $num; $i++) {
$max = $M[$i][$j];
for ($k = $j + 1; $k < count($M[0]); $k++) {
if(abs($M[$i][$k]) > abs($max)) {
$j = $k;
$max = $M[$i][$k];
}
}
$list[] = $j;
$j = 0;
}
return $list;
}
public static function printSentences($sens)
{
//for(Integer aa : sens)
//{
// //System.out.println(aa.intValue());
// System.out.println(senMatrix[aa.intValue()]);
//}
for ($i = 0; $i < count($sens); $i++) {
print(self::$senMatrix[$sens[$i]] . "\n");
}
}
public static function words($senStrings)
{
//words = new HashSet ();
//
// for(int i=0; i < senStrings.length; i++)
// {
// StringTokenizer parser = new StringTokenizer(senStrings[i], " \t\n\r\f.,;:!?'-()");
// while (parser.hasMoreTokens())
// {
// final String currentWord = parser.nextToken();
// if(!ignores.contains(currentWord))
// words.add(currentWord);
//
// }
//
//}
self::$words = array();
for ($i = 0; $i < count($senStrings); $i++) {
$parser = strtok($senStrings[$i], " \t\n\r\f.,;:!?'-()");
while ($parser !== false) {
$currentWord = $parser;
//this should check for case mb_strtolower()
if (!in_array($currentWord, self::$ignores, true) &&
!in_array($currentWord, self::$words, true)) {
self::$words[] = $currentWord;
}
$parser = strtok(" \t\n\r\f.,;:!?'-()");
}
}
}
public static function makeWSMatrix($senStrings)
{
//SortedSet ss = new TreeSet(words);
//Object [] slist = ss.toArray();
//
//for(int k=0; k < slist.length; k++)
//{
//
// for(int i=0; i < senStrings.length; i++)
// {
//
// int count =0;
// String line = senStrings[i];
// //System.out.println((String)slist[k]);
// Pattern hunter = Pattern.compile((String)slist[k]);
// Matcher fit = hunter.matcher(line);
// while(fit.find())
// {
// count++;
// }
// //System.out.println(count+"");
// matrix[k][i]= count;
// }
//
//
//
//}
$slist = self::$words;
asort($slist);
//for ($k = 0; $k < count($slist); $k++) {
$k = 0;
foreach ($slist as $item) {
for ($i = 0; $i < count($senStrings); $i++) {
$line = $senStrings[$i];
$count = preg_match_all("/" . $item . "/u",
$line, $matches);
self::$matrix[$k][$i] = $count;
}
$k = $k + 1;
}
}
//fills in the senMatrix string array and senten linked list
private static function countLines($line)
{
//String[] sentences = line.split("[.?!]+\\s*");
///*
//for(String a : sentences){
// System.out.println(a);
//}
//*/
//Pattern pat = Pattern.compile("[.?!]+\\s*");
//Matcher mat = pat.matcher(line);
//
// int start = 0, end = 0;
// while(mat.find()) {
// start = end;
// end = mat.end();
// senten.add(line.substring(start, end));
// }
//senMatrix = new String[senten.size()];
//int in=0;
//for(String s: senten)
//{
// senMatrix[in] = s;
// in++;
//}
self::$senten = preg_split(
'/[.?!]+\\s*/ui',
$line, 0, PREG_SPLIT_NO_EMPTY);
self::$senMatrix = self::$senten;
}
public static function makeFinalWSMatrix()
{
//BufferedReader r = null;
// String thisLine;
//try {
// r = new BufferedReader(new FileReader(".\\a.txt"));
//} catch (FileNotFoundException e1) {
// e1.printStackTrace();
//}
//String lines ="";
//try
//{
// while ((thisLine = r.readLine()) != null)
// {
// lines = lines+thisLine;
// }
//}catch (IOException e)
//{
// e.printStackTrace();
//}
//countLines(lines);
////System.out.println(senMatrix.length+"");
//words(senMatrix);
////System.out.println(words.size()+"");
//SortedSet ss = new TreeSet(words);
//Object [] slist = ss.toArray();
//
///*
//for(Object a: slist)
//{
// System.out.println(a);
//}
//*/
//matrix = new double[words.size()][senMatrix.length];
//makeWSMatrix(senMatrix);
//
//for(int m =0; m ();
//BufferedReader r = null;
//String thisLine;
//try {
// r = new BufferedReader(new FileReader(".\\ignore.txt"));
//} catch (FileNotFoundException e1) {
// e1.printStackTrace();
//}
//String lines ="";
//try
//{
// while ((thisLine = r.readLine()) != null)
// {
// ignores.add(thisLine);
// }
//}catch (IOException e)
//{
// e.printStackTrace();
//}
//we need these to get the stop words based on the locale
$r = file("c:/temp/ignore.txt");
for ($i = 0; $i < count($r); $i++) {
$thisLine = $r[$i];
self::$ignores[] = trim($thisLine);
}
/* self::$ignores = ['a','able','about','above','abst',
'accordance','according','based','accordingly','across','act',
'actually','added','adj','affected','affecting','affects','after',
'afterwards','again','against','ah','all','almost','alone','along',
'already','also','although','always','am','among','amongst','an','and',
'announce','another','any','anybody','anyhow','anymore','anyone',
'anything','anyway','anyways','anywhere','apparently','approximately',
'are','aren','arent','arise','around','as','aside','ask','asking','at',
'auth','available','away','awfully','b','back','be','became','because',
'become','becomes','becoming','been','before','beforehand','begin',
'beginning','beginnings','begins','behind','being','believe','below',
'beside','besides','between','beyond','biol','both','brief','briefly',
'but','by','c','ca','came','can','cannot','cant','cause','causes',
'certain','certainly','co','com','come','comes','contain','containing',
'contains','could','couldnt','d','date','did','didnt',
'different','do','does','doesnt','doing',
'done','dont','down','downwards',
'due','during','e','each','ed','edu','effect','eg','eight','eighty',
'either','else','elsewhere','end',
'ending','enough','especially','et',
'et-al','etc','even','ever','every',
'everybody','everyone','everything'
,'everywhere','ex','except','f','far','few','ff','fifth','first',
'five','fix','followed','following','follows','for','former',
'formerly','forth','found','four','from','further','furthermore',
'g','gave','get','gets','getting','give','given','gives','giving','go',
'goes','gone','got','gotten','h','had','happens','hardly','has','hasnt',
'have','havent','having','he','hed','hence','her','here','hereafter',
'hereby','herein','heres','hereupon','hers','herself','hes','hi','hid',
'him','himself','his','hither','home','how','howbeit',
'however', 'http', 'https', 'hundred','i','id','ie','if','ill',
'im','immediate','immediately',
'importance','important','in','inc','indeed','index','information',
'instead','into','invention','inward','is','isnt','it','itd','itll',
'its','itself','ive','j','just','k','keep','keeps',
'kept','kg','km','know',
'known','knows','l','largely','last','lately',
'later','latter','latterly',
'least','less','lest','let','lets','like','liked','likely','line',
'little','ll','look','looking','looks','ltd','m','made','mainly','make',
'makes','many','may','maybe','me','mean','means','meantime','meanwhile',
'merely','mg','might','million','miss','ml','more','moreover','most',
'mostly','mr','mrs','much','mug','must','my','myself','n','na','name',
'namely','nay','nd','near','nearly','necessarily','necessary','need',
'needs','neither','never','nevertheless','new','next',
'nine','ninety','no',
'nobody','non','none','nonetheless','noone',
'nor','normally','nos','not',
'noted','nothing','now','nowhere','o','obtain',
'obtained','obviously','of',
'off','often','oh','ok','okay','old','omitted','on','once','one','ones',
'only','onto','or','ord','other','others',
'otherwise','ought','our','ours',
'ourselves','out','outside','over','overall','owing','own','p','page',
'pages','part','particular','particularly',
'past','per','perhaps','placed',
'please','plus','poorly','possible','possibly','potentially','pp',
'predominantly','present','previously',
'primarily','probably','promptly',
'proud','provides','put','q','que','quickly','quite','qv','r','ran',
'rather','rd','re','readily','really','recent','recently','ref','refs',
'regarding','regardless','regards','related','relatively','research',
'respectively','resulted','resulting',
'results','right','run','s','said',
'same','saw','say','saying','says','sec',
'section','see','seeing','seem',
'seemed','seeming','seems',
'seen','self','selves','sent','seven','several',
'shall','she','shed','shell',
'shes','should','shouldnt','show','showed','shown','showns','shows',
'significant','significantly','similar','similarly','since',
'six','slightly',
'so','some','somebody','somehow','someone','somethan',
'something','sometime',
'sometimes','somewhat','somewhere','soon',
'sorry','specifically','specified',
'specify','specifying','still','stop','strongly','sub','substantially',
'successfully','such','sufficiently','suggest','sup','sure','t','take',
'taken','taking','tell','tends','th','than',
'thank','thanks','thanx','that',
'thatll','thats','thatve','the','their',
'theirs','them','themselves','then',
'thence','there','thereafter','thereby','thered','therefore','therein',
'therell','thereof','therere','theres','thereto','thereupon','thereve',
'these','they','theyd','theyll','theyre',
'theyve','think','this','those',
'thou','though','thoughh','thousand','throug',
'through','throughout','thru',
'thus','til','tip','to','together','too',
'took','toward','towards','tried',
'tries','truly','try','trying','ts','twice','two','u','un','under',
'unfortunately','unless','unlike','unlikely','until','unto','up','upon',
'ups','us','use','used','useful','usefully','usefulness','uses','using',
'usually','v','value','various','ve','very',
'via','viz','vol','vols','vs',
'w','want','wants','was','wasnt','way','we',
'wed','welcome','well','went',
'were','werent','weve','what','whatever',
'whatll','whats','when','whence',
'whenever','where','whereafter','whereas','whereby','wherein','wheres',
'whereupon','wherever','whether','which','while','whim','whither','who',
'whod','whoever','whole','wholl','whom','whomever','whos','whose','why',
'widely','willing','wish','with','within',
'without','wont','words','world',
'would','wouldnt','www','x','y','yes','yet','you','youd','youll','your',
'youre','yours','yourself','yourselves','youve','z','zero'];*/
}
}