. * * END LICENSE * * @author Chris Pollett (chris@pollett.org) * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ * @copyright 2009 - 2015 * @filesource */ namespace seekquarry\yioop\library\summarizers; use seekquarry\yioop\configs as C; use seekquarry\yioop\library as L; use seekquarry\yioop\library\CrawlConstants; use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\processors\PageProcessor; /** * Class which may be used by the processors to get a summary for a text * document that may later be used for indexing. Generate a summary based * the Lanczos algorithm. * @author Charles Bocage (charles.bocage@sjsu.edu) */ class LanczosSummarizer extends Summarizer { /** * Number of bytes in a sentence before it is considered long * We use strlen rather than mbstrlen. This might actually be * a better metric of the potential of a sentence to have info. */ const LONG_SENTENCE_LEN = 50; /** * Number of sentences in a document before only consider longer * sentences in centroid */ const LONG_SENTENCE_THRESHOLD = 100; /** * Number of distinct terms to use in generating summary */ const MAX_DISTINCT_TERMS = 1000; /** * Number of words in word cloud */ const WORD_CLOUD_LEN = 5; /** * Number of nonzero centroid components */ const CENTROID_COMPONENTS = 50; /** * whether to output the results to the disk or not */ const OUTPUT_TO_FILE = false; /** * The full disk location to save the result to */ const OUTPUT_FILE_PATH = "/temp/centroid_weighted_summarizer_result.txt"; /** * Generate a summary based on it closeness to the average sentence. * It also weights sentences based on the CMS that produced it. * @param string $doc complete raw page to generate the summary from. * @param string $lang language of the page to decide which stop words to * call proper tokenizer.php of the specified language. 
* * @return array array of summary and word cloud */ public static function getLanczosSummary($doc, $lang) { $raw_doc = $doc; $doc = self::pageProcessing($doc); /* Format the document to remove characters other than periods and alphanumerics. */ $formatted_doc = self::formatDoc($doc); $stop_obj = PhraseParser::getTokenizer($lang); /* Splitting into sentences */ $out_sentences = self::getSentences($doc); $sentences = self::removeStopWords($out_sentences, $stop_obj); $sentence_array = self::splitSentences($sentences, $lang, $raw_doc); $terms = $sentence_array[0]; $tf_per_sentence = $sentence_array[1]; $tf_per_sentence_normalized = $sentence_array[2]; $tf_average_sentence = self::getAverageSentence($tf_per_sentence_normalized); $tf_dot_product_per_sentence = self::getDotProduct($tf_per_sentence_normalized, $tf_average_sentence); usort($tf_dot_product_per_sentence, 'self::sortInAscendingOrder'); $summary = self::getSummary($tf_dot_product_per_sentence, $out_sentences); $n = count($out_sentences); $terms = array_filter($terms); $terms_counts = array_count_values($terms); arsort($terms_counts); $terms_counts = array_slice($terms_counts, 0, self::MAX_DISTINCT_TERMS); $terms = array_unique(array_keys($terms_counts)); $t = count($terms); if ($t == 0) { return ["", ""]; } /* Initialize Nk [Number of sentences the term occurs] */ $nk = []; $nk = array_fill(0, $t, 0); $nt = []; /* Count TF for each word */ for ($i = 0; $i < $n; $i++) { for ($j = 0; $j < $t; $j++) { if (strpos($sentences[$i], $terms[$j]) !== false) { $nk[$j]++; } } } /* Calculate weights of each term for every sentence */ $w = []; $idf = []; $idf_temp = 0; for ($k = 0; $k < $t; $k++) { if ($nk[$k] == 0) { $idf_temp = 0; $tmp = 0; } else { $idf_temp = $n / $nk[$k]; $tmp = log($idf_temp); } $idf[$k] = $tmp; } /* Count TF for finding centroid */ $wc = []; $max_nt = -1; $b = "\b"; if (in_array($lang, ["zh-CN", "ja", "ko"])) { $b = ""; } for ($j = 0; $j < $t; $j++) { $nt = @preg_match_all("/$b{$terms[$j]}$b/", 
$formatted_doc, $matches); //$matches included for backwards compatibility $wc[$j] = $nt * $idf[$j]; if (is_nan($wc[$j]) || is_infinite($wc[$j])) { $wc[$j] = 0; } } /* Calculate centroid */ arsort($wc); $centroid = array_slice($wc, 0, self::CENTROID_COMPONENTS, true); /* Initializing centroid weight array by 0 */ $wc = array_fill(0, $t, 0); /* Word cloud */ $i = 0; $word_cloud = []; foreach ($centroid as $key => $value) { $wc[$key] = $value; if ($i < self::WORD_CLOUD_LEN) { $word_cloud[$i] = $terms[$key]; } $i++; } //should not need anything below this line // if (strlen($formatted_doc) < PageProcessor::$max_description_len // || $n == 1) { // //if input short only use above to get a word cloud // $formatted_doc = substr($formatted_doc, 0, // PageProcessor::$max_description_len); // return [$formatted_doc, $word_cloud]; // } // ksort($wc); // /* Calculate similarity measure between centroid and each sentence */ // $sim = []; // for ($i=0; $i < $n; $i++) { // $a = $b1 = $b2 = $c1 = $c2 = $d = 0; // for ($k = 0; $k < $t; $k++) { // $wck = $wc[$k]; // $idfk = $idf[$k]; // $tmp = substr_count($sentences[$i], $terms[$k]); // $wik = ($tmp > 0) ? $idfk * (1 + log($tmp)) : 0; // $a += ($wik * $wck * $idfk); // $b1 += ($wik * $wik); // $c1 += ($wck * $wck); // } // $b2 = sqrt($b1); // $c2 = sqrt($c1); // $d = $b2 * $c2; // if ($d == 0) { // $sim[$i] = 0; // } else { // $sim[$i] = $a / $d; // } // } // arsort($sim); // /* Getting how many sentences should be there in summary */ // $top = self::summarySentenceCount($out_sentences, $sim); // $sum_array = []; // $sum_array = array_keys(array_slice($sim, 0, $top - 1, true)); // sort($sum_array); // $summary = ''; // foreach ($sum_array as $key) { // $summary .= $out_sentences[$key] . ". "; // } // // // // // if (self::OUTPUT_TO_FILE) { // $output_file_contents = ""; // foreach ($sum_array as $key) { // $output_file_contents .= $out_sentences[$key] . ".\n"; // } // file_put_contents(C\WORK_DIRECTORY . 
self::OUTPUT_FILE_PATH, // $output_file_contents); // } /* Summary of text summarization */ return [$summary, $word_cloud]; } /** * Calculates how many sentences to put in the summary to match the * MAX_DESCRIPTION_LEN. * * @param array $sentences sentences in doc in their original order * @param array $sim associative array of sentence-number-in-doc => * similarity score to centroid (sorted from highest to lowest score). * @return int number of sentences */ public static function summarySentenceCount($sentences, $sim) { $top = null; $count = 0; foreach ($sim as $key => $value) { if ($count < PageProcessor::$max_description_len) { $count += strlen($sentences[$key]); $top++; } } return $top; } /** * Breaks any content into sentences by splitting it on spaces or carriage * returns * @param string $content complete page. * @return array array of sentences from that content. */ public static function getSentencesOriginal($content) { $lines = preg_split( '/(\.|\||\!|\?|!|?|。)\s+|(\n|\r)(\n|\r)+|\s{5}/', $content, 0, PREG_SPLIT_NO_EMPTY); $out = []; $sentence = ""; $count = 0; $theshold_factor = 1; foreach ($lines as $line) { $sentence .= " " . $line; if (strlen($line) < 2) { continue; } if ($count < self::LONG_SENTENCE_THRESHOLD || strlen($sentence) > $theshold_factor * self::LONG_SENTENCE_LEN){ $sentence = preg_replace("/\s+/ui", " ", $sentence); $out[] = trim($sentence); $count++; $theshold_factor = pow(1.5, floor($count/self::LONG_SENTENCE_THRESHOLD)); } $sentence = ""; } if (trim($sentence) != "") { $sentence = preg_replace("/\s+/ui", " ", $sentence); $out[] = trim($sentence); } return $out; } /** * Formats the sentences to remove all characters except words, * digits and spaces * @param string $sent complete page. * @return string formatted sentences. 
     */
    public static function formatSentence($sent)
    {
        // Lower-case, then strip everything but letters, digits and
        // whitespace (Unicode-aware: \p{L}/\p{N} with the u modifier)
        $sent = trim(preg_replace('/[^\p{L}\p{N}\s]+/u', ' ',
            mb_strtolower($sent)));
        return $sent;
    }
    /**
     * Formats the document to remove carriage returns, hyphens and digits
     * as we will not be using digits in word cloud.
     * The formatted document generated by this function is only used to
     * compute centroid.
     * @param string $content formatted page.
     * @return string formatted document.
     */
    public static function formatDoc($content)
    {
        // Order matters: collapse newlines/hyphens, drop everything that is
        // not a letter/whitespace/period, then squash runs of periods
        $substitute = ['/[\n\r\-]+/', '/[^\p{L}\s\.]+/u', '/[\.]+/'];
        $content = preg_replace($substitute, ' ', mb_strtolower($content));
        return $content;
    }
    /**
     * This function does an additional processing on the page
     * such as removing all the tags from the page
     * @param string $page complete page.
     * @return string processed page.
     *
     * NOTE(review): this method appears corrupted. The first and third
     * patterns below look like they once contained literal HTML tag markup
     * (script/style tags) that has been lost, and the body breaks off
     * mid-statement further down; additionally the getTermFrequencies()
     * helper (still called from splitSentences()) is missing entirely and
     * the head of normalizeTermFrequencies() is gone. Restore from the
     * upstream sources before relying on this code.
     */
    public static function pageProcessing($page)
    {
        $substitutions = ['@]*?>.*?@si',
            '/\ \;|\&rdquo\;|\&ldquo\;|\&mdash\;/si',
            '@]*?>.*?@si',
            '/[\^\(\)]/',
            '/\[(.*?)\]/',
            '/\t\n/'
        ];
        $page = preg_replace($substitutions, ' ', $page);
        // Collapse runs of whitespace to a single space
        $page = preg_replace('/\s{2,}/', ' ', $page);
        $new_page = preg_replace("/\/", "\n", $page);
        $changed = false;
        if ($new_page != $page) {
            $changed = true;
            $page = $new_page;
        }
        // Turn block-level close tags into paragraph breaks
        $page = preg_replace("/\<\/(h1|h2|h3|h4|h5|h6|table|tr|td|div|".
            "p|address|section)\s*\>/", "\n\n", $page);
        /* NOTE(review): source text is missing from here -- the code below
           resumes mid-way through what was presumably
           normalizeTermFrequencies() (Euclidean normalization of a term
           frequency vector). */
        $page = preg_replace("/\ $v) {
            $sum_of_squares += ($v * $v);
        }
        $square_root = sqrt($sum_of_squares);
        // Divide each frequency by the Euclidean norm (0 if the norm is 0)
        foreach ($term_frequencies as $k => $v) {
            if ($square_root == 0) {
                $result[$k] = 0;
            } else {
                $result[$k] = ($v / $square_root);
            }
        }
        foreach ($result as $k => $v) {
            $result_sum += $v;
        }
        }
        return $result;
    }
    /**
     * Get the average sentence by adding up the values from each column and
     * dividing it by the rows in the array.
* @param array $term_frequencies_normalized the array with the terms as * the key and its normalized frequency as the value * @return array array of frequencies averaged */ public static function getAverageSentence($term_frequencies_normalized) { $result = []; if (count($term_frequencies_normalized) != 0) { foreach ($term_frequencies_normalized as $k => $v) { foreach ($v as $l => $w) { if (count($result) == 0) { $result[$l] = $w; } else { if (@array_key_exists($l, $result)) { $result[$l] = $result[$l] + $w; } else { $result[$l] = $w; } } } } $count = count($term_frequencies_normalized); foreach ($result as $k => $v) { $result[$k] = ($v / $count); } } return $result; } /** * Get the dot product of the normalized array and the average sentence * @param array $term_frequencies_normalized the array with the terms as * the key and its normalized frequency as the value * @param array $average_sentence an array of each words average * frequency value * @return array array of frequencies averaged */ public static function getDotProduct($term_frequencies_normalized, $average_sentence) { $result = []; $count = 0; foreach ($term_frequencies_normalized as $k => $v) { $tempResult = 0; foreach ($v as $l => $w) { if (@array_key_exists($l, $average_sentence)) { $tempResult = $tempResult + ($average_sentence[$l] * $w); } } $result[$count] = $tempResult; $count++; } return $result; } /** * Compare the two values and return if b is greater than a * @param string $a the first value to compare * @param string $b the second value to compare * @return boolean if b is greater than a */ public static function sortInAscendingOrder($a, $b) { return $b > $a ? 
1 : -1;
    }
    /**
     * Returns a new array of sentences without the stop words
     * @param array $sentences the array of sentences to process
     * @param object $stop_obj the class that has the stopworedRemover method
     * @return array a new array of sentences without the stop words
     */
    public static function removeStopWords($sentences, $stop_obj)
    {
        // Without a tokenizer that can strip stop words, pass through as-is
        if (!$stop_obj || !method_exists($stop_obj, "stopwordsRemover")) {
            return $sentences;
        }
        $result = [];
        foreach ($sentences as $i => $sentence) {
            $result[$i] = $stop_obj->stopwordsRemover(
                self::formatDoc($sentence));
        }
        return $result;
    }
    /**
     * Split up the sentences and return an array with all of the needed parts
     * @param array $sentences the array of sentences to process
     * @param string $lang the current locale
     * @param string $doc complete raw page to generate the summary from.
     * @return array an array with all of the needed parts
     */
    public static function splitSentences($sentences, $lang, $doc)
    {
        $terms = [];
        $tf_per_sentence = [];
        $tf_per_sentence_normalized = [];
        foreach ($sentences as $sentence) {
            // Segment the sentence into terms for the current locale
            $sentence_terms = PhraseParser::segmentSegment($sentence, $lang);
            $terms = array_merge($terms, $sentence_terms);
            $tf = self::getTermFrequencies($sentence_terms, $sentence, $doc);
            $tf_per_sentence[] = $tf;
            $tf_per_sentence_normalized[] =
                self::normalizeTermFrequencies($tf);
        }
        return [$terms, $tf_per_sentence, $tf_per_sentence_normalized];
    }
    /**
     * Builds the summary string out of the highest scoring sentences
     * @param array $tf_dot_product_per_sentence an array that holds the
     *      dot product of each sentence. It should be sorted from highest
     *      to lowest when it is passed to this method.
* @param array $sentences the array of sentences to process * @return string a string that represents the summary */ public static function getSummary($tf_dot_product_per_sentence, $sentences) { $result = ""; $result_length = 0; $i = 0; foreach ($tf_dot_product_per_sentence as $k => $v) { if ($result_length + strlen($sentences[$k]) > PageProcessor::$max_description_len) { break; } else { $result_length += strlen($sentences[$k]); if ($i == 0) { $i = 1; $result = $sentences[$k] . ". "; if (self::OUTPUT_TO_FILE) { $output_file_contents = $sentences[$k] . ". "; } } else { $result .= " " . $sentences[$k] . ". "; if (self::OUTPUT_TO_FILE) { $output_file_contents = $output_file_contents . "\r\n" . $sentences[$k] . ". "; } } } } if (self::OUTPUT_TO_FILE) { file_put_contents(C\WORK_DIRECTORY . self::OUTPUT_FILE_PATH, $output_file_contents); } return $result; } public static $q; public static $lineCount = 0; public static $matrix; public static $senten; public static $senMatrix; public static $words; public static $ignores; public static $rr; public static $cc; public static function main($args){ //makeIgnorelists();//remove stop words //makeFinalWSMatrix(); //double [][]A = copy(matrix); //List ev = new LinkedList(); // ///* //double[][] A //= new double[][] { // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 
0.000, 0.000, 0.000, 1.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 
0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 1.000, 0.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000}, // {0.000, 0.000, 1.000, 0.000, 0.000, 0.000}, // {0.000, 0.000, 1.000, 0.000, 0.000 ,0.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {0.000, 0.000, 0.000, 1.000, 0.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {0.000, 0.000, 0.000, 0.000, 1.000, 0.000}, // {2.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 1.000, 0.000, 0.000, 0.000, 1.000} // //}; // // // 
// double[][] A //= new double[][] { // {4.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {3.000, 0.000, 0.000, 0.000, 0.000, 1.000}, // {1.000, 2.000, 0.000, 0.000, 0.000 ,0.000}, // {3.000, 3.000, 0.000, 2.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 1.000, 0.000, 1.000}, // {1.000, 0.000, 0.000, 0.000, 0.000 ,0.000}, // {1.000, 0.000, 0.000, 2.000, 0.000 ,0.000}, // {1.000, 2.000, 0.000, 1.000, 0.000, 1.000}}; // //*/ ////double [][] A = new double[][] {{1,1,1},{-1,3,1}}; // ////double [][] A = new double [][] {{0.5, 0.5,-0.5},{0,0,-2},{1.5, -0.5,4.5}}; // //rr= A.length; //cc = A[0].length; // ////int n = a1.length; ////double[][] A = new double[n][n]; // ////double[][] A = { { 1, 2,3, 1,2,2}, { 4,5,6,2,1,2 }, {2,1,3,1,1,3}, {1,1,0,1,1,5}, {2,1,0,0,1,3}}; // //int n = A.length; // // // //double[][] AT = transpose(A); //double [][] AAT = matrixMultiplication(AT, A); // a*at //// double[][] aat = (double[][])AAT.clone(); //printM(AAT); // //n = AAT.length; // // //double[][] I = new double[n][n]; //q = new double[n][]; // // // //double[][] o = (double[][])A.clone(); //int i = 0; //for(i = 0; i < n; i++) // for(int j = 0; j < n; j++) // I[i][j] = (i == j) ? 
1 : 0; // //System.out.println("Starting Matrix"); //printM(A); // //int j = 0; //A = lanczos(AAT); // //double[][] l = (double[][])A.clone(); // ////dsfsdfsdfsdfsd //System.out.println(" "); //System.out.println("Generating a tri-diagonal matrix"); //printM(A); //System.out.println(" "); // //double[][][] qrArrays; //boolean iteration= true; //i = 0; //double count =0; // //while(iteration && count <30) //{ // qrArrays = qRDecompose(A); // A = matrixMultiplication(qrArrays[1],qrArrays[0]); // // if(checkSubDiagonal(A)) // { // iteration = false; // } // count ++; //} // //System.out.println(" "); //System.out.println("QR factoriztion"); //printM(A); ////don't put .3f in calculating eigen //double [] eigen = new double[n]; // //for(i=0; i1; out--) //{ // for(int in=0; in sen = getSentences(r.data,2); //System.out.println("Summary: "); //printSentences(sen); // // ////System.out.println("Multiplication of three matrices above"); ////Matrix out = left.times(singular).times(r); ////out.show(); self::makeIgnorelists(); self::makeFinalWSMatrix(); $A = self::copy(self::$matrix); $ev = array(); self::$rr = count($A); self::$cc = count($A[0]); $n = count($A); $AT = self::transpose($A); $AAT = self::matrixMultiplication($AT, $A); // a*at self::printM($AAT); $n = count($AAT); $I = array(); self::$q = array(); $o = self::copy($A); $i = 0; for ($i = 0; $i < $n; $i++) { for ($j = 0; $j < $n; $j++) { $I[$i][$j] = ($i == $j) ? 
1 : 0;
            }
        }
        print("Starting Matrix\n");
        self::printM($A);
        $j = 0;
        // Reduce A^T A to tri-diagonal form with the Lanczos iteration
        $A = self::lanczos($AAT);
        $l = self::copy($A);
        print(" \n");
        print("Generating a tri-diagonal matrix\n");
        self::printM($A);
        print(" \n");
        /* QR iteration: repeatedly factor A = QR and set A <- RQ until the
           sub-diagonal vanishes (convergence), capped at 30 rounds */
        $qrArrays = [];
        $iteration = true;
        $count = 0;
        while ($iteration && $count < 30) {
            $qrArrays = self::qRDecompose($A);
            $A = self::matrixMultiplication($qrArrays[1], $qrArrays[0]);
            if (self::checkSubDiagonal($A)) {
                $iteration = false;
            }
            $count++;
        }
        print(" \n");
        print("QR factorization\n"); // typo "factoriztion" fixed
        self::printM($A);
        //don't put .3f in calculating eigen
        $eigen = [];
        // The diagonal of the converged matrix holds the eigenvalues
        for ($i = 0; $i < $n; $i++) {
            for ($j = 0; $j < $n; $j++) {
                if ($i == $j) {
                    if (abs($A[$i][$j]) < floatval(0.001)) {
                        $A[$i][$j] = 0;
                    }
                    $eigen[$i] = $A[$i][$j];
                }
            }
        }
        /* Bubble sort by descending absolute value. The old loop bound
           "$out > 1" was an off-by-one that skipped the final pass and
           could leave the first two entries unsorted; it now runs down to
           $out == 1 as a standard bubble sort does. */
        for ($out = count($eigen) - 1; $out > 0; $out--) {
            for ($in = 0; $in < $out; $in++) {
                if (abs($eigen[$in]) < abs($eigen[$in + 1])) {
                    $temp = $eigen[$in];
                    $eigen[$in] = $eigen[$in + 1];
                    $eigen[$in + 1] = $temp;
                }
            }
        }
        for ($i = 0; $i < $n; $i++) {
            // NOTE(review): values are printed without any separator
            print(number_format($eigen[$i], 3));
        }
        print("Eigenvalues sorted\n");
        /* Singular values are the square roots of the eigenvalues of
           A^T A -- this prints NAN if an eigenvalue came out negative
           numerically */
        for ($i = 0; $i < $n; $i++) {
            print(sqrt($eigen[$i]) .
" ");
        }
        /* For each eigenvalue, build (L - eigen[k] * I), invert it, and
           collect the norm of the inverse in $ev as an (inverse-iteration
           style) eigenvector estimate */
        $x = self::copy($l);
        for ($k = 0; $k < $n; $k++) {
            for ($i = 0; $i < $n; $i++) {
                for ($j = 0; $j < $n; $j++) {
                    if ($i == $j) {
                        $x[$i][$j] = $x[$i][$j] - $eigen[$k];
                    }
                }
            }
            print("\n");
            $d = new Matrix($x);
            $ff = self::inverse($d->data);
            $fm = new Matrix($ff);
            $fs = $fm->getNorm();
            $ev[] = $fs;
            $x = self::copy($l);
        }
        $eigenvectors = self::getV($ev);
        print("eigenvvvvvv\n");
        $eigenvectors->show();
        // transpose so that q can be in column vector,
        //right now it's horizontal
        $l_temp = new Matrix(self::$q);
        $lancvectors = $l_temp->transpose();
        $lancvectors->getRidOfNegativeZero();
        print("\n");
        $lancvectors->show();
        print("Vector U\n");
        $left = $lancvectors->times($eigenvectors->cClone());
        $left->show();
        print("Vector S\n");
        $singular = self::getS($eigen);
        $singular->show();
        print("Transpose of vector V\n");
        $a = new Matrix($o);
        $at = $a->transpose();
        $r = $at->times($left);
        $r = $r->getMultiNorm();
        $r = self::getTransposeOfV($r);
        $r->show();
        // Pick the summary sentences from the V matrix
        $sen = self::getSentences($r->data, 2);
        print("Summary: \n");
        self::printSentences($sen);
    }
    /**
     * Returns true iff every entry below the main diagonal of $A has
     * absolute value <= 0.001, i.e. the QR iteration has converged.
     * @param array $A square matrix of numbers
     * @return bool whether the sub-diagonal is numerically zero
     */
    public static function checkSubDiagonal($A)
    {
        //for(int i=0; i < A.length; i++)
        //{
        // for(int j=i+1; j 0.001D)
        // //if(Double.parseDouble(String.format("%.2f",A[j][i])) !=0)
        // return false;
        // }
        //}
        //
        //return true;
        for ($i = 0; $i < count($A); $i++) {
            for ($j= $i + 1; $j < count($A); $j++) {
                if (abs($A[$j][$i]) > floatval("0.001")) {
                    //if(Double.parseDouble(String.format("%.2f",A[j][i])) !=0
                    return false;
                }
            }
        }
        return true;
    }
    /**
     * Returns the transpose of the Matrix object $V.
     * @param object $V Matrix instance
     * @return object transposed Matrix
     */
    public static function getTransposeOfV($V)
    {
        //return V.transpose();
        return $V->transpose();
    }
    /**
     * NOTE(review): the PHP body of getS() appears to have been lost --
     * only fragments of the Java prototype comments survive, and the token
     * stream runs straight into what was getV() (whose own signature is
     * also gone). Restore both from the upstream sources.
     */
    public static function getS($eigen)
    {
        //double[][] S = new double[eigen.length][eigen.length];
        //for(int i=0; i < eigen.length; i++)
        //{
        // double val = Math.abs(eigen[i]);
        // for(int j=0; j mitr = ev.listIterator();
        //
        //Matrix V = new Matrix(ev.size(), ev.size());
        //int j=0;
        //while(mitr.hasNext())
        //{
        // Matrix e = mitr.next();
        /* NOTE(review): the next fragment fuses a Java comment with what
           was the PHP assignment $V->data[$i][$j] = $e->data[$i][0]; the
           braces below are unbalanced as a result */
        // for(int i=0;idata[$i][$j] = $e->data[$i][0];
        }
        $j++;
        }
        return $V;
    }
    public static
function qRDecompose($M)
    {
        /* NOTE(review): the PHP bodies of qRDecompose() -- and seemingly of
           the lanczos(), matrixMultiplication(), inverse() and backSub()
           helpers that main() calls -- appear to have been lost; only Java
           prototype comments and the tail of a back-substitution loop
           remain below, with unbalanced braces. Restore from upstream. */
        //double[][][] arrays = new double[2][][];
        //
        //int n = M.length;
        //double[][] A = (double[][]) M.clone();
        //double[][] Q = new double[n][];
        //double[][] R = new double[n][n];
        //
        //int i = 0;
        //int j = 1;
        //while(i-1)
        // {
        // while(k -1) {
        while ($k < $m) {
            $sum = $sum + $rm[$k][$i] * $x[$k];
            $k++;
        }
        $x[$i] = ($b[$i] - $sum) / $rm[$i][$i];
        $k = $i;
        $i = $i - 1;
        $sum = floatval("0.0");
        }
        return $x;
    }
    /**
     * Returns the transpose of matrix $M (rows become columns).
     * @param array $M matrix as an array of row arrays
     * @return array transposed matrix
     */
    public static function transpose($M)
    {
        //int n = M.length;
        //int m = M[0].length;
        //double[][] A = new double[m][n];
        //
        //for (int i = 0; i < n; i++)
        // for (int j = 0; j < m; j++)
        // A[j][i] = M[i][j];
        //return A;
        $n = count($M);
        $m = count($M[0]);
        $A = array();
        for ($i = 0; $i < $n; $i++) {
            for ($j = 0; $j < $m; $j++) {
                $A[$j][$i] = $M[$i][$j];
            }
        }
        return $A;
    }
    /**
     * NOTE(review): declared as vectorPlus($u, $v) but the surviving body
     * reads $M and $num (neither is a parameter) and returns, per row, the
     * column index with the largest absolute value -- it looks like the
     * body of the getSentences() helper that main() calls was fused in
     * here when the real vectorPlus body was lost. Restore from upstream.
     */
    public static function vectorPlus($u, $v)
    {
        //int n = u.length;
        //double[] A = new double[n];
        //for(int i=0; i list = new LinkedList();
        //
        //int j=0;
        //for(int i=0; i < num; i++)
        //{
        // max = M[i][j];
        // for(int k = j+1; k < M[0].length; k++)
        // {
        // if(Math.abs(M[i][k]) > Math.abs(max))
        // {
        // j=k;
        // max = M[i][k];
        // }
        // }
        // list.add(new Integer(j));
        // j=0;
        //}
        //
        //return list;
        $max = 0.0;
        $list = array();
        $j = 0;
        for ($i = 0; $i < $num; $i++) {
            $max = $M[$i][$j];
            for ($k = $j + 1; $k < count($M[0]); $k++) {
                if(abs($M[$i][$k]) > abs($max)) {
                    $j = $k;
                    $max = $M[$i][$k];
                }
            }
            $list[] = $j;
            $j = 0;
        }
        return $list;
    }
    /**
     * Prints the sentences of self::$senMatrix whose indexes appear in
     * $sens, one per line.
     * @param array $sens list of sentence indexes
     */
    public static function printSentences($sens)
    {
        //for(Integer aa : sens)
        //{
        // //System.out.println(aa.intValue());
        // System.out.println(senMatrix[aa.intValue()]);
        //}
        for ($i = 0; $i < count($sens); $i++) {
            print(self::$senMatrix[$sens[$i]] .
"\n");
        }
    }
    /**
     * Builds the distinct, non-ignored word list self::$words from an
     * array of sentence strings. (The commented-out Java prototype that
     * used to precede the code was removed as dead code.)
     * @param array $senStrings sentences to tokenize
     */
    public static function words($senStrings)
    {
        self::$words = [];
        for ($i = 0; $i < count($senStrings); $i++) {
            // Tokenize on whitespace and common punctuation
            $parser = strtok($senStrings[$i], " \t\n\r\f.,;:!?'-()");
            while ($parser !== false) {
                $currentWord = $parser;
                //this should check for case mb_strtolower()
                if (!in_array($currentWord, self::$ignores, true) &&
                    !in_array($currentWord, self::$words, true)) {
                    self::$words[] = $currentWord;
                }
                $parser = strtok(" \t\n\r\f.,;:!?'-()");
            }
        }
    }
    /**
     * Fills self::$matrix with word/sentence occurrence counts: row $k is
     * a word (in sorted order), column $i a sentence, and the cell holds
     * how many times the word occurs in that sentence.
     * @param array $senStrings sentences to count word occurrences in
     */
    public static function makeWSMatrix($senStrings)
    {
        $slist = self::$words;
        asort($slist);
        $k = 0;
        foreach ($slist as $item) {
            for ($i = 0; $i < count($senStrings); $i++) {
                $line = $senStrings[$i];
                /* preg_quote so a word containing regex metacharacters
                   (".", "(", "+", ...) cannot break or distort the
                   pattern */
                $count = preg_match_all("/" . preg_quote($item, "/") .
"/u", $line, $matches);
                self::$matrix[$k][$i] = $count;
            }
            $k = $k + 1;
        }
    }
    //fills in the senMatrix string array and senten linked list
    /**
     * Splits $line into sentences on runs of ./?/! and stores the result
     * in both self::$senten and self::$senMatrix.
     * @param string $line text to split into sentences
     */
    private static function countLines($line)
    {
        //String[] sentences = line.split("[.?!]+\\s*");
        ///*
        //for(String a : sentences){
        // System.out.println(a);
        //}
        //*/
        //Pattern pat = Pattern.compile("[.?!]+\\s*");
        //Matcher mat = pat.matcher(line);
        //
        // int start = 0, end = 0;
        // while(mat.find()) {
        // start = end;
        // end = mat.end();
        // senten.add(line.substring(start, end));
        // }
        //senMatrix = new String[senten.size()];
        //int in=0;
        //for(String s: senten)
        //{
        // senMatrix[in] = s;
        // in++;
        //}
        self::$senten = preg_split(
            '/[.?!]+\\s*/ui', $line, 0, PREG_SPLIT_NO_EMPTY);
        self::$senMatrix = self::$senten;
    }
    /**
     * NOTE(review): the PHP body of makeFinalWSMatrix() appears to have
     * been lost -- only Java prototype comments survive, and the token
     * stream runs straight into what was makeIgnorelists() (whose own
     * signature is also gone). Restore both from the upstream sources.
     */
    public static function makeFinalWSMatrix()
    {
        //BufferedReader r = null;
        // String thisLine;
        //try {
        // r = new BufferedReader(new FileReader(".\\a.txt"));
        //} catch (FileNotFoundException e1) {
        // e1.printStackTrace();
        //}
        //String lines ="";
        //try
        //{
        // while ((thisLine = r.readLine()) != null)
        // {
        // lines = lines+thisLine;
        // }
        //}catch (IOException e)
        //{
        // e.printStackTrace();
        //}
        //countLines(lines);
        ////System.out.println(senMatrix.length+"");
        //words(senMatrix);
        ////System.out.println(words.size()+"");
        //SortedSet ss = new TreeSet(words);
        //Object [] slist = ss.toArray();
        //
        ///*
        //for(Object a: slist)
        //{
        // System.out.println(a);
        //}
        //*/
        //matrix = new double[words.size()][senMatrix.length];
        //makeWSMatrix(senMatrix);
        //
        //for(int m =0; m ();
        //BufferedReader r = null;
        //String thisLine;
        //try {
        // r = new BufferedReader(new FileReader(".\\ignore.txt"));
        //} catch (FileNotFoundException e1) {
        // e1.printStackTrace();
        //}
        //String lines ="";
        //try
        //{
        // while ((thisLine = r.readLine()) != null)
        // {
        // ignores.add(thisLine);
        // }
        //}catch (IOException e)
        //{
        // e.printStackTrace();
        //}
        //we need these to get the stop words based on the locale
        // NOTE(review): hard-coded Windows path -- should come from
        // configuration / the locale's tokenizer instead
        $r = file("c:/temp/ignore.txt");
        for ($i = 0; $i < count($r); $i++) {
$thisLine = $r[$i];
            // Trim the trailing newline file() leaves on each line
            self::$ignores[] = trim($thisLine);
        }
        /* A ~200-entry hard-coded English stop word list that lived here
           inside a block comment was removed as dead code; the list is
           loaded from disk above and should ultimately come from the
           locale's tokenizer. */
    }
}