From de8c8150d5fda9ea80c94110e51321dbef959b89 Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Sun, 9 Oct 2016 16:10:42 -0700 Subject: [PATCH 1/5] Integrating the question answer patch with the latest version of yioop --- src/controllers/components/CrawlComponent.php | 15 +- src/executables/ArcTool.php | 3 +- src/executables/Fetcher.php | 31 +- src/library/CrawlConstants.php | 1 + src/library/PhraseParser.php | 24 +- src/library/TripletExtractor.php | 1062 +++++++++++++++++++++++++ src/models/PhraseModel.php | 41 + src/views/SearchView.php | 17 +- src/views/elements/PageoptionsElement.php | 5 + 9 files changed, 1172 insertions(+), 27 deletions(-) create mode 100644 src/library/TripletExtractor.php diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index f7f5f9e..1efafea 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1346,7 +1346,7 @@ class CrawlComponent extends Component implements CrawlConstants PhraseParser::extractPhrasesInLists($phrase_string, $lang); $len = strlen($phrase_string); - if (PhraseParser::computeSafeSearchScore($word_lists, $len) < + if (PhraseParser::computeSafeSearchScore($word_lists['WORD_LIST'], $len) < 0.012) { $meta_ids[] = "safe:true"; $safe = true; @@ -1355,13 +1355,20 @@ class CrawlComponent extends Component implements CrawlConstants $safe = false; } } - if (!isset($word_lists)) { - $word_lists = []; + if (!isset($word_lists['WORD_LIST'])) { + $word_lists['WORD_LIST'] = []; } + + if (!isset($word_lists['QUESTION_ANSWER_LIST'])) { + $word_lists['QUESTION_ANSWER_LIST'] = []; + } + $data["EXTRACTED_WORDS"] = wordwrap($parent->clean( - print_r($word_lists, true), "string"), 75, "\n", true);; + print_r($word_lists, true), "string"), 75, "\n", true); $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean( print_r($meta_ids, true), "string"), 75, "\n", true); + $data["QUESTIONS_TRIPLET"] = wordwrap($parent->clean( + 
print_r($word_lists['QUESTION_ANSWER_LIST'], true), "string"), 75, "\n", true); } return $data; } diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index 6b93c0b..b6049cf 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -925,9 +925,10 @@ class ArcTool implements CrawlConstants mb_substr($site[self::DESCRIPTION], 0, C\AD_HOC_TITLE_LENGTH), $site[self::LANG]); } - $word_lists = + $triplet_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); + $word_lists = $triplet_lists['WORD_LIST']; $len = strlen($phrase_string); if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) { diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 25bbf7f..963f982 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -44,6 +44,7 @@ use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\processors\PageProcessor; use seekquarry\yioop\library\UrlParser; use seekquarry\yioop\library\WebArchiveBundle; +use seekquarry\yioop\library\TripletExtractor; if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();} ini_set("memory_limit", "1200M"); //so have enough memory to crawl sitemaps @@ -558,7 +559,6 @@ class Fetcher implements CrawlConstants $local_archives = [""]; while (CrawlDaemon::processHandler()) { $start_time = microtime(true); - $info = []; $fetcher_message_file = C\CRAWL_DIR. "/schedules/{$prefix}FetcherMessages.txt"; if (file_exists($fetcher_message_file)) { @@ -577,8 +577,6 @@ class Fetcher implements CrawlConstants if ($info[self::CRAWL_TIME] == 0) { $info[self::STATUS] = self::NO_DATA_STATE; $this->to_crawl = []; - } else { - L\crawlLog("Crawl time is now " . 
$this->crawl_time); } } else if ($this->crawl_type == self::ARCHIVE_CRAWL && $this->arc_type != "WebArchiveBundle" && @@ -657,7 +655,6 @@ class Fetcher implements CrawlConstants $this->to_crawl_again = []; $this->found_sites = []; gc_collect_cycles(); - $this->web_archive = new WebArchiveBundle($tmp_base_name, false); $this->crawl_time = $info[self::CRAWL_TIME]; @@ -665,7 +662,7 @@ class Fetcher implements CrawlConstants $this->sum_seen_description_length = 0; $this->sum_seen_site_link_length = 0; $this->num_seen_sites = 0; - L\crawlLog("New name: " . $this->web_archive->dir_name); + L\crawlLog("New name: ".$this->web_archive->dir_name); L\crawlLog("Switching archive..."); if (!isset($info[self::ARC_DATA])) { continue; @@ -984,11 +981,6 @@ class Fetcher implements CrawlConstants if (isset($info[self::CRAWL_TIME]) && ($info[self::CRAWL_TIME] != $this->crawl_time || $info[self::CRAWL_TIME] == 0)) { - if ($info[self::CRAWL_TIME] > 0) { - L\crawlLog("New Crawl Time Found: {$info[self::CRAWL_TIME]}"); - } else { - L\crawlLog("Crawl Time Changing to 0"); - } $dir = C\CRAWL_DIR."/schedules"; $time_change = true; /* Zero out the crawl. If haven't done crawl before, then scheduler @@ -1041,8 +1033,6 @@ class Fetcher implements CrawlConstants "{$this->crawl_time}.txt") && file_exists( "$dir/$prefix".self::fetch_batch_name. "{$this->crawl_time}.txt")) { - L\crawlLog("Loading old batches for ". - "{$this->crawl_time}."); $info = unserialize(file_get_contents( "$dir/$prefix".self::fetch_crawl_info. 
"{$this->crawl_time}.txt")); @@ -1486,6 +1476,7 @@ class Fetcher implements CrawlConstants */ public function getFetchSites() { + $web_archive = $this->web_archive; $start_time = microtime(true); $seeds = []; $delete_indices = []; @@ -1772,9 +1763,6 @@ class Fetcher implements CrawlConstants } $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]); - if (C\FETCHER_PROCESS_DELAY > 0 ) { - usleep(C\FETCHER_PROCESS_DELAY); - } if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) { $site[self::URL] = $tmp_url_store; @@ -2705,6 +2693,7 @@ class Fetcher implements CrawlConstants $this->video_sources); } $word_lists = []; + $triplet_lists = []; /* self::JUST_METAS check to avoid getting sitemaps in results for popular words @@ -2737,9 +2726,10 @@ class Fetcher implements CrawlConstants $lang = L\guessLocaleFromString( mb_substr($site[self::DESCRIPTION], 0, C\AD_HOC_TITLE_LENGTH), $site[self::LANG]); - $word_lists = + $triplet_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); + $word_lists = $triplet_lists['WORD_LIST']; $len = strlen($phrase_string); if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len) < @@ -2780,6 +2770,12 @@ class Fetcher implements CrawlConstants ]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, $word_lists, $meta_ids, PhraseParser::$materialized_metas, true, $doc_rank); + + if(isset($triplet_lists['QUESTION_ANSWER_LIST'])) { + $question_list = $triplet_lists['QUESTION_ANSWER_LIST']; + $site[self::QUESTION_TRIPLETS] = $question_list; + $this->found_sites[self::SEEN_URLS][$i] = $site; + } /* $this->no_process_links is set when doing things like mix recrawls. 
In this case links likely already will appear @@ -2831,9 +2827,10 @@ class Fetcher implements CrawlConstants $summary[self::LANG] = $lang; $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary; - $link_word_lists = + $link_lists = PhraseParser::extractPhrasesInLists($link_text, $lang); + $link_word_lists = $link_lists['WORD_LIST']; $link_meta_ids = PhraseParser::calculateLinkMetas($url, $link_host, $link_text, $site_url); if (!isset($this->found_sites[self::INVERTED_INDEX][ diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index dab8b7b..38bf0b9 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -231,4 +231,5 @@ interface CrawlConstants const CENTROID_WEIGHTED_SUMMARIZER = 'dt'; const SCRAPER_LABEL = 'du'; const SCRAPERS = 'dv'; + const QUESTION_TRIPLETS = 'dw'; } diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 128f53f..8f866e3 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -33,6 +33,7 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; use seekquarry\yioop\models\LocaleModel; use seekquarry\yioop\library\processors\PageProcessor; +use seekquarry\yioop\library\TripletExtractor; /** * For crawlHash @@ -210,7 +211,8 @@ class PhraseParser */ public static function extractPhrasesAndCount($string, $lang = null) { - $phrases = self::extractPhrasesInLists($string, $lang); + $triplet_list = self::extractPhrasesInLists($string, $lang); + $phrases = $triplet_list['WORD_LIST']; $phrase_counts = []; foreach ($phrases as $term => $positions) { $phrase_counts[$term] = count($positions); @@ -232,7 +234,20 @@ class PhraseParser if (!isset(self::$programming_language_map[$lang])) { self::canonicalizePunctuatedTerms($string, $lang); } - return self::extractMaximalTermsAndFilterPhrases($string, $lang); + $phrase_list = array(); + $word_lists = self::extractMaximalTermsAndFilterPhrases( + $string, $lang); + // COMMENT BELOW CODE 
IN IF TO DISABLE QUESTION ANSWERING SYSTEM + if (isset($word_lists)) { + $triplets_list = + TripletExtractor::storeStatementArraysAsTriplet($word_lists); + $word_lists = + array_replace($word_lists, $triplets_list['QUESTION_LIST']); /* not array_merge: it would renumber numeric term keys */ + $phrase_list['QUESTION_ANSWER_LIST'] = + $triplets_list['QUESTION_ANSWER_LIST']; + } + $phrase_list['WORD_LIST'] = $word_lists; + return $phrase_list; } /** * This functions tries to convert acronyms, e-mail, urls, etc into @@ -1254,8 +1269,9 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け return 0; } if ($unsafe_terms == []) { - $unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase, - "en-US"); + $triplet_list = PhraseParser::extractPhrasesInLists($unsafe_phrase, + "en-US"); + $unsafe_lists = $triplet_list['WORD_LIST']; $unsafe_terms = array_keys($unsafe_lists); } $num_unsafe_terms = 0; diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php new file mode 100644 index 0000000..2dd8bbc --- /dev/null +++ b/src/library/TripletExtractor.php @@ -0,0 +1,1062 @@ +. + * + * @author Chris Pollett chris@pollett.org + * @license http://www.gnu.org/licenses/ GPL3 + * @link http://www.seekquarry.com/ + * @copyright 2009 - 2015 + * @filesource + */ + +namespace seekquarry\yioop\library; + +use seekquarry\yioop\configs as C; +use seekquarry\yioop\library as L; + +/** + * + * @author Nirav Patel + * @package seekquarry\yioop\library + */ +class TripletExtractor +{ + + /** + * Any unique identifier of question word + * @question_word string identifier + */ + public static $question_word = "qqque"; + + /** + * Takes a phrase and tags each term in it with its part of speech. + * So each term in the original phrase gets mapped to term~part_of_speech + * This tagger is based on a Brill tagger. It makes use of a lexicon + * consisting of words from the Brown corpus together with a list of + * part of speech tags that that word had in the Brown Corpus. 
These are + * used to get an initial part of speech (in word was not present than + * we assume it is a noun). From this a fixed set of rules is used to modify + * the initial tag if necessary. + * + * @param string $phrase text to add parts speech tags to + * @return string $tagged_phrase phrase where each term has ~part_of_speech + * appended + */ + public static function tagPartsOfSpeechPhrase($phrase) + { + preg_match_all("/[\w\d]+/", $phrase, $matches); + $tagged_tokens = self::tagTokenizePartOfSpeech($phrase); + $tagged_phrase = self::taggedPartOfSpeechTokensToString( + $tagged_tokens); + return $tagged_phrase; + } + + /** + * Split input text into terms and output an array with one element + * per term, that element consisting of array with the term token + * and the part of speech tag. + * + * @param string $text string to tag and tokenize + * @return array of pairs of the form( "token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) for one each token in $text + */ + public static function tagTokenizePartOfSpeech($text) + { + static $lex_string = null; + if (!$lex_string) { + $lex_string = gzdecode(file_get_contents("lexicon.txt.gz")); + } + preg_match_all("/[\w\d]+/", $text, $matches); + $tokens = $matches[0]; + $nouns = array('NN', 'NNS', 'NNP'); + $verbs = array('VBD', 'VBP', 'VB'); + $result = array(); + $previous = array('token' => -1, 'tag' => -1); + $previous_token = -1; + sort($tokens); + $dictionary = array(); + /* + Notice we sorted the tokens, and notice how we use $cur_pos + so only advance forward through $lex_string. So the + run time of this is bound by at most one scan of $lex_string + */ + $cur_pos = 0; + foreach ($tokens as $token) { + $token = strtolower(rtrim($token, ".")); + $token_pos = stripos($lex_string, "\n" . $token . 
" ", $cur_pos); + if ($token_pos !== false) { + $token_pos++; + $cur_pos = stripos($lex_string, "\n", $token_pos); + $line = trim(substr($lex_string, $token_pos, + $cur_pos - $token_pos)); + $tag_list = explode(' ', $line); + $dictionary[strtolower(rtrim($token, "."))] = + array_slice($tag_list, 1); + $cur_pos++; + } + } + // now using our dictionary we tag + $i = 0; + $tag_list = array(); + foreach ($matches[0] as $token) { + $prev_tag_list = $tag_list; + $tag_list = array(); + // default to a common noun + $current = array('token' => $token, 'tag' => 'NN'); + // remove trailing full stops + $token = strtolower(rtrim($token, ".")); + if (isset($dictionary[$token])) { + $tag_list = $dictionary[$token]; + $current['tag'] = $tag_list[0]; + } + // Converts verbs after 'the' to nouns + if ($previous['tag'] == 'DT' && in_array($current['tag'], $verbs)) { + $current['tag'] = 'NN'; + } + // Convert noun to number if . appears + if ($current['tag'][0] == 'N' && strpos($token, '.') !== false) { + $current['tag'] = 'CD'; + } + $ends_with = substr($token, -2); + switch ($ends_with) { + case 'ed': + // Convert noun to past particle if ends with 'ed' + if ($current['tag'][0] == 'N') { + $current['tag'] = 'VBN'; + } + break; + case 'ly': + // Anything that ends 'ly' is an adverb + $current['tag'] = 'RB'; + break; + case 'al': + // Common noun to adjective if it ends with al + if (in_array($current['tag'], $nouns)) { + $current['tag'] = 'JJ'; + } + break; + } + // Noun to verb if the word before is 'would' + if ($current['tag'] == 'NN' && $previous_token == 'would') { + $current['tag'] = 'VB'; + } + // Convert common noun to gerund + if (in_array($current['tag'], $nouns) && + substr($token, -3) == 'ing' + ) { + $current['tag'] = 'VBG'; + } + //nouns followed by adjectives + if (in_array($previous['tag'], $nouns) && + $current['tag'] == 'JJ' && in_array('JJ', $prev_tag_list) + ) { + $result[$i - 1]['tag'] = 'JJ'; + $current['tag'] = 'NN'; + } + /* If we get noun noun, and the 
second can be a verb, + * convert to verb; if noun noun and previous could be an + * adjective convert to adjective + */ + if (in_array($previous['tag'], $nouns) && + in_array($current['tag'], $nouns) + ) { + if (in_array('VBN', $tag_list)) { + $current['tag'] = 'VBN'; + } else { + if (in_array('VBZ', $tag_list)) { + $current['tag'] = 'VBZ'; + } else { + if (in_array('JJ', $prev_tag_list)) { + $result[$i - 1]['tag'] = 'JJ'; + } + } + } + } + $result[$i] = $current; + $i++; + $previous = $current; + $previous_token = $token; + } + return $result; + } + + /** + * Takes an array of pairs (token, tag) that came from phrase + * and builds a new phrase where terms look like token~tag. + * + * @param array $tagged_tokens array of pairs as might come from tagTokenize + * @return string $tagged_phrase a phrase with terms in the format token~tag + */ + public static function taggedPartOfSpeechTokensToString($tagged_tokens) + { + $tagged_phrase = ""; + $simplified_parts_of_speech = array( + "NN" => "NN", + "NNS" => "NN", + "NNP" => "NN", + "NNPS" => "NN", + "PRP" => "NN", + 'PRP$' => "NN", + "WP" => "NN", + "VB" => "VB", + "VBD" => "VB", + "VBN" => "VB", + "VBP" => "VB", + "VBZ" => "VB", + "JJ" => "AJ", + "JJR" => "AJ", + "JJS" => "AJ", + "RB" => "AV", + "RBR" => "AV", + "RBS" => "AV", + "WRB" => "AV" + ); + foreach ($tagged_tokens as $t) { + $tag = trim($t['tag']); + $tag = (isset($simplified_parts_of_speech[$tag])) ? + $simplified_parts_of_speech[$tag] : $tag; + $tagged_phrase .= $t['token'] . "~" . $tag . " "; + } + return $tagged_phrase; + } + /** + * Takes a statement and return the array of words from text + * along with the tags of Part Of Speech + * + * @param $text any statement + * @return array words tagged with POS tags + */ + public static function partOfSpeechTagger_Brill($text) + { + static $dict = null; + $lexicon = C\LOCALE_DIR . 
"/en_US/resources/lexicon.txt"; + if (!$dict) { + $fh = fopen($lexicon, 'r'); + while ($line = fgets($fh)) { + $tags = explode(' ', $line); + $dict[strtolower(array_shift($tags))] = $tags; + } + fclose($fh); + } + preg_match_all("/[\w\d\.]+/", $text, $matches); + $nouns = array('NN', 'NNS'); + $return = array(); + $i = 0; + foreach ($matches[0] as $token) { + // default to a common noun + $return[$i] = array('token' => $token, 'tag' => 'NN'); + // remove trailing full stops + if (substr($token, -1) == '.') { + $token = preg_replace('/\.+$/', '', $token); + } + // get from dict if set + if (isset($dict[strtolower($token)])) { + $return[$i]['tag'] = trim($dict[strtolower($token)][0]); + } + // Converts verbs after 'the' to nouns + if ($i > 0) { + if ($return[$i - 1]['tag'] == 'DT' && + in_array($return[$i]['tag'], + array('VBD', 'VBP', 'VB')) + ) { + $return[$i]['tag'] = 'NN'; + } + } + // Convert noun to number if . appears + if ($return[$i]['tag'][0] == 'N' && strpos($token, '.') !== false) { + $return[$i]['tag'] = 'CD'; + } + // Convert noun to past particile if ends with 'ed' + if ($return[$i]['tag'][0] == 'N' && substr($token, -2) == 'ed') { + $return[$i]['tag'] = 'VBN'; + } + // Anything that ends 'ly' is an adverb + if (substr($token, -2) == 'ly') { + $return[$i]['tag'] = 'RB'; + } + // Common noun to adjective if it ends with al + if (in_array($return[$i]['tag'], $nouns) + && substr($token, -2) == 'al' + ) { + $return[$i]['tag'] = 'JJ'; + } + // Noun to verb if the word before is 'would' + if ($i > 0) { + if ($return[$i]['tag'] == 'NN' + && strtolower($return[$i - 1]['token']) == 'would' + ) { + $return[$i]['tag'] = 'VB'; + } + } + // Convert noun to plural if it ends with an s + if ($return[$i]['tag'] == 'NN' && substr($token, -1) == 's') { + $return[$i]['tag'] = 'NNS'; + } + // Convert common noun to gerund + if (in_array($return[$i]['tag'], $nouns) + && substr($token, -3) == 'ing' + ) { + $return[$i]['tag'] = 'VBG'; + } + // If we get noun, and the second 
can be a verb, convert to verb + if ($i > 0) { + if (in_array($return[$i]['tag'], $nouns) + && in_array($return[$i - 1]['tag'], $nouns) + && isset($dict[strtolower($token)]) + ) { + if (in_array('VBN', $dict[strtolower($token)])) { + $return[$i]['tag'] = 'VBN'; + } else { + if (in_array('VBZ', + $dict[strtolower($token)])) { + $return[$i]['tag'] = 'VBZ'; + } + } + } + } + $i++; + } + return $return; + } + + /** + * Takes tagged array from the Part Of Speech tagger and + * returns tree generated from the tagged statement + * + * @param $tagger_array tagged array from part of speech tagger + * @return array formed tree array + */ + public static function generateParseTreeUsingRDP($tagger_array) + { + $tree = array(); + $tree = ["cur_node" => 0]; + $tree_np = TripletExtractor::extractNPUsingRDP($tagger_array, $tree); + $tree = ["cur_node" => $tree_np['cur_node']]; + $tree_vp = TripletExtractor::extractVPUsingRDP($tagger_array, $tree); + if ($tree == $tree_vp) { + return $tree; + } + $tree['cur_node'] = $tree_vp['cur_node']; + unset($tree_np['cur_node']); + unset($tree_vp['cur_node']); + $tree['NP'] = $tree_np['NP']; + $tree['VP'] = $tree_vp['VP']; + return $tree; + } + + /** + * Takes tree generated till Statement and returns + * tree by adding Noun Phrase subtree to it + * + * @param $tagger_array tagged array + * @param $tree current tree + * @return mixed tree with NP node + */ + public static function extractNPUsingRDP($tagger_array, $tree) + { + $NP = $tree['cur_node']; + $tree_dt = ['cur_node' => $NP]; + $tree_dt = TripletExtractor::extractDTUsingRDP($tagger_array, $tree_dt); + $tree_jj = ['cur_node' => $tree_dt['cur_node']]; + $tree_jj = TripletExtractor::extractJJUsingRDP($tagger_array, $tree_jj); + $tree_nn = ['cur_node' => $tree_jj['cur_node']]; + $tree_nn = TripletExtractor::extractNNUsingRDP($tagger_array, $tree_nn); + if ($NP == $tree_nn['cur_node']) { + $tree['NP'] = ""; + } else { + $cur_node = $tree_nn['cur_node']; + unset($tree_dt['cur_node']); + 
$tree_new_sub['DT'] = $tree_dt; + unset($tree_jj['cur_node']); + $tree_new_sub['JJ'] = $tree_jj; + unset($tree_nn['cur_node']); + $tree_new_sub['NN'] = $tree_nn; + $tree_new['cur_node'] = $cur_node; + $tree_new['NP'] = $tree_new_sub; + return $tree_new; + } + return $tree; + } + + /** + * Takes current tree and returns + * tree by adding determiner node to it + * + * @param $tagger_array pos tagged array + * @param $tree current tree + * @return mixed DT added tree + */ + public static function extractDTUsingRDP($tagger_array, $tree) + { + if (isset($tagger_array[$tree['cur_node']]['tag']) && + ("DT" == $tagger_array[$tree['cur_node']]['tag']) + ) { + $tree['DT'] = $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + return $tree; + } + return $tree; + } + /** + * Takes current tree and returns + * tree by adding adjective node to it + * + * @param $tagger_array POS tagged array + * @param $tree current tree + * @return mixed JJ added tree + */ + public static function extractJJUsingRDP($tagger_array, $tree) + { + $adjective_string = ""; + while (isset($tagger_array[$tree['cur_node']]['tag']) && + ("JJ" == $tagger_array[$tree['cur_node']]['tag'] || + "JJR" == $tagger_array[$tree['cur_node']]['tag'] || + "JJS" == $tagger_array[$tree['cur_node']]['tag'])) { + $adjective_string .= " " . 
+ $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + if ("" != $adjective_string) { + $tag = "JJ"; + $tree[$tag] = $adjective_string; + } + return $tree; + } + /** + * Takes current tree and returns + * tree by adding Noun node to it + * + * @param $tagger_array POS tagged array + * @param $tree current generated tree + * @return mixed NN added tree + */ + public static function extractNNUsingRDP($tagger_array, $tree) + { + //Combining multiple noun into one + $noun_string = ""; + while (isset($tagger_array[$tree['cur_node']]['tag']) && ( + "NN" == $tagger_array[$tree['cur_node']]['tag'] || + "NNS" == $tagger_array[$tree['cur_node']]['tag'] || + "NNP" == $tagger_array[$tree['cur_node']]['tag'] || + "NNPS" == $tagger_array[$tree['cur_node']]['tag'] || + "PRP" == $tagger_array[$tree['cur_node']]['tag'])) { + $noun_string .= " " . $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + if ("" != $noun_string) { + $tag = "NN"; + $tree[$tag] = $noun_string; + } + // Checking for preposition + if (isset($tagger_array[$tree['cur_node']]['tag']) && + ("IN" == $tagger_array[$tree['cur_node']]['tag'])) { + $tag = $tagger_array[$tree['cur_node']]['tag']; + $preposition_string = $tagger_array[$tree['cur_node']]['token']; + $tree[$tag] = $preposition_string; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + if (isset($tagger_array[$tree['cur_node']]['tag']) && + ("DT" == $tagger_array[$tree['cur_node']]['tag'])) { + $tree['DT_1'] = $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + $adjective_string = ""; + while (isset($tagger_array[$tree['cur_node']]['tag']) && + ("JJ" == $tagger_array[$tree['cur_node']]['tag'] || + "JJR" == $tagger_array[$tree['cur_node']]['tag'] || + "JJS" == $tagger_array[$tree['cur_node']]['tag'])) { + $adjective_string .= " " . 
+ $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + if ("" != $adjective_string) { + $tag = "JJ_1"; + $tree[$tag] = $adjective_string; + } + $propernoun_string = ""; + while (isset($tagger_array[$tree['cur_node']]['tag']) && + ("NN" == $tagger_array[$tree['cur_node']]['tag'] || + "NNS" == $tagger_array[$tree['cur_node']]['tag'] || + "NNP" == $tagger_array[$tree['cur_node']]['tag'] || + "NNPS" == $tagger_array[$tree['cur_node']]['tag'] || + "PRP" == $tagger_array[$tree['cur_node']]['tag'])) { + $propernoun_string .= " " . + $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + if ("" != $propernoun_string) { + $tag = "NNP"; + $tree[$tag] = $propernoun_string; + } + return $tree; + } + + /** + * @param $tagger_array POS tagged array + * @param $tree current tree + * @return mixed VP added tree + */ + public static function extractVPUsingRDP($tagger_array, $tree) + { + $VP = $tree['cur_node']; + $tree_vp = ['cur_node' => $VP]; + $tree_vb = ['cur_node' => $tree_vp['cur_node']]; + $tree_vb = TripletExtractor::extractVBUsingRDP($tagger_array, $tree_vb); + if ($VP == $tree_vb['cur_node']) { + return $tree; + } + $tree_np = ['cur_node' => $tree_vb['cur_node']]; + $tree_np = TripletExtractor::extractNPUsingRDP($tagger_array, $tree_np); + if ($VP == $tree_np['cur_node']) { + } else { + $cur_node = $tree_np['cur_node']; + unset($tree_vb['cur_node']); + unset($tree_np['cur_node']); + $tree_new_sub['VB'] = $tree_vb; + $tree_new_sub['NP'] = $tree_np['NP']; + $tree_new['cur_node'] = $cur_node; + $tree_new['VP'] = $tree_new_sub; + return $tree_new; + } + return $tree; + } + + /** + * @param $tagger_array POS tagged tree + * @param $tree current tree + * @return mixed VB added tree + */ + public static function extractVBUsingRDP($tagger_array, $tree) + { + while (isset($tagger_array[$tree['cur_node']]['tag']) && + ("VB" != $tagger_array[$tree['cur_node']]['tag'] && + "VBD" != 
$tagger_array[$tree['cur_node']]['tag'] && + "VBG" != $tagger_array[$tree['cur_node']]['tag'] && + "VBN" != $tagger_array[$tree['cur_node']]['tag'] && + "VBP" != $tagger_array[$tree['cur_node']]['tag'] && + "VBZ" != $tagger_array[$tree['cur_node']]['tag'])) { + $tree['cur_node'] = $tree['cur_node'] + 1; + } + $verb_string = ""; + while (isset($tagger_array[$tree['cur_node']]['tag']) && + ("VB" == $tagger_array[$tree['cur_node']]['tag'] || + "VBD" == $tagger_array[$tree['cur_node']]['tag'] || + "VBG" == $tagger_array[$tree['cur_node']]['tag'] || + "VBN" == $tagger_array[$tree['cur_node']]['tag'] || + "VBP" == $tagger_array[$tree['cur_node']]['tag'] || + "VBZ" == $tagger_array[$tree['cur_node']]['tag'])) { + $verb_string .= " " . $tagger_array[$tree['cur_node']]['token']; + $tree['cur_node'] = $tree['cur_node'] + 1; + } + if ("" != $verb_string) { + $tag = "VB"; + $tree[$tag] = $verb_string; + } + return $tree; + } + /** + * @param $tree fully generated tree + * @return array triplet array + */ + public static function extractTriplet($tree) + { + $triplet = array(); + $triplet['subject'] = TripletExtractor::extractSubjectFromTree($tree); + $triplet['predicate'] = + TripletExtractor::extractPredicateFromTree($tree); + $triplet['object'] = TripletExtractor::extractObjectFromTree($tree); + return $triplet; + } + + /** + * @param $triplet_tree any statement + * @return array processed triplet + */ + public static function processTripletForStorage($triplet_tree) + { + $processed_triplet = array(); + $processed_triplet['RAW'] = + TripletExtractor::getRawTripletForStorage($triplet_tree); + $processed_triplet['FEATURED'] = + TripletExtractor::getFeaturedTripletForStorage($triplet_tree); + return $processed_triplet; + } + + /** + * @param $triplet_tree triplet array + * @return array raw triplet array + */ + public static function getRawTripletForStorage($triplet_tree) + { + $raw_triplet = array(); + $question_answer_triplet = array(); + if 
(isset($triplet_tree['subject']['RAW']) + && isset($triplet_tree['predicate']['RAW']) + && isset($triplet_tree['object']['RAW']) + && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['RAW']) + && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['RAW']) + && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['RAW']) + ) { + + $SUBJECT = trim($triplet_tree['subject']['RAW']); + $PREDICATE = trim($triplet_tree['predicate']['RAW']); + $OBJECT = trim($triplet_tree['object']['RAW']); + + $raw_triplet['SUBJECT'] = + TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT; + $raw_triplet['PREDICATE'] = + $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT; + $raw_triplet['OBJECT'] = + $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word; + + $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT; + $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; + $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . 
TripletExtractor::$question_word] = $OBJECT; + $raw_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet; + } + return $raw_triplet; + } + + /** + * @param $triplet_tree triplet array + * @return array featured triplet array + */ + public static function getFeaturedTripletForStorage($triplet_tree) + { + $featured_triplet = array(); + $question_answer_triplet = array(); + if (isset($triplet_tree['subject']['FEATURED']) + && isset($triplet_tree['predicate']['FEATURED']) + && isset($triplet_tree['object']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['FEATURED']) + ) { + $SUBJECT = trim($triplet_tree['subject']['FEATURED']); + $PREDICATE = trim($triplet_tree['predicate']['FEATURED']); + $OBJECT = trim($triplet_tree['object']['FEATURED']); + + $featured_triplet['SUBJECT'] = + TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT; + $featured_triplet['PREDICATE'] = + $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT; + $featured_triplet['OBJECT'] = + $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word; + + $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT; + $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; + $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . 
TripletExtractor::$question_word] = $OBJECT; + + $featured_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet; + } + return $featured_triplet; + } + + /** + * @param $string any string + * @return bool true if null of empty string + */ + public static function IsNullOrEmptyString($string) + { + return (!isset($string) || trim($string) === ''); + } + + /** + * @param $tree generated tree + * @return array subject array + */ + public static function extractSubjectFromTree($tree) + { + $subject = array(); + if (isset($tree['NP']) && $tree['NP'] != null) { + $tree_np = $tree['NP']; + $value = TripletExtractor::extractFirstNounFromNPTree($tree_np); + $subject['RAW'] = $value; + $featured_subject = ""; + $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_np)); + foreach ($it as $v) { + $featured_subject .= $v . " "; + } + $subject['FEATURED'] = $featured_subject; + } else { + $subject['RAW'] = ""; + $subject['FEATURED'] = ""; + } + return $subject; + } + + /** + * @param $tree generated tree + * @return array predicate array + */ + public static function extractPredicateFromTree($tree) + { + $predicate = array(); + if (isset($tree['VP']) && $tree['VP'] != null) { + $tree_vp = $tree['VP']; + $value = TripletExtractor::extractDeepestVerbFromVBTree($tree_vp); + $predicate['RAW'] = $value; + $featured_predicate = ""; + if (isset($tree_vp['VB']) && $tree_vp['VB'] != null) { + $tree_vb = $tree_vp['VB']; + $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_vb)); + foreach ($it as $v) { + $featured_predicate .= $v . 
" "; + } + $predicate['FEATURED'] = $featured_predicate; + } + } else { + $predicate['RAW'] = ""; + $predicate['FEATURED'] = ""; + } + return $predicate; + } + + /** + * @param $tree generated tree + * @return array object array + */ + public static function extractObjectFromTree($tree) + { + $object = array(); + if (isset($tree['VP']) && $tree['VP'] != null) { + $tree_vp = $tree['VP']; + if (isset($tree_vp['NP']) && $tree_vp['NP'] != null) { + $nb = $tree_vp['NP']; + $value = TripletExtractor::extractFirstNounFromNPTree($nb); + $object['RAW'] = $value; + $featured_object = ""; + $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($nb)); + foreach ($it as $v) { + $featured_object .= $v . " "; + } + $object['FEATURED'] = $featured_object; + } else { + $object['RAW'] = ""; + $object['FEATURED'] = ""; + } + } else { + $object['RAW'] = ""; + $object['FEATURED'] = ""; + } + return $object; + } + + /** + * @param $tree_np noun phrase subtree + * @return string first noun + */ + public static function extractFirstNounFromNPTree($tree_np) + { + if (isset($tree_np['NN']) && $tree_np['NN'] != null) { + $nn = $tree_np['NN']; + if (isset($nn['NN']) && $nn['NN'] != null) { + $nn = $nn['NN']; + return $nn; + } + } + return ""; + } + + /** + * @param $tree_vp verb phrase subtree + * @return string deepest verb + */ + public static function extractDeepestVerbFromVBTree($tree_vp) + { + if (isset($tree_vp['VB']) && $tree_vp['VB'] != null) { + $vb = $tree_vp['VB']; + if ($vb['VB'] != null) { + $vb = $vb['VB']; + return $vb; + } + } + return ""; + } + + /** + * @param $tree generated tree + * @return array attributes array + */ + public static function extractAttributes($tree) + { + $attribute_map = array(); + if (isset($tree['JJ']) && count($tree['JJ']) > 0) { + $attribute_map['JJ'] = $tree['JJ']['JJ']; + } + + if (isset($tree['NN']) && count($tree['NN']) > 0) { + + $nn_tree = $tree['NN']; + + if (isset($nn_tree['IN']) && $nn_tree['IN'] !== "") { +
$attribute_map['IN'] = $nn_tree['IN']; + } + + if (isset($nn_tree['JJ_1']) && $nn_tree['JJ_1'] !== "") { + $attribute_map['JJ_1'] = $nn_tree['JJ_1']; + } + + if (isset($nn_tree['NNP']) && $nn_tree['NNP'] !== "") { + $attribute_map['NNP'] = $nn_tree['NNP']; + } + } + return $attribute_map; + } + + /** + * @param $statement any statement + * @return array processed triplet + */ + public static function storeStatementAsTriplet($statement) + { + try { + $tagged_statement = + TripletExtractor::partOfSpeechTagger_Brill($statement); + $statement_tree = + TripletExtractor::generateParseTreeUsingRDP($tagged_statement); + $triplet_tree = TripletExtractor::extractTriplet($statement_tree); + return TripletExtractor::processTripletForStorage($triplet_tree); + } catch (\Exception $e) { + return array(); + } + } + + /** + * @param $statement_array array of statements + * @return array list of triplets + */ + public static function storeStatementArraysAsTriplet($statement_array) + { + $triplets_list = array(); + $question_list = array(); + $question_answer_list = array(); + foreach ($statement_array as $key => $value) { + try { + if (str_word_count($key) >= 3) { + $extracted_triplet = TripletExtractor::storeStatementAsTriplet($key); + + if (isset($extracted_triplet['RAW']) && + sizeof($extracted_triplet['RAW']) > 0) { + $question_list[$extracted_triplet['RAW']['SUBJECT']] = $value; + $question_list[$extracted_triplet['RAW']['PREDICATE']] = $value; + $question_list[$extracted_triplet['RAW']['OBJECT']] = $value; + $question_answer_list = array_merge($question_answer_list, + $extracted_triplet['RAW']['QUESTION_ANSWER_LIST']); + } + + if (isset($extracted_triplet['FEATURED']) && + sizeof($extracted_triplet['FEATURED']) > 0) { + $question_list[$extracted_triplet['FEATURED']['SUBJECT']] = $value; + $question_list[$extracted_triplet['FEATURED']['PREDICATE']] = $value; + $question_list[$extracted_triplet['FEATURED']['OBJECT']] = $value; + $question_answer_list = array_merge($question_answer_list, 
+ $extracted_triplet['FEATURED']['QUESTION_ANSWER_LIST']); + } + } + } catch (\Exception $e) { + echo 'Caught exception: ', $e->getMessage(), "\n"; + continue; + } + } + $triplets_list['QUESTION_LIST'] = $question_list; + $triplets_list['QUESTION_ANSWER_LIST'] = $question_answer_list; + return $triplets_list; + } + + /** + * Takes any question started with WH question and returns the + * triplet from the question + * + * @param $question_string question string + * @return array question triplet + */ + public static function questionParser($question_string) + { + $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill( + $question_string); + $index = 0; + $generated_question_array = array(); + if (isset($question_string_tagged[$index]) && + ("WRB" == trim($question_string_tagged[$index]['tag']) || + "WP" == trim($question_string_tagged[$index]['tag'])) + ) { + if ("WHO" == strtoupper( + trim($question_string_tagged[$index]['token']))) { + $index = $index + 1; + $generated_question_array = + TripletExtractor::parseWHOQuestion( + $question_string_tagged, $index); + } else { + if ("WHERE" == strtoupper( + trim($question_string_tagged[$index]['token'])) || + "WHEN" == strtoupper( + trim($question_string_tagged[$index]['token'])) || + "WHAT" == strtoupper( + trim($question_string_tagged[$index]['token'])) + ) { + $index = $index + 1; + $generated_question_array = + TripletExtractor::parseWHPlusQuestion_New( + $question_string_tagged, + $index); + } + } + } + return $generated_question_array; + } + + /** + * Takes tagged question string starts with Who + * and returns question triplet from the question string + * + * @param $question_string_tagged tagged question statement + * @param $index current index in statement + * @return array parsed triplet + */ + public static function parseWHOQuestion($question_string_tagged, $index) + { + $generated_question_array = array(); + $tree = ["cur_node" => $index]; + $tree['NP'] = "WHO"; + $triplet = array(); + $tree_vp 
= TripletExtractor::extractVPUsingRDP( + $question_string_tagged, $tree); + $triplet['predicate'] = TripletExtractor::extractPredicateFromTree( + $tree_vp); + $triplet['object'] = TripletExtractor::extractObjectFromTree( + $tree_vp); + if (isset($triplet['object']['RAW']) + && isset($triplet['predicate']['RAW']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['object']['RAW']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['predicate']['RAW']) + ) { + $generated_question_array['RAW']['1'] = + trim($triplet['object']['RAW']) . + " " . trim($triplet['predicate']['RAW']) . + " " . trim(TripletExtractor::$question_word); + $generated_question_array['RAW']['2'] = + trim(TripletExtractor::$question_word) . + " " . trim($triplet['predicate']['RAW']) . + " " . trim($triplet['object']['RAW']); + } + if (isset($triplet['object']['FEATURED']) + && isset($triplet['predicate']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['object']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['predicate']['FEATURED']) + ) { + $generated_question_array['FEATURED']['1'] = + trim($triplet['object']['FEATURED']) . + " " . trim($triplet['predicate']['FEATURED']) . + " " . trim(TripletExtractor::$question_word); + $generated_question_array['FEATURED']['2'] = + trim($triplet['object']['FEATURED']) . + " " . trim($triplet['predicate']['FEATURED']) . + " " . 
trim(TripletExtractor::$question_word); + } + return $generated_question_array; + } + + /** + * Takes tagged question string starts with Wh+ except Who + * and returns question triplet from the question string + * + * @param $question_string_tagged tagged question statement + * @param $index current index in statement + * @return array parsed triplet + */ + public static function parseWHPlusQuestion($question_string_tagged, $index) + { + $generated_question_array = array(); + $aux_verb = ""; + while (isset($question_string_tagged[$index]) && + ("VB" == trim($question_string_tagged[$index]['tag']) || + "VBD" == trim($question_string_tagged[$index]['tag']) || + "VBG" == trim($question_string_tagged[$index]['tag']) || + "VBN" == trim($question_string_tagged[$index]['tag']) || + "VBP" == trim($question_string_tagged[$index]['tag']) || + "VBZ" == trim($question_string_tagged[$index]['tag']))) { + $aux_verb .= " " . trim($question_string_tagged[$index]['token']); + $index = $index + 1; + } + $tree = ["cur_node" => $index]; + $tree['NP'] = "WHPlus"; + $triplet = array(); + $tree_np = TripletExtractor::extractNPUsingRDP( + $question_string_tagged, $tree); + $triplet['subject'] = TripletExtractor::extractSubjectFromTree( + $tree_np); + $tree_vp = TripletExtractor::extractVPUsingRDP( + $question_string_tagged, $tree); + $triplet['predicate'] = TripletExtractor::extractPredicateFromTree( + $tree_vp); + $triplet['object'] = TripletExtractor::extractObjectFromTree( + $tree_vp); + if (isset($aux_verb) + && !TripletExtractor::IsNullOrEmptyString($aux_verb) + ) { + $triplet['predicate']['RAW'] = trim($aux_verb) . + " " . $triplet['predicate']['RAW']; + if (!isset($triplet['predicate']['FEATURED'])) { + $triplet['predicate']['FEATURED'] = ""; + } + $triplet['predicate']['FEATURED'] = trim($aux_verb) . + " " . 
$triplet['predicate']['FEATURED']; + } + if (isset($triplet['subject']['RAW']) + && isset($triplet['predicate']['RAW']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['subject']['RAW']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['predicate']['RAW']) + ) { + $generated_question_array['RAW']['1'] = + trim($triplet['subject']['RAW']) . + " " . trim($triplet['predicate']['RAW']) . + " " . trim(TripletExtractor::$question_word); + $generated_question_array['RAW']['2'] = + trim(TripletExtractor::$question_word) . + " " . trim($triplet['predicate']['RAW']) . + " " . trim($triplet['subject']['RAW']); + } + if (isset($triplet['subject']['FEATURED']) + && isset($triplet['predicate']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['subject']['FEATURED']) + && !TripletExtractor::IsNullOrEmptyString( + $triplet['predicate']['FEATURED']) + ) { + $generated_question_array['FEATURED']['1'] = + trim($triplet['subject']['FEATURED']) . + " " . trim($triplet['predicate']['FEATURED']) . + " " . trim(TripletExtractor::$question_word); + $generated_question_array['FEATURED']['2'] = + trim(TripletExtractor::$question_word) . + " " . trim($triplet['predicate']['FEATURED']) . + " " . 
trim($triplet['subject']['FEATURED']); + } + return $generated_question_array; + } +} \ No newline at end of file diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index fd2ab14..c33e644 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -38,6 +38,7 @@ use seekquarry\yioop\library\IndexManager; use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\Thesaurus; use seekquarry\yioop\library\index_bundle_iterators as I; +use seekquarry\yioop\library\TripletExtractor; /** * logging is done during crawl not through web, @@ -398,6 +399,21 @@ class PhraseModel extends ParallelModel if (isset($out_results['PAGES'][$out_count])) { $results['PAGES'][$i] = $out_results['PAGES'][$out_count]; + + if (isset($out_results['PAGES'][$out_count] + [self::QUESTION_TRIPLETS])) { + $triplets_with_answer = + $out_results['PAGES'][$out_count] + [self::QUESTION_TRIPLETS]; + $question = trim($phrase); + + if (isset($triplets_with_answer[$question])) { + $out_results['PAGES'][$out_count]['ANSWER'] + = $triplets_with_answer[$question]; + } + } + $results['PAGES'][$i] = + $out_results['PAGES'][$out_count]; $out_count++; } } @@ -899,8 +915,33 @@ class PhraseModel extends ParallelModel } } } + if ($this->isQuestion($phrase)) { + $generated_question = TripletExtractor::questionParser(trim($phrase)); + if(isset($generated_question['FEATURED'])){ + $phrase = $generated_question['FEATURED']['1']; + }else if(isset($generated_question['RAW'])){ + $phrase = $generated_question['RAW']['1']; + } + } return $phrase; } + + /** + * Takes a phrase query entered by user and return true if it is question + * and false if not + * + * @param $phrase any statement + * @return bool returns true if statement is question + */ + public function isQuestion($phrase) + { + $regex_starts_with_que = "/^(who|what|which|where|when|whose|whome|how)(.*)$/"; + $regex_ends_with_que = "/^(.*)\?$/"; // Not in use + if (preg_match($regex_starts_with_que, trim($phrase))) { 
+ return true; + } + return false; + } /** * Matches terms (non white-char strings) in the language $lang_tag in * $phrase that begin with $start_with and don't contain $not_contain, diff --git a/src/views/SearchView.php b/src/views/SearchView.php index abcda33..19df84c 100755 --- a/src/views/SearchView.php +++ b/src/views/SearchView.php @@ -334,8 +334,23 @@ class SearchView extends View implements CrawlConstants is_array($page[self::WORD_CLOUD])) { ?>

> + + > + ". + "Possible Answer:"."");?> + + + + + ". tl('search_view_word_cloud').""); diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php index 16a7ed7..3acfd4c 100644 --- a/src/views/elements/PageoptionsElement.php +++ b/src/views/elements/PageoptionsElement.php @@ -403,6 +403,11 @@ class PageOptionsElement extends Element e("

".tl('pageoptions_element_extracted_words')."

"); e("
\n{$data['EXTRACTED_WORDS']}\n
"); } + if (isset($data["QUESTIONS_TRIPLET"])) { + e("

" . tl('pageoptions_element_extracted_questions') . + "

"); + e("
\n{$data['QUESTIONS_TRIPLET']}\n
"); + } if (isset($data["EXTRACTED_META_WORDS"])) { e("

".tl('pageoptions_element_extracted_metas')."

"); e("
\n{$data['EXTRACTED_META_WORDS']}\n
"); -- 2.10.0.windows.1 From b5f477f2e40a02770887101f0e06dd1df0ddb6e4 Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Sat, 22 Oct 2016 15:33:18 -0700 Subject: [PATCH 2/5] Replaced the use of array() by [] --- src/executables/Fetcher.php | 15 ++++++++- src/library/TripletExtractor.php | 69 ++++++++++++++++++++-------------------- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 963f982..53f2908 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -15,7 +15,7 @@ * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU General Public License for more details.89 * * You should have received a copy of the GNU General Public License * along with this program. If not, see . @@ -559,6 +559,7 @@ class Fetcher implements CrawlConstants $local_archives = [""]; while (CrawlDaemon::processHandler()) { $start_time = microtime(true); + $info = []; $fetcher_message_file = C\CRAWL_DIR. "/schedules/{$prefix}FetcherMessages.txt"; if (file_exists($fetcher_message_file)) { @@ -577,6 +578,8 @@ class Fetcher implements CrawlConstants if ($info[self::CRAWL_TIME] == 0) { $info[self::STATUS] = self::NO_DATA_STATE; $this->to_crawl = []; + } else { + L\crawlLog("Crawl time is now " . 
$this->crawl_time); } } else if ($this->crawl_type == self::ARCHIVE_CRAWL && $this->arc_type != "WebArchiveBundle" && @@ -981,6 +984,11 @@ class Fetcher implements CrawlConstants if (isset($info[self::CRAWL_TIME]) && ($info[self::CRAWL_TIME] != $this->crawl_time || $info[self::CRAWL_TIME] == 0)) { + if ($info[self::CRAWL_TIME] > 0) { + L\crawlLog("New Crawl Time Found: {$info[self::CRAWL_TIME]}"); + } else { + L\crawlLog("Crawl Time Changing to 0"); + } $dir = C\CRAWL_DIR."/schedules"; $time_change = true; /* Zero out the crawl. If haven't done crawl before, then scheduler @@ -1033,6 +1041,8 @@ class Fetcher implements CrawlConstants "{$this->crawl_time}.txt") && file_exists( "$dir/$prefix".self::fetch_batch_name. "{$this->crawl_time}.txt")) { + L\crawlLog("Loading old batches for ". + "{$this->crawl_time}."); $info = unserialize(file_get_contents( "$dir/$prefix".self::fetch_crawl_info. "{$this->crawl_time}.txt")); @@ -1763,6 +1773,9 @@ class Fetcher implements CrawlConstants } $doc_info = $processor->handle($site[self::PAGE], $site[self::URL]); + if (C\FETCHER_PROCESS_DELAY > 0 ) { + usleep(C\FETCHER_PROCESS_DELAY); + } if (isset($site[self::REPOSITORY_TYPE]) && $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) { $site[self::URL] = $tmp_url_store; diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php index 2dd8bbc..6bb3887 100644 --- a/src/library/TripletExtractor.php +++ b/src/library/TripletExtractor.php @@ -23,10 +23,9 @@ * @author Chris Pollett chris@pollett.org * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ - * @copyright 2009 - 2015 + * @copyright 2009 - 2016 * @filesource */ - namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; @@ -49,10 +48,10 @@ class TripletExtractor /** * Takes a phrase and tags each term in it with its part of speech. * So each term in the original phrase gets mapped to term~part_of_speech - * This tagger is based on a Brill tagger. 
It makes uses a lexicon + * This tagger is based on a Brill tagger. It uses a lexicon * consisting of words from the Brown corpus together with a list of * part of speech tags that that word had in the Brown Corpus. These are - * used to get an initial part of speech (in word was not present than + * used to get an initial part of speech (if word was not present than * we assume it is a noun). From this a fixed set of rules is used to modify * the initial tag if necessary. * @@ -86,13 +85,13 @@ class TripletExtractor } preg_match_all("/[\w\d]+/", $text, $matches); $tokens = $matches[0]; - $nouns = array('NN', 'NNS', 'NNP'); - $verbs = array('VBD', 'VBP', 'VB'); - $result = array(); - $previous = array('token' => -1, 'tag' => -1); + $nouns = ['NN', 'NNS', 'NNP']; + $verbs = ['VBD', 'VBP', 'VB']; + $result = []; + $previous = ['token' => -1, 'tag' => -1]; $previous_token = -1; sort($tokens); - $dictionary = array(); + $dictionary = []; /* Notice we sorted the tokens, and notice how we use $cur_pos so only advance forward through $lex_string. So the @@ -118,9 +117,9 @@ class TripletExtractor $tag_list = array(); foreach ($matches[0] as $token) { $prev_tag_list = $tag_list; - $tag_list = array(); + $tag_list = []; // default to a common noun - $current = array('token' => $token, 'tag' => 'NN'); + $current = ['token' => $token, 'tag' => 'NN']; // remove trailing full stops $token = strtolower(rtrim($token, ".")); if (isset($dictionary[$token])) { @@ -208,7 +207,7 @@ class TripletExtractor public static function taggedPartOfSpeechTokensToString($tagged_tokens) { $tagged_phrase = ""; - $simplified_parts_of_speech = array( + $simplified_parts_of_speech = [ "NN" => "NN", "NNS" => "NN", "NNP" => "NN", @@ -228,7 +227,7 @@ class TripletExtractor "RBR" => "AV", "RBS" => "AV", "WRB" => "AV" - ); + ]; foreach ($tagged_tokens as $t) { $tag = trim($t['tag']); $tag = (isset($simplified_parts_of_speech[$tag])) ? 
@@ -257,12 +256,12 @@ class TripletExtractor fclose($fh); } preg_match_all("/[\w\d\.]+/", $text, $matches); - $nouns = array('NN', 'NNS'); - $return = array(); + $nouns = ['NN', 'NNS']; + $return = []; $i = 0; foreach ($matches[0] as $token) { // default to a common noun - $return[$i] = array('token' => $token, 'tag' => 'NN'); + $return[$i] = ['token' => $token, 'tag' => 'NN']; // remove trailing full stops if (substr($token, -1) == '.') { $token = preg_replace('/\.+$/', '', $token); @@ -275,7 +274,7 @@ class TripletExtractor if ($i > 0) { if ($return[$i - 1]['tag'] == 'DT' && in_array($return[$i]['tag'], - array('VBD', 'VBP', 'VB')) + ['VBD', 'VBP', 'VB']) ) { $return[$i]['tag'] = 'NN'; } @@ -346,7 +345,7 @@ class TripletExtractor */ public static function generateParseTreeUsingRDP($tagger_array) { - $tree = array(); + $tree = []; $tree = ["cur_node" => 0]; $tree_np = TripletExtractor::extractNPUsingRDP($tagger_array, $tree); $tree = ["cur_node" => $tree_np['cur_node']]; @@ -578,7 +577,7 @@ class TripletExtractor */ public static function extractTriplet($tree) { - $triplet = array(); + $triplet = []; $triplet['subject'] = TripletExtractor::extractSubjectFromTree($tree); $triplet['predicate'] = TripletExtractor::extractPredicateFromTree($tree); @@ -592,7 +591,7 @@ class TripletExtractor */ public static function processTripletForStorage($triplet_tree) { - $processed_triplet = array(); + $processed_triplet = []; $processed_triplet['RAW'] = TripletExtractor::getRawTripletForStorage($triplet_tree); $processed_triplet['FEATURED'] = @@ -606,8 +605,8 @@ class TripletExtractor */ public static function getRawTripletForStorage($triplet_tree) { - $raw_triplet = array(); - $question_answer_triplet = array(); + $raw_triplet = []; + $question_answer_triplet = []; if (isset($triplet_tree['subject']['RAW']) && isset($triplet_tree['predicate']['RAW']) && isset($triplet_tree['object']['RAW']) @@ -641,8 +640,8 @@ class TripletExtractor */ public static function 
getFeaturedTripletForStorage($triplet_tree) { - $featured_triplet = array(); - $question_answer_triplet = array(); + $featured_triplet = []; + $question_answer_triplet = []; if (isset($triplet_tree['subject']['FEATURED']) && isset($triplet_tree['predicate']['FEATURED']) && isset($triplet_tree['object']['FEATURED']) @@ -685,7 +684,7 @@ class TripletExtractor */ public static function extractSubjectFromTree($tree) { - $subject = array(); + $subject = []; if (isset($tree['NP']) && $tree['NP'] != null) { $tree_np = $tree['NP']; $value = TripletExtractor::extractFirstNounFromNPTree($tree_np); @@ -709,7 +708,7 @@ class TripletExtractor */ public static function extractPredicateFromTree($tree) { - $predicate = array(); + $predicate = []; if (isset($tree['VP']) && $tree['VP'] != null) { $tree_vp = $tree['VP']; $value = TripletExtractor::extractDeepestVerbFromVBTree($tree_vp); @@ -736,7 +735,7 @@ class TripletExtractor */ public static function extractObjectFromTree($tree) { - $object = array(); + $object = []; if (isset($tree['VP']) && $tree['VP'] != null) { $tree_vp = $tree['VP']; if (isset($tree_vp['NP']) && $tree_vp['NP'] != null) { @@ -798,7 +797,7 @@ class TripletExtractor */ public static function extractAttributes($tree) { - $attribute_map = array(); + $attribute_map = []; if (isset($tree['JJ']) && count($tree['JJ']) > 0) { $attribute_map['JJ'] = $tree['JJ']['JJ']; } @@ -846,8 +845,8 @@ class TripletExtractor */ public static function storeStatementArraysAsTriplet($statement_array) { - $triplets_list = array(); - $question_list = array(); + $triplets_list = []; + $question_list = []; $question_answer_list = array(); foreach ($statement_array as $key => $value) { try { @@ -894,7 +893,7 @@ class TripletExtractor $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill( $question_string); $index = 0; - $generated_question_array = array(); + $generated_question_array = []; if (isset($question_string_tagged[$index]) && ("WRB" == 
trim($question_string_tagged[$index]['tag']) || "WP" == trim($question_string_tagged[$index]['tag'])) @@ -934,10 +933,10 @@ class TripletExtractor */ public static function parseWHOQuestion($question_string_tagged, $index) { - $generated_question_array = array(); + $generated_question_array = []; $tree = ["cur_node" => $index]; $tree['NP'] = "WHO"; - $triplet = array(); + $triplet = []; $tree_vp = TripletExtractor::extractVPUsingRDP( $question_string_tagged, $tree); $triplet['predicate'] = TripletExtractor::extractPredicateFromTree( @@ -989,7 +988,7 @@ class TripletExtractor */ public static function parseWHPlusQuestion($question_string_tagged, $index) { - $generated_question_array = array(); + $generated_question_array = []; $aux_verb = ""; while (isset($question_string_tagged[$index]) && ("VB" == trim($question_string_tagged[$index]['tag']) || @@ -1003,7 +1002,7 @@ class TripletExtractor } $tree = ["cur_node" => $index]; $tree['NP'] = "WHPlus"; - $triplet = array(); + $triplet = []; $tree_np = TripletExtractor::extractNPUsingRDP( $question_string_tagged, $tree); $triplet['subject'] = TripletExtractor::extractSubjectFromTree( -- 2.10.0.windows.1 From e6f31b220bc2d531512c8129bdf6a6fc94d0702b Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Sat, 22 Oct 2016 15:42:07 -0700 Subject: [PATCH 3/5] Removingg extra characters in Fetcher.php --- src/executables/Fetcher.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 53f2908..9b9161f 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -15,7 +15,7 @@ * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details.89 + * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . -- 2.10.0.windows.1 From 08364c1e03ecf40c1a6264b2ba6920270bbe073a Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Wed, 2 Nov 2016 22:24:55 -0700 Subject: [PATCH 4/5] Implemneted review changes 1. Wrapped some lines > 80 characters. 2.Code formatting, removed some repeated code --- src/library/TripletExtractor.php | 238 +++++++++++++++++++++++++++------------ 1 file changed, 163 insertions(+), 75 deletions(-) diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php index 6bb3887..77ff6a1 100644 --- a/src/library/TripletExtractor.php +++ b/src/library/TripletExtractor.php @@ -107,17 +107,15 @@ class TripletExtractor $line = trim(substr($lex_string, $token_pos, $cur_pos - $token_pos)); $tag_list = explode(' ', $line); - $dictionary[strtolower(rtrim($token, "."))] = - array_slice($tag_list, 1); + $dictionary[$token] = array_slice($tag_list, 1); $cur_pos++; } } // now using our dictionary we tag $i = 0; - $tag_list = array(); + $tag_list = []; + $prev_tag_list = []; foreach ($matches[0] as $token) { - $prev_tag_list = $tag_list; - $tag_list = []; // default to a common noun $current = ['token' => $token, 'tag' => 'NN']; // remove trailing full stops @@ -193,6 +191,8 @@ class TripletExtractor $i++; $previous = $current; $previous_token = $token; + $prev_tag_list = $tag_list; + $tag_list = []; } return $result; } @@ -243,7 +243,7 @@ class TripletExtractor * @param $text any statement * @return array words tagged with POS tags */ - public static function partOfSpeechTagger_Brill($text) + public static function partOfSpeechTaggerBrill($text) { static $dict = null; $lexicon = C\LOCALE_DIR . 
"/en_US/resources/lexicon.txt"; @@ -509,6 +509,10 @@ class TripletExtractor } /** + * Takes current tree and returns + * tree by adding auxiliary verb + * node to it + * * @param $tagger_array POS tagged array * @param $tree current tree * @return mixed VP added tree @@ -539,6 +543,9 @@ class TripletExtractor } /** + * Takes current tree and returns + * tree by adding Verb node to it. + * * @param $tagger_array POS tagged tree * @param $tree current tree * @return mixed VB added tree @@ -572,6 +579,9 @@ class TripletExtractor return $tree; } /** + * Takes current tree and returns + * a triplet extracted from the tree. + * * @param $tree fully generated tree * @return array triplet array */ @@ -586,6 +596,10 @@ class TripletExtractor } /** + * Takes triplet tree and returns + * the processed triplet from the + * tree. + * * @param $triplet_tree any statement * @return array processed triplet */ @@ -600,6 +614,10 @@ class TripletExtractor } /** + * Takes triplet tree and returns + * tree an array of raw + * triplets. + * * @param $triplet_tree triplet array * @return array raw triplet array */ @@ -610,9 +628,12 @@ class TripletExtractor if (isset($triplet_tree['subject']['RAW']) && isset($triplet_tree['predicate']['RAW']) && isset($triplet_tree['object']['RAW']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['RAW']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['RAW']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['RAW']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['subject']['RAW']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['predicate']['RAW']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['object']['RAW']) ) { $SUBJECT = trim($triplet_tree['subject']['RAW']); @@ -620,21 +641,31 @@ class TripletExtractor $OBJECT = trim($triplet_tree['object']['RAW']); $raw_triplet['SUBJECT'] = - TripletExtractor::$question_word . " " . $PREDICATE . " " . 
$OBJECT; + TripletExtractor::$question_word . " " . + $PREDICATE . " " . $OBJECT; $raw_triplet['PREDICATE'] = - $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT; + $SUBJECT . " " . TripletExtractor::$question_word . " " . + $OBJECT; $raw_triplet['OBJECT'] = - $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word; + $SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word; - $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT; - $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; - $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT; + $question_answer_triplet[TripletExtractor::$question_word . " " . + $PREDICATE . " " . $OBJECT] = $SUBJECT; + $question_answer_triplet[$SUBJECT . " " . + TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; + $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word] = $OBJECT; $raw_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet; } return $raw_triplet; } /** + * Takes triplet tree and returns + * tree an array of featured + * triplets. 
+ * * @param $triplet_tree triplet array * @return array featured triplet array */ @@ -645,24 +676,34 @@ class TripletExtractor if (isset($triplet_tree['subject']['FEATURED']) && isset($triplet_tree['predicate']['FEATURED']) && isset($triplet_tree['object']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['FEATURED']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['subject']['FEATURED']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['predicate']['FEATURED']) + && !TripletExtractor::isNullOrEmptyString( + $triplet_tree['object']['FEATURED']) ) { $SUBJECT = trim($triplet_tree['subject']['FEATURED']); $PREDICATE = trim($triplet_tree['predicate']['FEATURED']); $OBJECT = trim($triplet_tree['object']['FEATURED']); $featured_triplet['SUBJECT'] = - TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT; + TripletExtractor::$question_word . " " . $PREDICATE . + " " . $OBJECT; $featured_triplet['PREDICATE'] = - $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT; + $SUBJECT . " " . TripletExtractor::$question_word . + " " . $OBJECT; $featured_triplet['OBJECT'] = - $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word; + $SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word; - $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT; - $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE; - $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT; + $question_answer_triplet[TripletExtractor::$question_word . " " . + $PREDICATE . " " . $OBJECT] = $SUBJECT; + $question_answer_triplet[$SUBJECT . " " . + TripletExtractor::$question_word . + " " . 
$OBJECT] = $PREDICATE; + $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . + TripletExtractor::$question_word] = $OBJECT; $featured_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet; } @@ -670,15 +711,22 @@ class TripletExtractor } /** + * Takes a string and checks if + * it is set or empty. + * * @param $string any string * @return bool true if null of empty string */ - public static function IsNullOrEmptyString($string) + public static function isNullOrEmptyString($string) { return (!isset($string) || trim($string) === ''); } /** + * Takes current tree and returns + * the array of text tagged as + * Subject. + * * @param $tree generated tree * @return array subject array */ @@ -690,7 +738,8 @@ class TripletExtractor $value = TripletExtractor::extractFirstNounFromNPTree($tree_np); $subject['RAW'] = $value; $featured_subject = ""; - $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_np)); + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree_np)); foreach ($it as $v) { $featured_subject .= $v . " "; } @@ -703,6 +752,10 @@ class TripletExtractor } /** + * Takes current tree and returns + * the array of text tagged as + * Predicate. + * * @param $tree generated tree * @return array predicate array */ @@ -716,7 +769,8 @@ class TripletExtractor $featured_predicate = ""; if (isset($tree_vp['VB']) && $tree_vp['VB'] != null) { $tree_vb = $tree_vp['VB']; - $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_vb)); + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree_vb)); foreach ($it as $v) { $featured_predicate .= $v . " "; } @@ -730,6 +784,10 @@ class TripletExtractor } /** + * Takes current tree and returns + * the array of text tagged as + * Object. 
+ * * @param $tree generated tree * @return array object array */ @@ -743,7 +801,8 @@ class TripletExtractor $value = TripletExtractor::extractFirstNounFromNPTree($nb); $object['RAW'] = $value; $featured_object = ""; - $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($nb)); + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($nb)); foreach ($it as $v) { $featured_object .= $v . " "; } @@ -760,6 +819,9 @@ class TripletExtractor } /** + * Takes noun phrase tree and return + * the first noun from the tree. + * * @param $tree_np noun phrase subtree * @return string first noun */ @@ -776,6 +838,9 @@ class TripletExtractor } /** + * Takes verb phrase tree and returns + * the base form of the verb. + * * @param $tree_vp verb phrase subtree * @return string deepest verb */ @@ -792,6 +857,10 @@ class TripletExtractor } /** + * Takes current tree and return + * attribute maps for noun, adjectives, + * preposition. + * * @param $tree generated tree * @return array attributes array */ @@ -822,6 +891,11 @@ class TripletExtractor } /** + * Takes the statement and apply + * the rules in the defined in the + * lexicon, assign parts of speech + * and generate a triplet tree. + * * @param $statement any statement * @return array processed triplet */ @@ -829,7 +903,7 @@ class TripletExtractor { try { $tagged_statement = - TripletExtractor::partOfSpeechTagger_Brill($statement); + TripletExtractor::partOfSpeechTaggerBrill($statement); $statement_tree = TripletExtractor::generateParseTreeUsingRDP($tagged_statement); $triplet_tree = TripletExtractor::extractTriplet($statement_tree); @@ -840,6 +914,11 @@ class TripletExtractor } /** + * Process individual statements + * from the statement array. Generate + * a list of question and answer + * pairs. 
+ * * @param $statement_array array of statements * @return array list of triplets */ @@ -851,23 +930,32 @@ class TripletExtractor foreach ($statement_array as $key => $value) { try { if (str_word_count($key) >= 3) { - $extracted_triplet = TripletExtractor::storeStatementAsTriplet($key); + $extracted_triplet = + TripletExtractor::storeStatementAsTriplet($key); if (isset($extracted_triplet['RAW']) && sizeof($extracted_triplet['RAW']) > 0) { - $question_list[$extracted_triplet['RAW']['SUBJECT']] = $value; - $question_list[$extracted_triplet['RAW']['PREDICATE']] = $value; - $question_list[$extracted_triplet['RAW']['OBJECT']] = $value; - $question_answer_list = array_merge($question_answer_list, + $question_list[$extracted_triplet['RAW']['SUBJECT']] + = $value; + $question_list[$extracted_triplet['RAW']['PREDICATE']] + = $value; + $question_list[$extracted_triplet['RAW']['OBJECT']] + = $value; + $question_answer_list = + array_merge($question_answer_list, $extracted_triplet['RAW']['QUESTION_ANSWER_LIST']); } if (isset($extracted_triplet['FEATURED']) && sizeof($extracted_triplet['FEATURED']) > 0) { - $question_list[$extracted_triplet['FEATURED']['SUBJECT']] = $value; - $question_list[$extracted_triplet['FEATURED']['PREDICATE']] = $value; - $question_list[$extracted_triplet['FEATURED']['OBJECT']] = $value; - $question_answer_list = array_merge($question_answer_list, + $question_list[$extracted_triplet['FEATURED']['SUBJECT']] + = $value; + $question_list[$extracted_triplet['FEATURED']['PREDICATE']] + = $value; + $question_list[$extracted_triplet['FEATURED']['OBJECT']] + = $value; + $question_answer_list = + array_merge($question_answer_list, $extracted_triplet['FEATURED']['QUESTION_ANSWER_LIST']); } } @@ -890,33 +978,30 @@ class TripletExtractor */ public static function questionParser($question_string) { - $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill( + $question_string_tagged = TripletExtractor::partOfSpeechTaggerBrill( $question_string); $index 
= 0; $generated_question_array = []; - if (isset($question_string_tagged[$index]) && - ("WRB" == trim($question_string_tagged[$index]['tag']) || - "WP" == trim($question_string_tagged[$index]['tag'])) - ) { - if ("WHO" == strtoupper( - trim($question_string_tagged[$index]['token']))) { - $index = $index + 1; - $generated_question_array = - TripletExtractor::parseWHOQuestion( - $question_string_tagged, $index); - } else { - if ("WHERE" == strtoupper( - trim($question_string_tagged[$index]['token'])) || - "WHEN" == strtoupper( - trim($question_string_tagged[$index]['token'])) || - "WHAT" == strtoupper( - trim($question_string_tagged[$index]['token'])) - ) { + if (isset($question_string_tagged[$index])) { + $tag = trim($question_string_tagged[$index]['tag']); + if ("WRB" == $tag || "WP" == $tag) { + $token = strtoupper( + trim($question_string_tagged[$index]['token'])); + if ("WHO" == $token) { $index = $index + 1; $generated_question_array = + TripletExtractor::parseWHOQuestion( + $question_string_tagged, $index); + } else { + if ("WHERE" == $token || + "WHEN" == $token || + "WHAT" == $token) { + $index = $index + 1; + $generated_question_array = TripletExtractor::parseWHPlusQuestion_New( $question_string_tagged, $index); + } } } } @@ -945,9 +1030,9 @@ class TripletExtractor $tree_vp); if (isset($triplet['object']['RAW']) && isset($triplet['predicate']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['object']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['RAW']) ) { $generated_question_array['RAW']['1'] = @@ -961,9 +1046,9 @@ class TripletExtractor } if (isset($triplet['object']['FEATURED']) && isset($triplet['predicate']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['object']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && 
!TripletExtractor::isNullOrEmptyString( $triplet['predicate']['FEATURED']) ) { $generated_question_array['FEATURED']['1'] = @@ -990,15 +1075,18 @@ class TripletExtractor { $generated_question_array = []; $aux_verb = ""; - while (isset($question_string_tagged[$index]) && - ("VB" == trim($question_string_tagged[$index]['tag']) || - "VBD" == trim($question_string_tagged[$index]['tag']) || - "VBG" == trim($question_string_tagged[$index]['tag']) || - "VBN" == trim($question_string_tagged[$index]['tag']) || - "VBP" == trim($question_string_tagged[$index]['tag']) || - "VBZ" == trim($question_string_tagged[$index]['tag']))) { - $aux_verb .= " " . trim($question_string_tagged[$index]['token']); - $index = $index + 1; + while (isset($question_string_tagged[$index])) { + $tag = trim($question_string_tagged[$index]['tag']); + /* Stop at the first non-verb tag; otherwise $index never + advances and this loop would not terminate. */ + if ("VB" != $tag && "VBD" != $tag && + "VBG" != $tag && "VBN" != $tag && + "VBP" != $tag && "VBZ" != $tag) { + break; + } + $token = trim($question_string_tagged[$index]['token']); + $aux_verb .= " " . $token; + $index = $index + 1; } $tree = ["cur_node" => $index]; $tree['NP'] = "WHPlus"; @@ -1014,7 +1102,7 @@ class TripletExtractor $triplet['object'] = TripletExtractor::extractObjectFromTree( $tree_vp); if (isset($aux_verb) - && !TripletExtractor::IsNullOrEmptyString($aux_verb) + && !TripletExtractor::isNullOrEmptyString($aux_verb) ) { $triplet['predicate']['RAW'] = trim($aux_verb) . " " . 
$triplet['predicate']['RAW']; @@ -1026,9 +1114,9 @@ class TripletExtractor } if (isset($triplet['subject']['RAW']) && isset($triplet['predicate']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['subject']['RAW']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['RAW']) ) { $generated_question_array['RAW']['1'] = @@ -1042,9 +1130,9 @@ class TripletExtractor } if (isset($triplet['subject']['FEATURED']) && isset($triplet['predicate']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['subject']['FEATURED']) - && !TripletExtractor::IsNullOrEmptyString( + && !TripletExtractor::isNullOrEmptyString( $triplet['predicate']['FEATURED']) ) { $generated_question_array['FEATURED']['1'] = -- 2.10.0.windows.1 From 7eb3a0a9ca53bf581586414dfa589447fb613857 Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Wed, 2 Nov 2016 22:41:20 -0700 Subject: [PATCH 5/5] Wrapping up code lines > 80 columns --- src/controllers/components/CrawlComponent.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index 1efafea..122a48b 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -1346,8 +1346,8 @@ class CrawlComponent extends Component implements CrawlConstants PhraseParser::extractPhrasesInLists($phrase_string, $lang); $len = strlen($phrase_string); - if (PhraseParser::computeSafeSearchScore($word_lists['WORD_LIST'], $len) < - 0.012) { + if (PhraseParser::computeSafeSearchScore( + $word_lists['WORD_LIST'], $len) < 0.012) { $meta_ids[] = "safe:true"; $safe = true; } else { @@ -1368,7 +1368,8 @@ class CrawlComponent extends Component implements CrawlConstants $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean( print_r($meta_ids, true), 
"string"), 75, "\n", true); $data["QUESTIONS_TRIPLET"] = wordwrap($parent->clean( - print_r($word_lists['QUESTION_ANSWER_LIST'], true), "string"), 75, "\n", true); + print_r($word_lists['QUESTION_ANSWER_LIST'], true), + "string"), 75, "\n", true); } return $data; } -- 2.10.0.windows.1