From 60bb07ffbdaf9224a5609e2159b19bc47c3b7c74 Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Tue, 13 Dec 2016 10:29:54 -0800 Subject: [PATCH 1/3] Code Refactoring: The code to create moved to Tokenizer from Question Answer Extractor --- src/library/QuestionAnswerExtractor.php | 353 +------------------------------ src/locale/en_US/resources/Tokenizer.php | 329 ++++++++++++++++++++++++++++ 2 files changed, 335 insertions(+), 347 deletions(-) diff --git a/src/library/QuestionAnswerExtractor.php b/src/library/QuestionAnswerExtractor.php index 53b2728..f62f8d4 100644 --- a/src/library/QuestionAnswerExtractor.php +++ b/src/library/QuestionAnswerExtractor.php @@ -48,335 +48,6 @@ class QuestionAnswerExtractor */ public static $question_marker = "qqq"; /** - * @array - */ - public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", - "VBZ"]; - /** - * @array - */ - public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"]; - /** - * @array - */ - public static $adjective_phrases = ["JJ", "JJR", "JJS"]; - /** - * Given a part-of-speeech tagged phrase array generates a parse tree - * for the phrase using a recursive descent parser. - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @return array used to represent a tree. The array has up to three fields - * $tree["cur_node"] index of how far we parsed our$tagged_phrase - * $tree["NP"] contains a subtree for a noun phrase - * $tree["VP"] contains a subtree for a verb phrase - */ - public static function generatePhraseParseTree($tagged_phrase) - { - $tree = []; - //cur_node is the index in tagged_phrase we've parse to so far - $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]); - $tree = ["cur_node" => $tree_np['cur_node']]; - $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree); - if ($tree == $tree_vp) { - return $tree; - } - $tree['cur_node'] = $tree_vp['cur_node']; - unset($tree_np['cur_node']); - unset($tree_vp['cur_node']); - $tree['NP'] = $tree_np['NP']; - $tree['VP'] = $tree_vp['VP']; - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a noun phrase if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "NP" a subarray with possible fields - * "DT" with value a determiner subtree - * "JJ" with value an adjective subtree - * "NN" with value a noun tree - */ - public static function extractNounPhrase($tagged_phrase, $tree) - { - $cur_node = $tree['cur_node']; - $tree_dt = self::extractDeterminer($tagged_phrase, - ['cur_node' => $cur_node]); - $tree_jj = self::extractAdjective($tagged_phrase, - ['cur_node' => $tree_dt['cur_node']]); - $tree_nn = self::extractNoun($tagged_phrase, - ['cur_node' => $tree_jj['cur_node']]); - $tree_pp = self::extractPrepositionalPhrases($tagged_phrase, - ['cur_node' => $tree_nn['cur_node']]); - if ($tree_nn['cur_node'] == $cur_node) { - $tree['NP'] = ""; - } else { - $cur_node = $tree_pp['cur_node']; - unset($tree_dt['cur_node']); - $tree_new_sub['DT'] = $tree_dt; - unset($tree_jj['cur_node']); - $tree_new_sub['JJ'] = $tree_jj; - unset($tree_nn['cur_node']); - $tree_new_sub['NN'] = $tree_nn; - unset($tree_pp['cur_node']); - $tree_new_sub['PRP'] = $tree_pp; - $tree_new['cur_node'] = $cur_node; - $tree_new['NP'] = $tree_new_sub; - return $tree_new; - } - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a verb phrase if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "VP" a subarray with possible fields - * "VB" with value a verb subtree - * "NP" with value an noun phrase subtree - */ - public static function extractVerbPhrase($tagged_phrase, $tree) - { - $cur_node = $tree['cur_node']; - $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]); - if ($tree_vb['cur_node'] == $cur_node) { - return $tree; - } - $cur_node = $tree_vb['cur_node']; - $preposition_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "IN") { - $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($preposition_string)) { - $tree_vb["IN"] = $preposition_string; - } - $tree_np = self::extractNounPhrase($tagged_phrase, - ['cur_node' => $cur_node]); - $tree_new = []; - $tree_new_sub = []; - if ($tree_np['cur_node'] != $cur_node) { - $cur_node = $tree_np['cur_node']; - unset($tree_vb['cur_node']); - unset($tree_np['cur_node']); - $tree_new_sub['VB'] = $tree_vb; - $tree_new_sub['NP'] = $tree_np['NP']; - $tree_new['cur_node'] = $cur_node; - $tree_new['VP'] = $tree_new_sub; - return $tree_new; - } - unset($tree_vb['cur_node']); - $tree_new_sub['VB'] = $tree_vb; - $tree_new['cur_node'] = $cur_node; - $tree_new['VP'] = $tree_new_sub; - return $tree_new; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a determiner if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "DT" a subarray with a token node for the determiner that was - * parsed - */ - public static function extractDeterminer($tagged_phrase, $tree) - { - $cur_node = $tree['cur_node']; - if (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "DT" ) { - $tree['DT'] = $tagged_phrase[$cur_node]['token']; - $tree['cur_node']++; - return $tree; - } - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for an adjective if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "JJ" a subarray with a token node for the adjective that was - * parsed - */ - public static function extractAdjective($tagged_phrase, $tree) - { - $adjective_string = ""; - $cur_node = $tree['cur_node']; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$adjective_phrases)) { - $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($adjective_string)) { - $tree["JJ"] = $adjective_string; - } - $tree['cur_node'] = $cur_node; - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a noun if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "NN" a subarray with a token node for the noun string that was - * parsed - */ - public static function extractNoun($tagged_phrase, $tree) - { - //Combining multiple noun into one - $noun_string = ""; - $cur_node = $tree['cur_node']; - while (isset($tagged_phrase[$cur_node]['tag']) && - (in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$noun_phrases))) { - $noun_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($noun_string)) { - $tree["NN"] = $noun_string; - } - $tree['cur_node'] = $cur_node; - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a sequence of - * prepositional phrases if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * parsed followed by additional possible fields (here i - * represents the ith clause found): - * "IN_i" with value a preposition subtree - * "DT_i" with value a determiner subtree - * "JJ_i" with value an adjective subtree - * "NN_i" with value an additional noun subtree - */ - public static function extractPrepositionalPhrases($tagged_phrase, $tree, - $index = 1) - { - $cur_node = $tree['cur_node']; - // Checking for preposition.I.e, format: prep [det] [adjective] noun - if (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "IN") { - /* can have multiple prep's in a row, for example, - it is known in over 20 countries*/ - $preposition_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "IN") { - $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($preposition_string)) { - $tree["IN_$index"] = $preposition_string; - } - if (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "DT") { - $tree['DT_$index'] = $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - $adjective_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$adjective_phrases)) { - $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($adjective_string)) { - $tree["JJ_$index"] = $adjective_string; - } - $prep_noun_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$noun_phrases)) { - $prep_noun_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if ($prep_noun_string) { - $tree["NP_$index"] = $prep_noun_string; - } - $tree_next = self::extractPrepositionalPhrases($tagged_phrase, - ["cur_node" => $cur_node], $index + 1); - } - $tree['cur_node'] = $cur_node; - return $tree; - } - /** - * Takes a part-of-speech tagged phrase and pre-tree with a - * parse-from position and builds a parse tree for a verb if possible - * - * @param array $tagged_phrase - * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) - * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields - * "cur_node" index of how far we parsed $tagged_phrase - * "VB" a subarray with a token node for the verb string that was - * parsed - */ - public static function extractVerb($tagged_phrase, $tree) - { - $cur_node = $tree['cur_node']; - // skip stuff before verb (intensifiers and adverbs) - while (isset($tagged_phrase[$cur_node]['tag']) && - !in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$verb_phrases)) { - $cur_node++; - } - $verb_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$verb_phrases)) { - $verb_string .= " " . $tagged_phrase[$cur_node]['token']; - $cur_node++; - } - if (!empty($verb_string)) { - $tree["VB"] = $verb_string; - } - $tree['cur_node'] = $cur_node; - return $tree; - } - /** * Takes a parse tree of a phrase and computes subject, predicate, and * object arrays. Each of these array consists of two components CONCISE and * RAW, CONCISE corresponding to something more similar to the words in the @@ -569,22 +240,6 @@ class QuestionAnswerExtractor return $extract; } /** - * Takes the phrase and apply the rules in the defined in the - * lexicon, assign parts of speech and generate a triplet tree. - * - * @param $phrase any phrase to be converted into triplets - * @param string $lang locale tag to tag parts of speech in phrase - * @return array question triplets extracted from phrase - */ - public static function extractTripletsPhrase($phrase, $lang) - { - $tokenizer = PhraseParser::getTokenizer($lang); - $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($phrase); - $parse_tree = self::generatePhraseParseTree($tagged_phrase); - $triplets = self::extractTripletsParseTree($parse_tree); - return self::rearrangeTripletsByType($triplets, $lang); - } - /** * Scans a word list for phrases. For phrases found generate * a list of question and answer pairs at two levels of granularity: * CONCISE (using all terms in orginal phrase) and RAW (removing @@ -608,9 +263,13 @@ class QuestionAnswerExtractor }, \ARRAY_FILTER_USE_KEY ); $triplet_types = ['CONCISE', 'RAW']; $triplet_parts = ['subject', 'predicate', 'object']; + $tokenizer = PhraseParser::getTokenizer($lang); foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { - $extracted_triplets = self::extractTripletsPhrase( - $word_and_phrase, $lang); + $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase); + $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase); + $triplets = self::extractTripletsParseTree($parse_tree); + $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang); + foreach ($triplet_types as $type) { if (!empty($extracted_triplets[$type])) { $triplet = $extracted_triplets[$type]; diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index afeabb1..d8b430b 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -74,6 +74,19 @@ class Tokenizer 'bruce schnier' => 'bruce schneier', ]; /** + * @array + */ + public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", + "VBZ"]; + /** + * @array + */ + public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"]; + /** + * @array + */ + public static $adjective_phrases = ["JJ", "JJR", "JJS"]; + /** * storage used in computing the stem * @var string */ @@ -529,6 +542,322 @@ class Tokenizer return true; } } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a determiner if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "DT" a subarray with a token node for the determiner that was + * parsed + */ + public static function extractDeterminer($tagged_phrase, $tree) + { + $cur_node = $tree['cur_node']; + if (isset($tagged_phrase[$cur_node]['tag']) && + trim($tagged_phrase[$cur_node]['tag']) == "DT" ) { + $tree['DT'] = $tagged_phrase[$cur_node]['token']; + $tree['cur_node']++; + return $tree; + } + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for an adjective if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["cur_node" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "JJ" a subarray with a token node for the adjective that was + * parsed + */ + public static function extractAdjective($tagged_phrase, $tree) + { + $adjective_string = ""; + $cur_node = $tree['cur_node']; + while (isset($tagged_phrase[$cur_node]['tag']) && + in_array(trim($tagged_phrase[$cur_node]['tag']), + self::$adjective_phrases)) { + $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if (!empty($adjective_string)) { + $tree["JJ"] = $adjective_string; + } + $tree['cur_node'] = $cur_node; + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a noun if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "NN" a subarray with a token node for the noun string that was + * parsed + */ + public static function extractNoun($tagged_phrase, $tree) + { + //Combining multiple noun into one + $noun_string = ""; + $cur_node = $tree['cur_node']; + while (isset($tagged_phrase[$cur_node]['tag']) && + (in_array(trim($tagged_phrase[$cur_node]['tag']), + self::$noun_phrases))) { + $noun_string .= " " . $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if (!empty($noun_string)) { + $tree["NN"] = $noun_string; + } + $tree['cur_node'] = $cur_node; + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a sequence of + * prepositional phrases if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["cur_node" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * parsed followed by additional possible fields (here i + * represents the ith clause found): + * "IN_i" with value a preposition subtree + * "DT_i" with value a determiner subtree + * "JJ_i" with value an adjective subtree + * "NN_i" with value an additional noun subtree + */ + public static function extractPrepositionalPhrases($tagged_phrase, $tree, + $index = 1) + { + $cur_node = $tree['cur_node']; + // Checking for preposition.I.e, format: prep [det] [adjective] noun + if (isset($tagged_phrase[$cur_node]['tag']) && + trim($tagged_phrase[$cur_node]['tag']) == "IN") { + /* can have multiple prep's in a row, for example, + it is known in over 20 countries*/ + $preposition_string = ""; + while (isset($tagged_phrase[$cur_node]['tag']) && + trim($tagged_phrase[$cur_node]['tag']) == "IN") { + $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if (!empty($preposition_string)) { + $tree["IN_$index"] = $preposition_string; + } + if (isset($tagged_phrase[$cur_node]['tag']) && + trim($tagged_phrase[$cur_node]['tag']) == "DT") { + $tree['DT_$index'] = $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + $adjective_string = ""; + while (isset($tagged_phrase[$cur_node]['tag']) && + in_array(trim($tagged_phrase[$cur_node]['tag']), + self::$adjective_phrases)) { + $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if (!empty($adjective_string)) { + $tree["JJ_$index"] = $adjective_string; + } + $prep_noun_string = ""; + while (isset($tagged_phrase[$cur_node]['tag']) && + in_array(trim($tagged_phrase[$cur_node]['tag']), + self::$noun_phrases)) { + $prep_noun_string .= " " . $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if ($prep_noun_string) { + $tree["NP_$index"] = $prep_noun_string; + } + $tree_next = self::extractPrepositionalPhrases($tagged_phrase, + ["cur_node" => $cur_node], $index + 1); + } + $tree['cur_node'] = $cur_node; + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a noun phrase if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "NP" a subarray with possible fields + * "DT" with value a determiner subtree + * "JJ" with value an adjective subtree + * "NN" with value a noun tree + */ + public static function extractNounPhrase($tagged_phrase, $tree) + { + $cur_node = $tree['cur_node']; + $tree_dt = self::extractDeterminer($tagged_phrase, + ['cur_node' => $cur_node]); + $tree_jj = self::extractAdjective($tagged_phrase, + ['cur_node' => $tree_dt['cur_node']]); + $tree_nn = self::extractNoun($tagged_phrase, + ['cur_node' => $tree_jj['cur_node']]); + $tree_pp = self::extractPrepositionalPhrases($tagged_phrase, + ['cur_node' => $tree_nn['cur_node']]); + if ($tree_nn['cur_node'] == $cur_node) { + $tree['NP'] = ""; + } else { + $cur_node = $tree_pp['cur_node']; + unset($tree_dt['cur_node']); + $tree_new_sub['DT'] = $tree_dt; + unset($tree_jj['cur_node']); + $tree_new_sub['JJ'] = $tree_jj; + unset($tree_nn['cur_node']); + $tree_new_sub['NN'] = $tree_nn; + unset($tree_pp['cur_node']); + $tree_new_sub['PRP'] = $tree_pp; + $tree_new['cur_node'] = $cur_node; + $tree_new['NP'] = $tree_new_sub; + return $tree_new; + } + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a verb if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "VB" a subarray with a token node for the verb string that was + * parsed + */ + public static function extractVerb($tagged_phrase, $tree) + { + $cur_node = $tree['cur_node']; + // skip stuff before verb (intensifiers and adverbs) + while (isset($tagged_phrase[$cur_node]['tag']) && + !in_array(trim($tagged_phrase[$cur_node]['tag']), + self::$verb_phrases)) { + $cur_node++; + } + $verb_string = ""; + while (isset($tagged_phrase[$cur_node]['tag']) && + in_array(trim($tagged_phrase[$cur_node]['tag']), + self::$verb_phrases)) { + $verb_string .= " " . $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if (!empty($verb_string)) { + $tree["VB"] = $verb_string; + } + $tree['cur_node'] = $cur_node; + return $tree; + } + /** + * Takes a part-of-speech tagged phrase and pre-tree with a + * parse-from position and builds a parse tree for a verb phrase if possible + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @param array $tree that consists of ["curnode" => + * current parse position in $tagged_phrase] + * @return array has fields + * "cur_node" index of how far we parsed $tagged_phrase + * "VP" a subarray with possible fields + * "VB" with value a verb subtree + * "NP" with value an noun phrase subtree + */ + public static function extractVerbPhrase($tagged_phrase, $tree) + { + $cur_node = $tree['cur_node']; + $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]); + if ($tree_vb['cur_node'] == $cur_node) { + return $tree; + } + $cur_node = $tree_vb['cur_node']; + $preposition_string = ""; + while (isset($tagged_phrase[$cur_node]['tag']) && + trim($tagged_phrase[$cur_node]['tag']) == "IN") { + $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; + $cur_node++; + } + if (!empty($preposition_string)) { + $tree_vb["IN"] = $preposition_string; + } + $tree_np = self::extractNounPhrase($tagged_phrase, + ['cur_node' => $cur_node]); + $tree_new = []; + $tree_new_sub = []; + if ($tree_np['cur_node'] != $cur_node) { + $cur_node = $tree_np['cur_node']; + unset($tree_vb['cur_node']); + unset($tree_np['cur_node']); + $tree_new_sub['VB'] = $tree_vb; + $tree_new_sub['NP'] = $tree_np['NP']; + $tree_new['cur_node'] = $cur_node; + $tree_new['VP'] = $tree_new_sub; + return $tree_new; + } + unset($tree_vb['cur_node']); + $tree_new_sub['VB'] = $tree_vb; + $tree_new['cur_node'] = $cur_node; + $tree_new['VP'] = $tree_new_sub; + return $tree_new; + } + /** + * Given a part-of-speeech tagged phrase array generates a parse tree + * for the phrase using a recursive descent parser. + * + * @param array $tagged_phrase + * an array of pairs of the form ("token" => token_for_term, + * "tag"=> part_of_speech_tag_for_term) + * @return array used to represent a tree. The array has up to three fields + * $tree["cur_node"] index of how far we parsed our$tagged_phrase + * $tree["NP"] contains a subtree for a noun phrase + * $tree["VP"] contains a subtree for a verb phrase + */ + public static function generatePhraseParseTree($tagged_phrase) + { + $tree = []; + //cur_node is the index in tagged_phrase we've parse to so far + $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]); + $tree = ["cur_node" => $tree_np['cur_node']]; + $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree); + if ($tree == $tree_vp) { + return $tree; + } + $tree['cur_node'] = $tree_vp['cur_node']; + unset($tree_np['cur_node']); + unset($tree_vp['cur_node']); + $tree['NP'] = $tree_np['NP']; + $tree['VP'] = $tree_vp['VP']; + return $tree; + } //private methods for stemming /** * m() measures the number of consonant sequences between 0 and j. if c is -- 2.10.0.windows.1 From d0a730a309c9a175f880b9f90031414119c8f168 Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Tue, 13 Dec 2016 11:20:43 -0800 Subject: [PATCH 2/3] Code Refactoring Part 2: Handling the question passed to yioop --- src/library/PhraseParser.php | 6 +- src/library/QuestionAnswerExtractor.php | 407 --------------------------- src/locale/en_US/resources/Tokenizer.php | 458 ++++++++++++++++++++++++++++--- src/models/PhraseModel.php | 3 +- 4 files changed, 418 insertions(+), 456 deletions(-) delete mode 100644 src/library/QuestionAnswerExtractor.php diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 76a0aea..dad8b7b 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -33,7 +33,6 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; use seekquarry\yioop\models\LocaleModel; use seekquarry\yioop\library\processors\PageProcessor; -use seekquarry\yioop\library\QuestionAnswerExtractor; /** * For crawlHash @@ -159,7 +158,8 @@ class PhraseParser stuff for now */ } - if (stristr($whole_phrase, QuestionAnswerExtractor::$question_marker) + $tokenizer = self::getTokenizer($lang); + if (stristr($whole_phrase, $tokenizer::getQuestionMarker()) !== false) { $terms = [$whole_phrase, $terms[0]]; return $terms; @@ -256,7 +256,7 @@ class PhraseParser $tokenizer = self::getTokenizer($lang); if (method_exists($tokenizer, "tagTokenizePartOfSpeech")) { $triplets_list = - QuestionAnswerExtractor::extractTripletsPhrases( + $tokenizer->extractTripletsPhrases( $phrase_and_sentences["SENTENCES"], $lang); $phrase_and_sentences["TERMS_AND_PHRASES"] = array_merge($phrase_and_sentences["TERMS_AND_PHRASES"], diff --git a/src/library/QuestionAnswerExtractor.php b/src/library/QuestionAnswerExtractor.php deleted file mode 100644 index f62f8d4..0000000 --- a/src/library/QuestionAnswerExtractor.php +++ /dev/null @@ -1,407 +0,0 @@ -. - * - * @author Chris Pollett chris@pollett.org - * @license http://www.gnu.org/licenses/ GPL3 - * @link http://www.seekquarry.com/ - * @copyright 2009 - 2016 - * @filesource - */ -namespace seekquarry\yioop\library; - -use seekquarry\yioop\configs as C; -use seekquarry\yioop\library as L; - -/** - * A class to extract triplet so that they may be used as part of a question - * answering system - * - * @author Nirav Patel (revisited by Salil Shenoy, code clean up pass - * Chris Pollett) - * @package seekquarry\yioop\library - */ -class QuestionAnswerExtractor -{ - /** - * Any unique identifier corresponding to the component of a triplet which - * can be answered using a question answer list - * @string - */ - public static $question_marker = "qqq"; - /** - * Takes a parse tree of a phrase and computes subject, predicate, and - * object arrays. Each of these array consists of two components CONCISE and - * RAW, CONCISE corresponding to something more similar to the words in the - * original phrase and RAW to the case where extraneous words have been - * removed - * - * @param are $parse_tree a parse tree for a sentence - * @return array triplet array - */ - public static function extractTripletsParseTree($tree) - { - $triplets = []; - $triplets['subject'] = self::extractSubjectParseTree($tree); - $triplets['predicate'] = self::extractPredicateParseTree($tree); - $triplets['object'] = self::extractObjectParseTree($tree); - return $triplets; - } - /** - * Takes a triplets array with subject, predicate, object fields with - * CONCISE and RAW subfields and rearranges it to have two fields CONCISE - * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST - * subfields - * - * @param array $sub_pred_obj_triplets in format described above - * @param string $lang locale tag for stem chargramming and segmenting - * @return array $processed_triplets in format described above - */ - public static function rearrangeTripletsByType($sub_pred_obj_triplets, - $lang) - { - $processed_triplet = []; - $processed_triplets['CONCISE'] = - self::extractTripletByType($sub_pred_obj_triplets, "CONCISE", - $lang); - $processed_triplets['RAW'] = - self::extractTripletByType($sub_pred_obj_triplets, "RAW", $lang); - return $processed_triplets; - } - /** - * Takes a triplets array with subject, predicate, object fields with - * CONCISE, RAW subfields and produces a triplits with $type subfield (where - * $type is one of CONCISE and RAW) and with subject, predicate, object, - * and QUESTION_ANSWER_LIST subfields - * - * @param array $sub_pred_obj_triplets in format described above - * @param string $lang locale tag for stem chargramming and segmentin - * @return array $triplets in format described above - */ - public static function extractTripletByType($sub_pred_obj_triplets, $type, - $lang) - { - $triplets = []; - if (!empty($sub_pred_obj_triplets['subject'][$type]) - && !empty($sub_pred_obj_triplets['predicate'][$type]) - && !empty($sub_pred_obj_triplets['object'][$type])) { - $question_answer_triplets = []; - $question_marker = self::$question_marker; - $tokenizer = PhraseParser::getTokenizer($lang); - $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]), - trim($sub_pred_obj_triplets['predicate'][$type]), - trim($sub_pred_obj_triplets['object'][$type])]; - $parts = ['subject', 'predicate', 'object']; - for ($i = 0; $i < 3; $i++) { - $q_sentence = $sentence; - $q_sentence[$i] = $question_marker; - $q_sentence_string = implode(" ", $q_sentence); - $q_sentence_string = PhraseParser::stemCharGramSegment( - $q_sentence_string, $lang, true); - $triplets[$parts[$i]] = $q_sentence_string; - $question_answer_triplets[$q_sentence_string] = - PhraseParser::stemCharGramSegment($sentence[$i], $lang, - true); - } - $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets; - } - return $triplets; - } - /** - * Takes a parse tree of a phrase or statement and returns an array - * with two fields CONCISE and RAW the former having the subject of - * the original phrase (as a string) the latter having the importart - * parts of the subject - * - * @param array representation of a parse tree of a phrase - * @return array with two fields CONCISE and RAW as described above - */ - public static function extractSubjectParseTree($tree) - { - $subject = []; - if (!empty($tree['NP'])) { - $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase( - $tree['NP'], "NN"); - $raw_subject = ""; - $it = new \RecursiveIteratorIterator( - new \RecursiveArrayIterator($tree['NP'])); - foreach ($it as $v) { - $raw_subject .= $v . " "; - } - $subject['RAW'] = $raw_subject; - } else { - $subject['CONCISE'] = ""; - $subject['RAW'] = ""; - } - return $subject; - } - - /** - * Takes a parse tree of a phrase or statement and returns an array - * with two fields CONCISE and RAW the former having the predicate of - * the original phrase (as a string) the latter having the importart - * parts of the predicate - * - * @param array representation of a parse tree of a phrase - * @return array with two fields CONCISE and RAW as described above - */ - public static function extractPredicateParseTree($tree) - { - $predicate = []; - if (!empty($tree['VP'])) { - $tree_vp = $tree['VP']; - $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase( - $tree_vp, "VB"); - $raw_predicate = ""; - if (!empty($tree_vp['VB'])) { - $tree_vb = $tree_vp['VB']; - $it = new \RecursiveIteratorIterator( - new \RecursiveArrayIterator($tree_vb)); - foreach ($it as $v) { - $raw_predicate .= $v . " "; - } - $predicate['RAW'] = $raw_predicate; - } - } else { - $predicate['CONCISE'] = ""; - $predicate['RAW'] = ""; - } - return $predicate; - } - /** - * Takes a parse tree of a phrase or statement and returns an array - * with two fields CONCISE and RAW the former having the object of - * the original phrase (as a string) the latter having the importart - * parts of the object - * - * @param array representation of a parse tree of a phrase - * @return array with two fields CONCISE and RAW as described above - */ - public static function extractObjectParseTree($tree) - { - $object = []; - if (!empty($tree['VP'])) { - $tree_vp = $tree['VP']; - if (!empty($tree_vp['NP'])) { - $nb = $tree_vp['NP']; - $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb, - "NN"); - $raw_object = ""; - $it = new \RecursiveIteratorIterator( - new \RecursiveArrayIterator($nb)); - foreach ($it as $v) { - $raw_object .= $v . " "; - } - $object['RAW'] = $raw_object; - } else { - $object['CONCISE'] = ""; - $object['RAW'] = ""; - } - } else { - $object['CONCISE'] = ""; - $object['RAW'] = ""; - } - return $object; - } - /** - * Takes phrase tree $tree and a part-of-speech $pos returns - * the deepest $pos only path in tree. - * - * @param $tree phrase to extract type from - * @return string deepest verb - */ - public static function extractDeepestSpeechPartPhrase($tree, $pos) - { - $extract = ""; - if (!empty($tree[$pos])) { - $extract = self::extractDeepestSpeechPartPhrase($tree[$pos], $pos); - } - if (!$extract && !empty($tree[$pos]) && !empty($tree[$pos][$pos])) { - $extract = $tree[$pos][$pos]; - } - return $extract; - } - /** - * Scans a word list for phrases. For phrases found generate - * a list of question and answer pairs at two levels of granularity: - * CONCISE (using all terms in orginal phrase) and RAW (removing - * (adjectives, etc). - * - * @param array $word_and_phrase_list of statements - * @param string $lang locale tag to tag parts of speech in phrase - * @return array with two fields: QUESTION_LIST consisting of triplets - * (SUBJECT, PREDICATES, OBJECT) where one of the components has been - * replaced with a question marker. - */ - public static function extractTripletsPhrases($word_and_phrase_list, - $lang) - { - $triplets_list = []; - $question_list = []; - $question_answer_list = []; - $word_and_phrase_list = array_filter($word_and_phrase_list, - function ($key) { - return str_word_count($key) >= C\PHRASE_THRESHOLD; - }, \ARRAY_FILTER_USE_KEY ); - $triplet_types = ['CONCISE', 'RAW']; - $triplet_parts = ['subject', 'predicate', 'object']; - $tokenizer = PhraseParser::getTokenizer($lang); - foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { - $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase); - $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase); - $triplets = self::extractTripletsParseTree($parse_tree); - $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang); - - foreach ($triplet_types as $type) { - if (!empty($extracted_triplets[$type])) { - $triplet = $extracted_triplets[$type]; - foreach ($triplet_parts as $part) { - if(!empty($triplet[$part])) { - $question_list[$triplet[$part]] = $position_list; - } - } - $question_answer_list = array_merge($question_answer_list, - $triplet['QUESTION_ANSWER_LIST']); - } - } - } - $triplets_list['QUESTION_LIST'] = $question_list; - $triplets_list['QUESTION_ANSWER_LIST'] = $question_answer_list; - return $triplets_list; - } - /** - * Takes any question started with WH question and returns the - * triplet from the question - * - * @param string $question question to parse - * @param string $lang locale tag to tag parts of speech in phrase - * @return array question triplet - */ - public static function questionParser($question, $lang) - { - $tokenizer = PhraseParser::getTokenizer($lang); - $tagged_question = $tokenizer->tagTokenizePartOfSpeech($question); - $generated_question_array = []; - if (isset($tagged_question[0])) { - if (in_array(trim($tagged_question[0]['tag']), - ["WRB", "WP"])) { - $token = strtoupper(trim($tagged_question[0]['token'])); - if ($token == "WHO") { - $generated_question_array = self::parseWhoQuestion( - $tagged_question, 1); - } else if (in_array($token, ["WHERE", "WHEN", "WHAT"])) { - $generated_question_array = self::parseWHPlusQuestion( - $tagged_question, 1); - } - } - } - return $generated_question_array; - } - /** - * Takes tagged question string starts with Who - * and returns question triplet from the question string - * - * @param string $tagged_question part-of-speech tagged question - * @param int $index current index in statement - * @return array parsed triplet - */ - public static function parseWhoQuestion($tagged_question, $index) - {; - $generated_questions = []; - $question_marker = self::$question_marker; - $tree = ["cur_node" => $index]; - $tree['NP'] = "WHO"; - $triplets = []; - $tree_vp = self::extractVerbPhrase($tagged_question, $tree); - $triplets['predicate'] = self::extractPredicateParseTree( - $tree_vp); - $triplets['object'] = self::extractObjectParseTree( - $tree_vp); - $triplet_types = ['CONCISE', 'RAW']; - foreach ($triplet_types as $type) { - if (!empty($triplets['object'][$type]) - && !empty($triplets['predicate'][$type])) { - $generated_questions[$type][] = - trim($triplets['object'][$type]) . - " " . trim($triplets['predicate'][$type]) . " " . - $question_marker; - $generated_questions[$type][] = $question_marker . - " " . trim($triplets['predicate'][$type]) . - " " . trim($triplets['object'][$type]); - } - } - return $generated_questions; - } - /** - * Takes tagged question string starts with Wh+ except Who - * and returns question triplet from the question string - * Unlike the WHO case, here we assume there is an auxliary verb - * followed by a noun phrase then the rest of the verb phrase. For example, - * Where is soccer played? - * - * @param string $tagged_question part-of-speech tagged question - * @param $index current index in statement - * @return array parsed triplet suitable for query look-up - */ - public static function parseWHPlusQuestion($tagged_question, $index) - { - $generated_questions = []; - $aux_verb = ""; - $question_marker = self::$question_marker; - while (isset($tagged_question[$index]) && - in_array(trim($tagged_question[$index]['tag']), - self::$verb_phrases)) { - $token = trim($tagged_question[$index]['token']); - $aux_verb .= " " . $token; - $index++; - } - $tree = ["cur_node" => $index]; - $tree['NP'] = "WHPlus"; - $triplets = []; - $tree_np = self::extractNounPhrase($tagged_question, $tree); - $triplets['subject'] = self::extractSubjectParseTree($tree_np); - $tree_vp = self::extractVerbPhrase($tagged_question, $tree_np); - $triplets['predicate'] = self::extractPredicateParseTree($tree_vp); - if (!empty($aux_verb)) { - $triplets['predicate']['CONCISE'] = trim($aux_verb) . - " " . $triplets['predicate']['CONCISE']; - if (!isset($triplets['predicate']['RAW'])) { - $triplets['predicate']['RAW'] = ""; - } - $triplets['predicate']['RAW'] = trim($aux_verb) . - " " . $triplets['predicate']['RAW']; - } - $triplet_types = ['CONCISE', 'RAW']; - foreach ($triplet_types as $type) { - if (!empty($triplets['subject'][$type])&& - !empty($triplets['predicate'][$type])) { - $generated_questions[$type][] = - trim($triplets['subject'][$type]) . - " " . trim($triplets['predicate'][$type]) . - " " . $question_marker; - $generated_questions[$type][] = $question_marker. - " " . trim($triplets['predicate'][$type]) . - " " . trim($triplets['subject'][$type]); - } - } - return $generated_questions; - } -} diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index d8b430b..eeee5b3 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -74,6 +74,12 @@ class Tokenizer 'bruce schnier' => 'bruce schneier', ]; /** + * Any unique identifier corresponding to the component of a triplet which + * can be answered using a question answer list + * @string + */ + public static $question_marker = "qqq"; + /** * @array */ public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", @@ -519,39 +525,37 @@ class Tokenizer $result = self::numberSix($result); return $result; } - // private methods for stemming /** - * Checks to see if the ith character in the buffer is a consonant + * Takes a triplets array with subject, predicate, object fields with + * CONCISE and RAW subfields and rearranges it to have two fields CONCISE + * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST + * subfields * - * @param int $i the character to check - * @return if the ith character is a constant + * @param array $sub_pred_obj_triplets in format described above + * @param string $lang locale tag for stem chargramming and segmenting + * @return array $processed_triplets in format described above */ - private static function cons($i) + public static function rearrangeTripletsByType($sub_pred_obj_triplets, + $lang) { - switch (self::$buffer[$i]) { - case 'a': - // no break - case 'e': - case 'i': - case 'o': - case 'u': - return false; - case 'y': - return ($i== 0 ) ? true : !self::cons($i - 1); - default: - return true; - } + $processed_triplet = []; + $processed_triplets['CONCISE'] = + self::extractTripletByType($sub_pred_obj_triplets, "CONCISE", + $lang); + $processed_triplets['RAW'] = + self::extractTripletByType($sub_pred_obj_triplets, "RAW", $lang); + return $processed_triplets; } - /** + /** * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a determiner if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "DT" a subarray with a token node for the determiner that was * parsed @@ -571,12 +575,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for an adjective if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "JJ" a subarray with a token node for the adjective that was * parsed @@ -601,12 +605,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a noun if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "NN" a subarray with a token node for the noun string that was * parsed @@ -633,12 +637,12 @@ class Tokenizer * parse-from position and builds a parse tree for a sequence of * prepositional phrases if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * parsed followed by additional possible fields (here i * represents the ith clause found): @@ -700,12 +704,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a noun phrase if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "NP" a subarray with possible fields * "DT" with value a determiner subtree @@ -745,15 +749,15 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a verb if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "VB" a subarray with a token node for the verb string that was - * parsed + * parsed */ public static function extractVerb($tagged_phrase, $tree) { @@ -781,12 +785,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a verb phrase if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "VP" a subarray with possible fields * "VB" with value a verb subtree @@ -833,9 +837,9 @@ class Tokenizer * Given a part-of-speeech tagged phrase array generates a parse tree * for the phrase using a recursive descent parser. * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) + * "tag"=> part_of_speech_tag_for_term) * @return array used to represent a tree. The array has up to three fields * $tree["cur_node"] index of how far we parsed our$tagged_phrase * $tree["NP"] contains a subtree for a noun phrase @@ -858,7 +862,364 @@ class Tokenizer $tree['VP'] = $tree_vp['VP']; return $tree; } - //private methods for stemming + /** + * Takes a parse tree of a phrase and computes subject, predicate, and + * object arrays. Each of these array consists of two components CONCISE and + * RAW, CONCISE corresponding to something more similar to the words in the + * original phrase and RAW to the case where extraneous words have been + * removed + * + * @param are $parse_tree a parse tree for a sentence + * @return array triplet array + */ + public static function extractTripletsParseTree($tree) + { + $triplets = []; + $triplets['subject'] = self::extractSubjectParseTree($tree); + $triplets['predicate'] = self::extractPredicateParseTree($tree); + $triplets['object'] = self::extractObjectParseTree($tree); + return $triplets; + } + /** + * Scans a word list for phrases. For phrases found generate + * a list of question and answer pairs at two levels of granularity: + * CONCISE (using all terms in orginal phrase) and RAW (removing + * (adjectives, etc). + * + * @param array $word_and_phrase_list of statements + * @param string $lang locale tag to tag parts of speech in phrase + * @return array with two fields: QUESTION_LIST consisting of triplets + * (SUBJECT, PREDICATES, OBJECT) where one of the components has been + * replaced with a question marker. + */ + public static function extractTripletsPhrases($word_and_phrase_list, + $lang) + { + $triplets_list = []; + $question_list = []; + $question_answer_list = []; + $word_and_phrase_list = array_filter($word_and_phrase_list, + function ($key) { + return str_word_count($key) >= C\PHRASE_THRESHOLD; + }, \ARRAY_FILTER_USE_KEY ); + $triplet_types = ['CONCISE', 'RAW']; + $triplet_parts = ['subject', 'predicate', 'object']; + $tokenizer = PhraseParser::getTokenizer($lang); + foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { + $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase); + $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase); + $triplets = self::extractTripletsParseTree($parse_tree); + $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang); + + foreach ($triplet_types as $type) { + if (!empty($extracted_triplets[$type])) { + $triplet = $extracted_triplets[$type]; + foreach ($triplet_parts as $part) { + if(!empty($triplet[$part])) { + $question_list[$triplet[$part]] = $position_list; + } + } + $question_answer_list = array_merge($question_answer_list, + $triplet['QUESTION_ANSWER_LIST']); + } + } + } + $triplets_list['QUESTION_LIST'] = $question_list; + $triplets_list['QUESTION_ANSWER_LIST'] = $question_answer_list; + return $triplets_list; + } + /** + * Takes phrase tree $tree and a part-of-speech $pos returns + * the deepest $pos only path in tree. + * + * @param $tree phrase to extract type from + * @return string deepest verb + */ + public static function extractDeepestSpeechPartPhrase($tree, $pos) + { + $extract = ""; + if (!empty($tree[$pos])) { + $extract = self::extractDeepestSpeechPartPhrase($tree[$pos], $pos); + } + if (!$extract && !empty($tree[$pos]) && !empty($tree[$pos][$pos])) { + $extract = $tree[$pos][$pos]; + } + return $extract; + } + /** + * Takes a parse tree of a phrase or statement and returns an array + * with two fields CONCISE and RAW the former having the object of + * the original phrase (as a string) the latter having the importart + * parts of the object + * + * @param array representation of a parse tree of a phrase + * @return array with two fields CONCISE and RAW as described above + */ + public static function extractObjectParseTree($tree) + { + $object = []; + if (!empty($tree['VP'])) { + $tree_vp = $tree['VP']; + if (!empty($tree_vp['NP'])) { + $nb = $tree_vp['NP']; + $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb, + "NN"); + $raw_object = ""; + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($nb)); + foreach ($it as $v) { + $raw_object .= $v . " "; + } + $object['RAW'] = $raw_object; + } else { + $object['CONCISE'] = ""; + $object['RAW'] = ""; + } + } else { + $object['CONCISE'] = ""; + $object['RAW'] = ""; + } + return $object; + } + /** + * Takes a parse tree of a phrase or statement and returns an array + * with two fields CONCISE and RAW the former having the predicate of + * the original phrase (as a string) the latter having the importart + * parts of the predicate + * + * @param array representation of a parse tree of a phrase + * @return array with two fields CONCISE and RAW as described above + */ + public static function extractPredicateParseTree($tree) + { + $predicate = []; + if (!empty($tree['VP'])) { + $tree_vp = $tree['VP']; + $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase( + $tree_vp, "VB"); + $raw_predicate = ""; + if (!empty($tree_vp['VB'])) { + $tree_vb = $tree_vp['VB']; + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree_vb)); + foreach ($it as $v) { + $raw_predicate .= $v . " "; + } + $predicate['RAW'] = $raw_predicate; + } + } else { + $predicate['CONCISE'] = ""; + $predicate['RAW'] = ""; + } + return $predicate; + } + /** + * Takes a parse tree of a phrase or statement and returns an array + * with two fields CONCISE and RAW the former having the subject of + * the original phrase (as a string) the latter having the importart + * parts of the subject + * + * @param array representation of a parse tree of a phrase + * @return array with two fields CONCISE and RAW as described above + */ + public static function extractSubjectParseTree($tree) + { + $subject = []; + if (!empty($tree['NP'])) { + $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase( + $tree['NP'], "NN"); + $raw_subject = ""; + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree['NP'])); + foreach ($it as $v) { + $raw_subject .= $v . " "; + } + $subject['RAW'] = $raw_subject; + } else { + $subject['CONCISE'] = ""; + $subject['RAW'] = ""; + } + return $subject; + } + /** + * Takes tagged question string starts with Who + * and returns question triplet from the question string + * + * @param string $tagged_question part-of-speech tagged question + * @param int $index current index in statement + * @return array parsed triplet + */ + public static function parseWhoQuestion($tagged_question, $index) + { + $generated_questions = []; + $question_marker = self::getQuestionMarker(); + $tree = ["cur_node" => $index]; + $tree['NP'] = "WHO"; + $triplets = []; + $tree_vp = self::extractVerbPhrase($tagged_question, $tree); + $triplets['predicate'] = self::extractPredicateParseTree( + $tree_vp); + $triplets['object'] = self::extractObjectParseTree( + $tree_vp); + $triplet_types = ['CONCISE', 'RAW']; + foreach ($triplet_types as $type) { + if (!empty($triplets['object'][$type]) + && !empty($triplets['predicate'][$type])) { + $generated_questions[$type][] = + trim($triplets['object'][$type]) . + " " . trim($triplets['predicate'][$type]) . " " . + $question_marker; + $generated_questions[$type][] = $question_marker . + " " . trim($triplets['predicate'][$type]) . + " " . trim($triplets['object'][$type]); + } + } + return $generated_questions; + } + /** + * Takes tagged question string starts with Wh+ except Who + * and returns question triplet from the question string + * Unlike the WHO case, here we assume there is an auxliary verb + * followed by a noun phrase then the rest of the verb phrase. For example, + * Where is soccer played? + * + * @param string $tagged_question part-of-speech tagged question + * @param $index current index in statement + * @return array parsed triplet suitable for query look-up + */ + public static function parseWHPlusQuestion($tagged_question, $index) + { + $generated_questions = []; + $aux_verb = ""; + $question_marker = self::getQuestionMarker(); + while (isset($tagged_question[$index]) && + in_array(trim($tagged_question[$index]['tag']), + self::$verb_phrases)) { + $token = trim($tagged_question[$index]['token']); + $aux_verb .= " " . $token; + $index++; + } + $tree = ["cur_node" => $index]; + $tree['NP'] = "WHPlus"; + $triplets = []; + $tree_np = self::extractNounPhrase($tagged_question, $tree); + $triplets['subject'] = self::extractSubjectParseTree($tree_np); + $tree_vp = self::extractVerbPhrase($tagged_question, $tree_np); + $triplets['predicate'] = self::extractPredicateParseTree($tree_vp); + if (!empty($aux_verb)) { + $triplets['predicate']['CONCISE'] = trim($aux_verb) . + " " . $triplets['predicate']['CONCISE']; + if (!isset($triplets['predicate']['RAW'])) { + $triplets['predicate']['RAW'] = ""; + } + $triplets['predicate']['RAW'] = trim($aux_verb) . + " " . $triplets['predicate']['RAW']; + } + $triplet_types = ['CONCISE', 'RAW']; + foreach ($triplet_types as $type) { + if (!empty($triplets['subject'][$type])&& + !empty($triplets['predicate'][$type])) { + $generated_questions[$type][] = + trim($triplets['subject'][$type]) . + " " . trim($triplets['predicate'][$type]) . + " " . $question_marker; + $generated_questions[$type][] = $question_marker. + " " . trim($triplets['predicate'][$type]) . + " " . trim($triplets['subject'][$type]); + } + } + return $generated_questions; + } + /** + * Takes any question started with WH question and returns the + * triplet from the question + * + * @param string $question question to parse + * @param string $lang locale tag to tag parts of speech in phrase + * @return array question triplet + */ + public static function questionParser($question, $lang) + { + $tokenizer = PhraseParser::getTokenizer($lang); + $tagged_question = $tokenizer->tagTokenizePartOfSpeech($question); + $generated_question_array = []; + if (isset($tagged_question[0])) { + if (in_array(trim($tagged_question[0]['tag']), + ["WRB", "WP"])) { + $token = strtoupper(trim($tagged_question[0]['token'])); + if ($token == "WHO") { + $generated_question_array = self::parseWhoQuestion( + $tagged_question, 1); + } else if (in_array($token, ["WHERE", "WHEN", "WHAT"])) { + $generated_question_array = self::parseWHPlusQuestion( + $tagged_question, 1); + } + } + } + return $generated_question_array; + } + /** + * Takes a triplets array with subject, predicate, object fields with + * CONCISE, RAW subfields and produces a triplits with $type subfield (where + * $type is one of CONCISE and RAW) and with subject, predicate, object, + * and QUESTION_ANSWER_LIST subfields + * + * @param array $sub_pred_obj_triplets in format described above + * @param string $lang locale tag for stem chargramming and segmentin + * @return array $triplets in format described above + */ + public static function extractTripletByType($sub_pred_obj_triplets, $type, + $lang) + { + $triplets = []; + if (!empty($sub_pred_obj_triplets['subject'][$type]) + && !empty($sub_pred_obj_triplets['predicate'][$type]) + && !empty($sub_pred_obj_triplets['object'][$type])) { + $question_answer_triplets = []; + $question_marker = self::$question_marker; + $tokenizer = PhraseParser::getTokenizer($lang); + $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]), + trim($sub_pred_obj_triplets['predicate'][$type]), + trim($sub_pred_obj_triplets['object'][$type])]; + $parts = ['subject', 'predicate', 'object']; + for ($i = 0; $i < 3; $i++) { + $q_sentence = $sentence; + $q_sentence[$i] = $question_marker; + $q_sentence_string = implode(" ", $q_sentence); + $q_sentence_string = PhraseParser::stemCharGramSegment( + $q_sentence_string, $lang, true); + $triplets[$parts[$i]] = $q_sentence_string; + $question_answer_triplets[$q_sentence_string] = + PhraseParser::stemCharGramSegment($sentence[$i], $lang, + true); + } + $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets; + } + return $triplets; + } + // private methods for stemming + /** + * Checks to see if the ith character in the buffer is a consonant + * + * @param int $i the character to check + * @return if the ith character is a constant + */ + private static function cons($i) + { + switch (self::$buffer[$i]) { + case 'a': + // no break + case 'e': + case 'i': + case 'o': + case 'u': + return false; + case 'y': + return ($i== 0 ) ? true : !self::cons($i - 1); + default: + return true; + } + } /** * m() measures the number of consonant sequences between 0 and j. if c is * a consonant sequence and v a vowel sequence, and [.] indicates arbitrary @@ -1217,6 +1578,15 @@ class Tokenizer } return $tagged_phrase; } + /** + * The function returns the question marker for the locale + * + * @return the question marker + */ + public static function getQuestionMarker() + { + return self::$question_marker; + } //private methods for sentence compression /** * From Back to Basics: CLASSY 2006 page 3: diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 1af65c4..81df13f 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -38,7 +38,6 @@ use seekquarry\yioop\library\IndexManager; use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library\Thesaurus; use seekquarry\yioop\library\index_bundle_iterators as I; -use seekquarry\yioop\library\QuestionAnswerExtractor; /** * logging is done during crawl not through web, @@ -917,7 +916,7 @@ class PhraseModel extends ParallelModel if (!empty($tokenizer) && method_exists($tokenizer, "isQuestion") && method_exists($tokenizer, "tagTokenizePartOfSpeech") && $tokenizer->isQuestion($phrase)) { - $generated_question = QuestionAnswerExtractor::questionParser( + $generated_question = $tokenizer->questionParser( $phrase, $tag); if (!empty($generated_question['CONCISE'])) { $phrase = $generated_question['CONCISE'][0]; -- 2.10.0.windows.1 From 8284eaa91491a3ecd51509fb8ae7352b359b2793 Mon Sep 17 00:00:00 2001 From: Salil Shenoy Date: Tue, 13 Dec 2016 11:29:21 -0800 Subject: [PATCH 3/3] Code Refactor Part 3: Ran Cleanup Utilities --- src/library/PhraseParser.php | 4 +-- src/locale/en_US/resources/Tokenizer.php | 62 ++++++++++++++++---------------- src/models/PhraseModel.php | 8 ++--- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index dad8b7b..2d23f76 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -708,7 +708,7 @@ class PhraseParser return self::getNGramsTerm($terms, $n); } /** - * Returns the characters n-grams for the given terms where n is the + * Returns the characters n-grams for the given terms where n is the * length. * * @param array $terms the terms to make n-grams for @@ -1360,4 +1360,4 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け } return $result; } -} +} \ No newline at end of file diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index eeee5b3..e13594f 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -550,12 +550,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a determiner if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "DT" a subarray with a token node for the determiner that was * parsed @@ -575,12 +575,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for an adjective if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "JJ" a subarray with a token node for the adjective that was * parsed @@ -605,12 +605,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a noun if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "NN" a subarray with a token node for the noun string that was * parsed @@ -637,12 +637,12 @@ class Tokenizer * parse-from position and builds a parse tree for a sequence of * prepositional phrases if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["cur_node" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * parsed followed by additional possible fields (here i * represents the ith clause found): @@ -704,12 +704,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a noun phrase if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "NP" a subarray with possible fields * "DT" with value a determiner subtree @@ -749,15 +749,15 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a verb if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "VB" a subarray with a token node for the verb string that was - * parsed + * parsed */ public static function extractVerb($tagged_phrase, $tree) { @@ -785,12 +785,12 @@ class Tokenizer * Takes a part-of-speech tagged phrase and pre-tree with a * parse-from position and builds a parse tree for a verb phrase if possible * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, * "tag"=> part_of_speech_tag_for_term) * @param array $tree that consists of ["curnode" => - * current parse position in $tagged_phrase] - * @return array has fields + * current parse position in $tagged_phrase] + * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase * "VP" a subarray with possible fields * "VB" with value a verb subtree @@ -837,9 +837,9 @@ class Tokenizer * Given a part-of-speeech tagged phrase array generates a parse tree * for the phrase using a recursive descent parser. * - * @param array $tagged_phrase + * @param array $tagged_phrase * an array of pairs of the form ("token" => token_for_term, - * "tag"=> part_of_speech_tag_for_term) + * "tag"=> part_of_speech_tag_for_term) * @return array used to represent a tree. The array has up to three fields * $tree["cur_node"] index of how far we parsed our$tagged_phrase * $tree["NP"] contains a subtree for a noun phrase @@ -899,23 +899,25 @@ class Tokenizer $question_list = []; $question_answer_list = []; $word_and_phrase_list = array_filter($word_and_phrase_list, - function ($key) { + function ($key) { return str_word_count($key) >= C\PHRASE_THRESHOLD; }, \ARRAY_FILTER_USE_KEY ); $triplet_types = ['CONCISE', 'RAW']; $triplet_parts = ['subject', 'predicate', 'object']; $tokenizer = PhraseParser::getTokenizer($lang); foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { - $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase); + $tagged_phrase = + $tokenizer->tagTokenizePartOfSpeech($word_and_phrase); $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase); $triplets = self::extractTripletsParseTree($parse_tree); - $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang); + $extracted_triplets = + self::rearrangeTripletsByType($triplets, $lang); foreach ($triplet_types as $type) { if (!empty($extracted_triplets[$type])) { $triplet = $extracted_triplets[$type]; foreach ($triplet_parts as $part) { - if(!empty($triplet[$part])) { + if (!empty($triplet[$part])) { $question_list[$triplet[$part]] = $position_list; } } @@ -1583,7 +1585,7 @@ class Tokenizer * * @return the question marker */ - public static function getQuestionMarker() + public static function getQuestionMarker() { return self::$question_marker; } @@ -1655,4 +1657,4 @@ class Tokenizer "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i", "", $result); return $result; } -} +} \ No newline at end of file diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 81df13f..16b4399 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -649,7 +649,7 @@ class PhraseModel extends ParallelModel $tmp_hash = (is_array($tmp_hash)) ? $tmp_hash : [$tmp_hash]; $test = array_merge($tmp_hash, [L\crawlHash($word)]); } else { - if(in_array($word, $found_materialized_metas) && + if (in_array($word, $found_materialized_metas) && !$metas_accounted) { $meta_keys[] = $tmp_hash; } else { @@ -657,7 +657,7 @@ class PhraseModel extends ParallelModel } } } - if(!$metas_accounted) { + if (!$metas_accounted) { $word_keys = array_merge($word_keys, $meta_keys); } if (count($word_keys) == 0) { @@ -810,7 +810,7 @@ class PhraseModel extends ParallelModel } $found_metas = array_unique($found_metas); $found_materialized_metas = array_unique($found_materialized_metas); - if(empty(trim($phrase_string)) && count($found_metas) == 2 + if (empty(trim($phrase_string)) && count($found_metas) == 2 && (in_array("site:doc", $found_metas) || in_array("site:any", $found_metas))) { /*site:doc and site:any doesn't work with materialized metas by @@ -1813,4 +1813,4 @@ class PhraseModel extends ParallelModel } return $group_iterator; } -} +} \ No newline at end of file -- 2.10.0.windows.1