From 60bb07ffbdaf9224a5609e2159b19bc47c3b7c74 Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Tue, 13 Dec 2016 10:29:54 -0800
Subject: [PATCH 1/3] Code Refactoring: The code to create parse trees moved
 to Tokenizer from Question Answer Extractor

---
 src/library/QuestionAnswerExtractor.php  | 353 +------------------------------
 src/locale/en_US/resources/Tokenizer.php | 329 ++++++++++++++++++++++++++++
 2 files changed, 335 insertions(+), 347 deletions(-)

diff --git a/src/library/QuestionAnswerExtractor.php b/src/library/QuestionAnswerExtractor.php
index 53b2728..f62f8d4 100644
--- a/src/library/QuestionAnswerExtractor.php
+++ b/src/library/QuestionAnswerExtractor.php
@@ -48,335 +48,6 @@ class QuestionAnswerExtractor
      */
     public static $question_marker = "qqq";
     /**
-     * @array
-     */
-    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP",
-        "VBZ"];
-    /**
-     * @array
-     */
-    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"];
-    /**
-     * @array
-     */
-    public static $adjective_phrases = ["JJ", "JJR", "JJS"];
-    /**
-     * Given a part-of-speeech tagged phrase array generates a parse tree
-     * for the phrase using a recursive descent parser.
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term) 
-     * @return array used to represent a tree. The array has up to three fields
-     *      $tree["cur_node"] index of how far we parsed our$tagged_phrase
-     *      $tree["NP"] contains a subtree for a noun phrase
-     *      $tree["VP"] contains a subtree for a verb phrase
-     */
-    public static function generatePhraseParseTree($tagged_phrase)
-    {
-        $tree = [];
-        //cur_node is the index in tagged_phrase we've parse to so far
-        $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]);
-        $tree = ["cur_node" => $tree_np['cur_node']];
-        $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree);
-        if ($tree == $tree_vp) {
-            return $tree;
-        }
-        $tree['cur_node'] = $tree_vp['cur_node'];
-        unset($tree_np['cur_node']);
-        unset($tree_vp['cur_node']);
-        $tree['NP'] = $tree_np['NP'];
-        $tree['VP'] = $tree_vp['VP'];
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a noun phrase if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "NP" a subarray with possible fields
-     *      "DT" with value a determiner subtree
-     *      "JJ" with value an adjective subtree
-     *      "NN" with value a noun tree
-     */
-    public static function extractNounPhrase($tagged_phrase, $tree)
-    {
-        $cur_node = $tree['cur_node'];
-        $tree_dt = self::extractDeterminer($tagged_phrase,
-            ['cur_node' => $cur_node]);
-        $tree_jj = self::extractAdjective($tagged_phrase,
-            ['cur_node' => $tree_dt['cur_node']]);
-        $tree_nn = self::extractNoun($tagged_phrase,
-            ['cur_node' => $tree_jj['cur_node']]);
-        $tree_pp = self::extractPrepositionalPhrases($tagged_phrase,
-            ['cur_node' => $tree_nn['cur_node']]);
-        if ($tree_nn['cur_node'] == $cur_node) {
-            $tree['NP'] = "";
-        } else {
-            $cur_node = $tree_pp['cur_node'];
-            unset($tree_dt['cur_node']);
-            $tree_new_sub['DT'] = $tree_dt;
-            unset($tree_jj['cur_node']);
-            $tree_new_sub['JJ'] = $tree_jj;
-            unset($tree_nn['cur_node']);
-            $tree_new_sub['NN'] = $tree_nn;
-            unset($tree_pp['cur_node']);
-            $tree_new_sub['PRP'] = $tree_pp;
-            $tree_new['cur_node'] = $cur_node;
-            $tree_new['NP'] = $tree_new_sub;
-            return $tree_new;
-        }
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a verb phrase if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "VP" a subarray with possible fields
-     *      "VB" with value a verb subtree
-     *      "NP" with value an noun phrase subtree
-     */
-    public static function extractVerbPhrase($tagged_phrase, $tree)
-    {
-        $cur_node = $tree['cur_node'];
-        $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]);
-        if ($tree_vb['cur_node'] == $cur_node) {
-            return $tree;
-        }
-        $cur_node = $tree_vb['cur_node'];
-        $preposition_string = "";
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            trim($tagged_phrase[$cur_node]['tag']) == "IN") {
-            $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
-            $cur_node++;
-        }
-        if (!empty($preposition_string)) {
-            $tree_vb["IN"] = $preposition_string;
-        }
-        $tree_np = self::extractNounPhrase($tagged_phrase,
-            ['cur_node' => $cur_node]);
-        $tree_new = [];
-        $tree_new_sub = [];
-        if ($tree_np['cur_node'] !=  $cur_node) {
-            $cur_node = $tree_np['cur_node'];
-            unset($tree_vb['cur_node']);
-            unset($tree_np['cur_node']);
-            $tree_new_sub['VB'] = $tree_vb;
-            $tree_new_sub['NP'] = $tree_np['NP'];
-            $tree_new['cur_node'] = $cur_node;
-            $tree_new['VP'] = $tree_new_sub;
-            return $tree_new;
-        }
-        unset($tree_vb['cur_node']);
-        $tree_new_sub['VB'] = $tree_vb;
-        $tree_new['cur_node'] = $cur_node;
-        $tree_new['VP'] = $tree_new_sub;
-        return $tree_new;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a determiner if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "DT" a subarray with a token node for the determiner that was
-     *      parsed
-     */
-    public static function extractDeterminer($tagged_phrase, $tree)
-    {
-        $cur_node = $tree['cur_node'];
-        if (isset($tagged_phrase[$cur_node]['tag']) &&
-            trim($tagged_phrase[$cur_node]['tag']) == "DT" ) {
-            $tree['DT'] = $tagged_phrase[$cur_node]['token'];
-            $tree['cur_node']++;
-            return $tree;
-        }
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for an adjective if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "JJ" a subarray with a token node for the adjective that was
-     *      parsed
-     */
-    public static function extractAdjective($tagged_phrase, $tree)
-    {
-        $adjective_string = "";
-        $cur_node = $tree['cur_node'];
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-           in_array(trim($tagged_phrase[$cur_node]['tag']),
-           self::$adjective_phrases)) {
-           $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
-           $cur_node++;
-       }
-       if (!empty($adjective_string)) {
-           $tree["JJ"] = $adjective_string;
-       }
-       $tree['cur_node'] = $cur_node;
-       return $tree;
-   }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a noun if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "NN" a subarray with a token node for the noun string that was
-     *      parsed
-     */
-    public static function extractNoun($tagged_phrase, $tree)
-    {
-        //Combining multiple noun into one
-        $noun_string = "";
-        $cur_node = $tree['cur_node'];
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            (in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$noun_phrases))) {
-            $noun_string .= " " . $tagged_phrase[$cur_node]['token'];
-            $cur_node++;
-        }
-        if (!empty($noun_string)) {
-            $tree["NN"] = $noun_string;
-        }
-        $tree['cur_node'] = $cur_node;
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a sequence of
-     * prepositional phrases if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      parsed followed by additional possible fields (here i
-     *      represents the ith clause found):
-     *      "IN_i" with value a preposition subtree
-     *      "DT_i" with value a determiner subtree
-     *      "JJ_i" with value an adjective subtree
-     *      "NN_i"  with value an additional noun subtree
-     */
-    public static function extractPrepositionalPhrases($tagged_phrase, $tree,
-        $index = 1)
-    {
-       $cur_node = $tree['cur_node'];
-        // Checking for preposition.I.e, format: prep [det] [adjective] noun
-        if (isset($tagged_phrase[$cur_node]['tag']) &&
-            trim($tagged_phrase[$cur_node]['tag']) == "IN") {
-            /* can have multiple prep's in a row, for example,
-               it is known in over 20 countries*/
-            $preposition_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                trim($tagged_phrase[$cur_node]['tag']) == "IN") {
-                $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
-            if (!empty($preposition_string)) {
-                $tree["IN_$index"] = $preposition_string;
-            }
-            if (isset($tagged_phrase[$cur_node]['tag']) &&
-                trim($tagged_phrase[$cur_node]['tag']) == "DT") {
-                $tree['DT_$index'] = $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
-            $adjective_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$adjective_phrases)) {
-                $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
-            if (!empty($adjective_string)) {
-                $tree["JJ_$index"] = $adjective_string;
-            }
-            $prep_noun_string = "";
-            while (isset($tagged_phrase[$cur_node]['tag']) &&
-                in_array(trim($tagged_phrase[$cur_node]['tag']),
-                self::$noun_phrases)) {
-                $prep_noun_string .= " " . $tagged_phrase[$cur_node]['token'];
-                $cur_node++;
-            }
-            if ($prep_noun_string) {
-                $tree["NP_$index"] = $prep_noun_string;
-            }
-            $tree_next = self::extractPrepositionalPhrases($tagged_phrase,
-                ["cur_node" => $cur_node], $index + 1);
-        }
-        $tree['cur_node'] = $cur_node;
-        return $tree;
-    }
-    /**
-     * Takes a part-of-speech tagged phrase and pre-tree with a
-     * parse-from position and builds a parse tree for a verb if possible
-     *
-     * @param array $tagged_phrase 
-     *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
-     * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
-     *      "cur_node" index of how far we parsed $tagged_phrase
-     *      "VB" a subarray with a token node for the verb string that was
-     *      parsed 
-     */
-    public static function extractVerb($tagged_phrase, $tree)
-    {
-        $cur_node = $tree['cur_node'];
-        // skip stuff before verb (intensifiers and adverbs)
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            !in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$verb_phrases)) {
-            $cur_node++;
-        }
-        $verb_string = "";
-        while (isset($tagged_phrase[$cur_node]['tag']) &&
-            in_array(trim($tagged_phrase[$cur_node]['tag']),
-            self::$verb_phrases)) {
-            $verb_string .= " " . $tagged_phrase[$cur_node]['token'];
-            $cur_node++;
-        }
-        if (!empty($verb_string)) {
-            $tree["VB"] = $verb_string;
-        }
-        $tree['cur_node'] = $cur_node;
-        return $tree;
-   }
-    /**
      * Takes a parse tree of a phrase and computes subject, predicate, and
      * object arrays. Each of these array consists of two components CONCISE and
      * RAW, CONCISE corresponding to something more similar to the words in the
@@ -569,22 +240,6 @@ class QuestionAnswerExtractor
         return $extract;
     }
     /**
-     * Takes the phrase and apply the rules in the defined in the
-     * lexicon, assign parts of speech and generate a triplet tree.
-     *
-     * @param $phrase any phrase to be converted into triplets
-     * @param string $lang locale tag to tag parts of speech in phrase
-     * @return array question triplets extracted from phrase
-     */
-    public static function extractTripletsPhrase($phrase, $lang)
-    {
-        $tokenizer = PhraseParser::getTokenizer($lang);
-        $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($phrase);
-        $parse_tree = self::generatePhraseParseTree($tagged_phrase);
-        $triplets = self::extractTripletsParseTree($parse_tree);
-        return self::rearrangeTripletsByType($triplets, $lang);
-    }
-    /**
      * Scans a word list for phrases. For phrases found generate
      * a list of question and answer pairs at two levels of granularity:
      * CONCISE (using all terms in orginal phrase) and RAW (removing
@@ -608,9 +263,13 @@ class QuestionAnswerExtractor
             }, \ARRAY_FILTER_USE_KEY );
         $triplet_types = ['CONCISE', 'RAW'];
         $triplet_parts = ['subject', 'predicate', 'object'];
+        $tokenizer = PhraseParser::getTokenizer($lang);
         foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
-            $extracted_triplets = self::extractTripletsPhrase(
-                $word_and_phrase, $lang);
+            $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase);
+            $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase);
+            $triplets = self::extractTripletsParseTree($parse_tree);
+            $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang);
+
             foreach ($triplet_types as $type) {
                 if (!empty($extracted_triplets[$type])) {
                     $triplet = $extracted_triplets[$type];
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index afeabb1..d8b430b 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -74,6 +74,19 @@ class Tokenizer
         'bruce schnier' => 'bruce schneier',
     ];
     /**
+     * @var array part-of-speech tags treated as verbs by extractVerb
+     */
+    public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP",
+        "VBZ"];
+    /**
+     * @var array part-of-speech tags treated as nouns when parsing phrases
+     */
+    public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"];
+    /**
+     * @var array part-of-speech tags treated as adjectives when parsing
+     */
+    public static $adjective_phrases = ["JJ", "JJR", "JJS"];
+    /**
      * storage used in computing the stem
      * @var string
      */
@@ -529,6 +542,322 @@ class Tokenizer
                 return true;
         }
     }
+    /**
+     * Consumes a single determiner (a term tagged "DT") at the current parse
+     * position of a part-of-speech tagged phrase, if one is present
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array $tree unchanged if the term at cur_node is not tagged
+     *      "DT"; otherwise $tree with
+     *      "cur_node" advanced by one and
+     *      "DT" set to the token of the determiner that was parsed
+     */
+    public static function extractDeterminer($tagged_phrase, $tree)
+    {
+        $cur_node = $tree['cur_node'];
+        if (isset($tagged_phrase[$cur_node]['tag']) &&
+            trim($tagged_phrase[$cur_node]['tag']) == "DT" ) {
+            $tree['DT'] = $tagged_phrase[$cur_node]['token'];
+            $tree['cur_node']++;
+            return $tree;
+        }
+        return $tree;
+    }
+    /**
+     * Consumes a run of consecutive adjectives (terms whose tag is listed in
+     * self::$adjective_phrases) starting at the current parse position
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array $tree with
+     *      "cur_node" index of the first term after the adjectives
+     *      "JJ" only set if an adjective was found: the adjective tokens
+     *      joined into one string, each preceded by a space
+     */
+    public static function extractAdjective($tagged_phrase, $tree)
+    {
+        $adjective_string = "";
+        $cur_node = $tree['cur_node'];
+        while (isset($tagged_phrase[$cur_node]['tag']) &&
+           in_array(trim($tagged_phrase[$cur_node]['tag']),
+           self::$adjective_phrases)) {
+           $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
+           $cur_node++;
+       }
+       if (!empty($adjective_string)) {
+           $tree["JJ"] = $adjective_string;
+       }
+       $tree['cur_node'] = $cur_node;
+       return $tree;
+    }
+    /**
+     * Consumes a run of consecutive nouns (terms whose tag is listed in
+     * self::$noun_phrases) starting at the current parse position
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array $tree with
+     *      "cur_node" index of the first term after the nouns
+     *      "NN" only set if a noun was found: the noun tokens joined
+     *      into one string, each preceded by a space
+     */
+    public static function extractNoun($tagged_phrase, $tree)
+    {
+        //adjacent nouns are combined into a single string
+        $noun_string = "";
+        $cur_node = $tree['cur_node'];
+        while (isset($tagged_phrase[$cur_node]['tag']) &&
+            (in_array(trim($tagged_phrase[$cur_node]['tag']),
+            self::$noun_phrases))) {
+            $noun_string .= " " . $tagged_phrase[$cur_node]['token'];
+            $cur_node++;
+        }
+        if (!empty($noun_string)) {
+            $tree["NN"] = $noun_string;
+        }
+        $tree['cur_node'] = $cur_node;
+        return $tree;
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a parse-from
+     * position and builds a parse tree for one prepositional phrase clause
+     * of the form prep [det] [adjectives] [nouns] if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @param int $index suffix i used in the keys of the clause parsed
+     * @return array $tree with "cur_node" the index of how far we parsed
+     *      $tagged_phrase and, if a clause was found, the possible fields:
+     *      "IN_i" the preposition tokens of the clause
+     *      "DT_i" the determiner token of the clause
+     *      "JJ_i" the adjective tokens of the clause
+     *      "NP_i" the noun tokens of the clause
+     */
+    public static function extractPrepositionalPhrases($tagged_phrase, $tree,
+        $index = 1)
+    {
+        $cur_node = $tree['cur_node'];
+        // Checking for preposition. I.e., format: prep [det] [adjective] noun
+        if (isset($tagged_phrase[$cur_node]['tag']) &&
+            trim($tagged_phrase[$cur_node]['tag']) == "IN") {
+            // preps can be stacked, e.g., it is known in over 20 countries
+            $preposition_string = "";
+            while (isset($tagged_phrase[$cur_node]['tag']) &&
+                trim($tagged_phrase[$cur_node]['tag']) == "IN") {
+                $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
+                $cur_node++;
+            }
+            if (!empty($preposition_string)) {
+                $tree["IN_$index"] = $preposition_string;
+            }
+            if (isset($tagged_phrase[$cur_node]['tag']) &&
+                trim($tagged_phrase[$cur_node]['tag']) == "DT") {
+                // double quotes so $index is interpolated like the other keys
+                $tree["DT_$index"] = $tagged_phrase[$cur_node]['token'];
+                $cur_node++;
+            }
+            $adjective_string = "";
+            while (isset($tagged_phrase[$cur_node]['tag']) &&
+                in_array(trim($tagged_phrase[$cur_node]['tag']),
+                self::$adjective_phrases)) {
+                $adjective_string .= " " . $tagged_phrase[$cur_node]['token'];
+                $cur_node++;
+            }
+            if (!empty($adjective_string)) {
+                $tree["JJ_$index"] = $adjective_string;
+            }
+            $prep_noun_string = "";
+            while (isset($tagged_phrase[$cur_node]['tag']) &&
+                in_array(trim($tagged_phrase[$cur_node]['tag']),
+                self::$noun_phrases)) {
+                $prep_noun_string .= " " . $tagged_phrase[$cur_node]['token'];
+                $cur_node++;
+            }
+            if ($prep_noun_string) {
+                $tree["NP_$index"] = $prep_noun_string;
+            }
+            // NOTE(review): $tree_next is never merged back or returned
+            $tree_next = self::extractPrepositionalPhrases($tagged_phrase,
+                ["cur_node" => $cur_node], $index + 1);
+        }
+        $tree['cur_node'] = $cur_node;
+        return $tree;
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a noun phrase of the
+     * form [det] [adjectives] [nouns] [prepositional phrase] if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array if nothing was parsed, $tree with "NP" set to "";
+     *      otherwise an array with fields
+     *      "cur_node" index of how far we parsed $tagged_phrase
+     *      "NP" a subarray with fields "DT", "JJ", "NN", "PRP" holding the
+     *      determiner, adjective, noun, and prepositional phrase subtrees
+     */
+    public static function extractNounPhrase($tagged_phrase, $tree)
+    {
+        $cur_node = $tree['cur_node'];
+        $tree_dt = self::extractDeterminer($tagged_phrase,
+            ['cur_node' => $cur_node]);
+        $tree_jj = self::extractAdjective($tagged_phrase,
+            ['cur_node' => $tree_dt['cur_node']]);
+        $tree_nn = self::extractNoun($tagged_phrase,
+            ['cur_node' => $tree_jj['cur_node']]);
+        $tree_pp = self::extractPrepositionalPhrases($tagged_phrase,
+            ['cur_node' => $tree_nn['cur_node']]);
+        if ($tree_nn['cur_node'] == $cur_node) {
+            $tree['NP'] = "";
+        } else {
+            $cur_node = $tree_pp['cur_node'];
+            unset($tree_dt['cur_node']);
+            $tree_new_sub['DT'] = $tree_dt;
+            unset($tree_jj['cur_node']);
+            $tree_new_sub['JJ'] = $tree_jj;
+            unset($tree_nn['cur_node']);
+            $tree_new_sub['NN'] = $tree_nn;
+            unset($tree_pp['cur_node']);
+            $tree_new_sub['PRP'] = $tree_pp;
+            $tree_new['cur_node'] = $cur_node;
+            $tree_new['NP'] = $tree_new_sub;
+            return $tree_new;
+        }
+        return $tree;
+    }
+    /**
+     * Skips ahead from the current parse position to the first term whose tag
+     * is listed in self::$verb_phrases and consumes the run of verbs there
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array $tree with "cur_node" the index of the first term after
+     *      the verbs (the first position without a tag if no verb was found)
+     *      and, only if a verb was found, "VB" the verb tokens joined into
+     *      one string, each preceded by a space
+     */
+    public static function extractVerb($tagged_phrase, $tree)
+    {
+        $cur_node = $tree['cur_node'];
+        // skip stuff before verb (intensifiers and adverbs)
+        while (isset($tagged_phrase[$cur_node]['tag']) &&
+            !in_array(trim($tagged_phrase[$cur_node]['tag']),
+            self::$verb_phrases)) {
+            $cur_node++;
+        }
+        $verb_string = "";
+        while (isset($tagged_phrase[$cur_node]['tag']) &&
+            in_array(trim($tagged_phrase[$cur_node]['tag']),
+            self::$verb_phrases)) {
+            $verb_string .= " " . $tagged_phrase[$cur_node]['token'];
+            $cur_node++;
+        }
+        if (!empty($verb_string)) {
+            $tree["VB"] = $verb_string;
+        }
+        $tree['cur_node'] = $cur_node;
+        return $tree;
+    }
+    /**
+     * Takes a part-of-speech tagged phrase and pre-tree with a
+     * parse-from position and builds a parse tree for a verb phrase of the
+     * form verbs [prepositions] [noun phrase] if possible
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @param array $tree that consists of ["cur_node" =>
+     *      current parse position in $tagged_phrase]
+     * @return array $tree unchanged if extractVerb consumed nothing;
+     *      otherwise "cur_node" index of how far we parsed $tagged_phrase
+     *      and "VP" a subarray with "VB" the verb subtree (with any
+     *      prepositions under "IN") and, if found, "NP" a noun phrase subtree
+     */
+    public static function extractVerbPhrase($tagged_phrase, $tree)
+    {
+        $cur_node = $tree['cur_node'];
+        $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]);
+        if ($tree_vb['cur_node'] == $cur_node) {
+            return $tree;
+        }
+        $cur_node = $tree_vb['cur_node'];
+        $preposition_string = "";
+        while (isset($tagged_phrase[$cur_node]['tag']) &&
+            trim($tagged_phrase[$cur_node]['tag']) == "IN") {
+            $preposition_string .= " ". $tagged_phrase[$cur_node]['token'];
+            $cur_node++;
+        }
+        if (!empty($preposition_string)) {
+            $tree_vb["IN"] = $preposition_string;
+        }
+        $tree_np = self::extractNounPhrase($tagged_phrase,
+            ['cur_node' => $cur_node]);
+        $tree_new = [];
+        $tree_new_sub = [];
+        if ($tree_np['cur_node'] !=  $cur_node) {
+            $cur_node = $tree_np['cur_node'];
+            unset($tree_vb['cur_node']);
+            unset($tree_np['cur_node']);
+            $tree_new_sub['VB'] = $tree_vb;
+            $tree_new_sub['NP'] = $tree_np['NP'];
+            $tree_new['cur_node'] = $cur_node;
+            $tree_new['VP'] = $tree_new_sub;
+            return $tree_new;
+        }
+        unset($tree_vb['cur_node']);
+        $tree_new_sub['VB'] = $tree_vb;
+        $tree_new['cur_node'] = $cur_node;
+        $tree_new['VP'] = $tree_new_sub;
+        return $tree_new;
+    }
+    /**
+     * Given a part-of-speech tagged phrase array generates a parse tree
+     * for the phrase using a recursive descent parser.
+     *
+     * @param array $tagged_phrase
+     *      an array of pairs of the form ("token" => token_for_term,
+     *      "tag" => part_of_speech_tag_for_term)
+     * @return array used to represent a tree. The array has up to three fields
+     *      $tree["cur_node"] index of how far we parsed $tagged_phrase
+     *      $tree["NP"], $tree["VP"] subtrees for the noun phrase and verb
+     *      phrase; both are only set when a verb phrase was found
+     */
+    public static function generatePhraseParseTree($tagged_phrase)
+    {
+        $tree = [];
+        //cur_node is the index in tagged_phrase we've parsed to so far
+        $tree_np = self::extractNounPhrase($tagged_phrase, ["cur_node" => 0]);
+        $tree = ["cur_node" => $tree_np['cur_node']];
+        $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree);
+        if ($tree == $tree_vp) {
+            return $tree;
+        }
+        $tree['cur_node'] = $tree_vp['cur_node'];
+        unset($tree_np['cur_node']);
+        unset($tree_vp['cur_node']);
+        $tree['NP'] = $tree_np['NP'];
+        $tree['VP'] = $tree_vp['VP'];
+        return $tree;
+    }
     //private methods for stemming
     /**
      * m() measures the number of consonant sequences between 0 and j. if c is
-- 
2.10.0.windows.1


From d0a730a309c9a175f880b9f90031414119c8f168 Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Tue, 13 Dec 2016 11:20:43 -0800
Subject: [PATCH 2/3] Code Refactoring Part 2: Handling the question passed to
 yioop

---
 src/library/PhraseParser.php             |   6 +-
 src/library/QuestionAnswerExtractor.php  | 407 ---------------------------
 src/locale/en_US/resources/Tokenizer.php | 458 ++++++++++++++++++++++++++++---
 src/models/PhraseModel.php               |   3 +-
 4 files changed, 418 insertions(+), 456 deletions(-)
 delete mode 100644 src/library/QuestionAnswerExtractor.php

diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 76a0aea..dad8b7b 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -33,7 +33,6 @@ namespace seekquarry\yioop\library;
 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\models\LocaleModel;
 use seekquarry\yioop\library\processors\PageProcessor;
-use seekquarry\yioop\library\QuestionAnswerExtractor;
 
 /**
  * For crawlHash
@@ -159,7 +158,8 @@ class PhraseParser
                               stuff for now
                             */
         }
-        if (stristr($whole_phrase, QuestionAnswerExtractor::$question_marker)
+        $tokenizer = self::getTokenizer($lang);
+        if (stristr($whole_phrase, $tokenizer::getQuestionMarker())
             !== false) {
             $terms = [$whole_phrase, $terms[0]];
             return $terms;
@@ -256,7 +256,7 @@ class PhraseParser
             $tokenizer = self::getTokenizer($lang);
             if (method_exists($tokenizer, "tagTokenizePartOfSpeech")) {
                 $triplets_list =
-                    QuestionAnswerExtractor::extractTripletsPhrases(
+                    $tokenizer->extractTripletsPhrases(
                         $phrase_and_sentences["SENTENCES"], $lang);
                 $phrase_and_sentences["TERMS_AND_PHRASES"] =
                     array_merge($phrase_and_sentences["TERMS_AND_PHRASES"],
diff --git a/src/library/QuestionAnswerExtractor.php b/src/library/QuestionAnswerExtractor.php
deleted file mode 100644
index f62f8d4..0000000
--- a/src/library/QuestionAnswerExtractor.php
+++ /dev/null
@@ -1,407 +0,0 @@
-<?php
-/**
- * SeekQuarry/Yioop --
- * Open Source Pure PHP Search Engine, CCONCISEler, and Indexer
- *
- * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
- *
- * LICENSE:
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * @author Chris Pollett chris@pollett.org
- * @license http://www.gnu.org/licenses/ GPL3
- * @link http://www.seekquarry.com/
- * @copyright 2009 - 2016
- * @filesource
- */
-namespace seekquarry\yioop\library;
-
-use seekquarry\yioop\configs as C;
-use seekquarry\yioop\library as L;
-
-/**
- * A class to extract triplet so that they may be used as part of a question
- * answering system
- *
- * @author Nirav Patel (revisited by Salil Shenoy, code clean up pass
- * Chris Pollett)
- * @package seekquarry\yioop\library
- */
-class QuestionAnswerExtractor
-{
-    /**
-     * Any unique identifier corresponding to the component of a triplet which
-     * can be answered using a question answer list
-     * @string
-     */
-    public static $question_marker = "qqq";
-    /**
-     * Takes a parse tree of a phrase and computes subject, predicate, and
-     * object arrays. Each of these array consists of two components CONCISE and
-     * RAW, CONCISE corresponding to something more similar to the words in the
-     * original phrase and RAW to the case where extraneous words have been
-     * removed
-     *
-     * @param are $parse_tree a parse tree for a sentence
-     * @return array triplet array
-     */
-    public static function extractTripletsParseTree($tree)
-    {
-        $triplets = [];
-        $triplets['subject'] = self::extractSubjectParseTree($tree);
-        $triplets['predicate'] = self::extractPredicateParseTree($tree);
-        $triplets['object'] = self::extractObjectParseTree($tree);
-        return $triplets;
-    }
-    /**
-     * Takes a triplets array with subject, predicate, object fields with
-     * CONCISE and RAW subfields and rearranges it to have two fields CONCISE
-     * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST
-     * subfields
-     *
-     * @param array $sub_pred_obj_triplets in format described above
-     * @param string $lang locale tag for stem chargramming and segmenting
-     * @return array $processed_triplets in format described above
-     */
-    public static function rearrangeTripletsByType($sub_pred_obj_triplets,
-        $lang)
-    {
-        $processed_triplet = [];
-        $processed_triplets['CONCISE'] =
-            self::extractTripletByType($sub_pred_obj_triplets, "CONCISE",
-            $lang);
-        $processed_triplets['RAW'] =
-            self::extractTripletByType($sub_pred_obj_triplets, "RAW", $lang);
-        return $processed_triplets;
-    }
-    /**
-     * Takes a triplets array with subject, predicate, object fields with
-     * CONCISE, RAW subfields and produces a triplits with $type subfield (where
-     * $type is one of CONCISE and RAW) and with subject, predicate, object,
-     * and QUESTION_ANSWER_LIST subfields
-     *
-     * @param array $sub_pred_obj_triplets  in format described above
-     * @param string $lang locale tag for stem chargramming and segmentin
-     * @return array $triplets in format described above
-     */
-    public static function extractTripletByType($sub_pred_obj_triplets, $type,
-        $lang)
-    {
-        $triplets = [];
-        if (!empty($sub_pred_obj_triplets['subject'][$type])
-            && !empty($sub_pred_obj_triplets['predicate'][$type])
-            && !empty($sub_pred_obj_triplets['object'][$type])) {
-            $question_answer_triplets = [];
-            $question_marker = self::$question_marker;
-            $tokenizer = PhraseParser::getTokenizer($lang);
-            $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]),
-                trim($sub_pred_obj_triplets['predicate'][$type]),
-                trim($sub_pred_obj_triplets['object'][$type])];
-            $parts = ['subject', 'predicate', 'object'];
-            for ($i = 0; $i < 3; $i++) {
-                $q_sentence = $sentence;
-                $q_sentence[$i] = $question_marker;
-                $q_sentence_string = implode(" ", $q_sentence);
-                $q_sentence_string = PhraseParser::stemCharGramSegment(
-                    $q_sentence_string, $lang, true);
-                $triplets[$parts[$i]] = $q_sentence_string;
-                $question_answer_triplets[$q_sentence_string] =
-                    PhraseParser::stemCharGramSegment($sentence[$i], $lang,
-                    true);
-            }
-            $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets;
-        }
-        return $triplets;
-    }
-    /**
-     * Takes a parse tree of a phrase or statement and returns an array
-     * with two fields CONCISE and RAW the former having the subject of
-     * the original phrase (as a string) the latter having the importart
-     * parts of the subject
-     *
-     * @param array representation of a parse tree of a phrase
-     * @return array with two fields CONCISE and RAW as described above
-     */
-    public static function extractSubjectParseTree($tree)
-    {
-        $subject = [];
-        if (!empty($tree['NP'])) {
-            $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase(
-                $tree['NP'], "NN");
-            $raw_subject = "";
-            $it = new \RecursiveIteratorIterator(
-                new \RecursiveArrayIterator($tree['NP']));
-            foreach ($it as $v) {
-                $raw_subject .= $v . " ";
-            }
-            $subject['RAW'] = $raw_subject;
-        } else {
-            $subject['CONCISE'] = "";
-            $subject['RAW'] = "";
-        }
-        return $subject;
-    }
-
-    /**
-     * Takes a parse tree of a phrase or statement and returns an array
-     * with two fields CONCISE and RAW the former having the predicate of
-     * the original phrase (as a string) the latter having the importart
-     * parts of the predicate
-     *
-     * @param array representation of a parse tree of a phrase
-     * @return array with two fields CONCISE and RAW as described above
-     */
-    public static function extractPredicateParseTree($tree)
-    {
-        $predicate = [];
-        if (!empty($tree['VP'])) {
-            $tree_vp = $tree['VP'];
-            $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
-                $tree_vp, "VB");
-            $raw_predicate = "";
-            if (!empty($tree_vp['VB'])) {
-                $tree_vb = $tree_vp['VB'];
-                $it = new \RecursiveIteratorIterator(
-                    new \RecursiveArrayIterator($tree_vb));
-                foreach ($it as $v) {
-                    $raw_predicate .= $v . " ";
-                }
-                $predicate['RAW'] = $raw_predicate;
-            }
-        } else {
-            $predicate['CONCISE'] = "";
-            $predicate['RAW'] = "";
-        }
-        return $predicate;
-    }
-    /**
-     * Takes a parse tree of a phrase or statement and returns an array
-     * with two fields CONCISE and RAW the former having the object of
-     * the original phrase (as a string) the latter having the importart
-     * parts of the object
-     *
-     * @param array representation of a parse tree of a phrase
-     * @return array with two fields CONCISE and RAW as described above
-     */
-    public static function extractObjectParseTree($tree)
-    {
-        $object = [];
-        if (!empty($tree['VP'])) {
-            $tree_vp = $tree['VP'];
-            if (!empty($tree_vp['NP'])) {
-                $nb = $tree_vp['NP'];
-                $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb,
-                    "NN");
-                $raw_object = "";
-                $it = new \RecursiveIteratorIterator(
-                    new \RecursiveArrayIterator($nb));
-                foreach ($it as $v) {
-                    $raw_object .= $v . " ";
-                }
-                $object['RAW'] = $raw_object;
-            } else {
-                $object['CONCISE'] = "";
-                $object['RAW'] = "";
-            }
-        } else {
-            $object['CONCISE'] = "";
-            $object['RAW'] = "";
-        }
-        return $object;
-    }
-    /**
-     * Takes phrase tree $tree and a part-of-speech $pos returns
-     * the deepest $pos only path in tree.
-     *
-     * @param $tree phrase to extract type from
-     * @return string deepest verb
-     */
-    public static function extractDeepestSpeechPartPhrase($tree, $pos)
-    {
-        $extract = "";
-        if (!empty($tree[$pos])) {
-            $extract = self::extractDeepestSpeechPartPhrase($tree[$pos], $pos);
-        }
-        if (!$extract && !empty($tree[$pos]) && !empty($tree[$pos][$pos])) {
-            $extract = $tree[$pos][$pos];
-        }
-        return $extract;
-    }
-    /**
-     * Scans a word list for phrases. For phrases found generate
-     * a list of question and answer pairs at two levels of granularity:
-     * CONCISE (using all terms in orginal phrase) and RAW (removing
-     * (adjectives, etc).
-     *
-     * @param array $word_and_phrase_list of statements
-     * @param string $lang locale tag to tag parts of speech in phrase
-     * @return array with two fields: QUESTION_LIST consisting of triplets
-     *      (SUBJECT, PREDICATES, OBJECT) where one of the components has been
-     *      replaced with a question marker.
-     */
-    public static function extractTripletsPhrases($word_and_phrase_list,
-        $lang)
-    {
-        $triplets_list = [];
-        $question_list = [];
-        $question_answer_list = [];
-        $word_and_phrase_list = array_filter($word_and_phrase_list,
-            function ($key) { 
-                return str_word_count($key) >= C\PHRASE_THRESHOLD;
-            }, \ARRAY_FILTER_USE_KEY );
-        $triplet_types = ['CONCISE', 'RAW'];
-        $triplet_parts = ['subject', 'predicate', 'object'];
-        $tokenizer = PhraseParser::getTokenizer($lang);
-        foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
-            $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase);
-            $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase);
-            $triplets = self::extractTripletsParseTree($parse_tree);
-            $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang);
-
-            foreach ($triplet_types as $type) {
-                if (!empty($extracted_triplets[$type])) {
-                    $triplet = $extracted_triplets[$type];
-                    foreach ($triplet_parts as $part) {
-                        if(!empty($triplet[$part])) {
-                            $question_list[$triplet[$part]] = $position_list;
-                        }
-                    }
-                    $question_answer_list = array_merge($question_answer_list,
-                        $triplet['QUESTION_ANSWER_LIST']);
-                }
-            }
-        }
-        $triplets_list['QUESTION_LIST'] = $question_list;
-        $triplets_list['QUESTION_ANSWER_LIST'] = $question_answer_list;
-        return $triplets_list;
-    }
-    /**
-     * Takes any question started with WH question and returns the
-     * triplet from the question
-     *
-     * @param string $question question to parse
-     * @param string $lang locale tag to tag parts of speech in phrase
-     * @return array question triplet
-     */
-    public static function questionParser($question, $lang)
-    {
-        $tokenizer = PhraseParser::getTokenizer($lang);
-        $tagged_question = $tokenizer->tagTokenizePartOfSpeech($question);
-        $generated_question_array = [];
-        if (isset($tagged_question[0])) {
-            if (in_array(trim($tagged_question[0]['tag']),
-                ["WRB", "WP"])) {
-                $token = strtoupper(trim($tagged_question[0]['token']));
-                if ($token == "WHO") {
-                    $generated_question_array = self::parseWhoQuestion(
-                        $tagged_question, 1);
-                } else if (in_array($token, ["WHERE", "WHEN", "WHAT"])) {
-                    $generated_question_array = self::parseWHPlusQuestion(
-                        $tagged_question, 1);
-                }
-            }
-        }
-        return $generated_question_array;
-    }
-    /**
-     * Takes tagged question string starts with Who
-     * and returns question triplet from the question string
-     *
-     * @param string $tagged_question part-of-speech tagged question
-     * @param int $index current index in statement
-     * @return array parsed triplet
-     */
-    public static function parseWhoQuestion($tagged_question, $index)
-    {;
-        $generated_questions = [];
-        $question_marker = self::$question_marker;
-        $tree = ["cur_node" => $index];
-        $tree['NP'] = "WHO";
-        $triplets = [];
-        $tree_vp = self::extractVerbPhrase($tagged_question, $tree);
-        $triplets['predicate'] = self::extractPredicateParseTree(
-            $tree_vp);
-        $triplets['object'] = self::extractObjectParseTree(
-            $tree_vp);
-        $triplet_types = ['CONCISE', 'RAW'];
-        foreach ($triplet_types as $type) {
-            if (!empty($triplets['object'][$type])
-                && !empty($triplets['predicate'][$type])) {
-                $generated_questions[$type][] =
-                    trim($triplets['object'][$type]) .
-                    " " . trim($triplets['predicate'][$type]) . " " .
-                    $question_marker;
-                $generated_questions[$type][] = $question_marker .
-                    " " . trim($triplets['predicate'][$type]) .
-                    " " . trim($triplets['object'][$type]);
-            }
-        }
-        return $generated_questions;
-    }
-    /**
-     * Takes tagged question string starts with Wh+ except Who
-     * and returns question triplet from the question string
-     * Unlike the WHO case, here we assume there is an auxliary verb
-     * followed by a noun phrase then the rest of the verb phrase. For example,
-     * Where is soccer played?
-     *
-     * @param string $tagged_question part-of-speech tagged question
-     * @param $index current index in statement
-     * @return array parsed triplet suitable for query look-up
-     */
-    public static function parseWHPlusQuestion($tagged_question, $index)
-    {
-        $generated_questions = [];
-        $aux_verb = "";
-        $question_marker = self::$question_marker;
-        while (isset($tagged_question[$index]) &&
-            in_array(trim($tagged_question[$index]['tag']),
-            self::$verb_phrases)) {
-            $token = trim($tagged_question[$index]['token']);
-            $aux_verb .= " " . $token;
-            $index++;
-        }
-        $tree = ["cur_node" => $index];
-        $tree['NP'] = "WHPlus";
-        $triplets = [];
-        $tree_np = self::extractNounPhrase($tagged_question, $tree);
-        $triplets['subject'] = self::extractSubjectParseTree($tree_np);
-        $tree_vp = self::extractVerbPhrase($tagged_question, $tree_np);
-        $triplets['predicate'] = self::extractPredicateParseTree($tree_vp);
-        if (!empty($aux_verb)) {
-            $triplets['predicate']['CONCISE'] = trim($aux_verb) .
-                " " . $triplets['predicate']['CONCISE'];
-            if (!isset($triplets['predicate']['RAW'])) {
-                $triplets['predicate']['RAW'] = "";
-            }
-            $triplets['predicate']['RAW'] = trim($aux_verb) .
-                " " . $triplets['predicate']['RAW'];
-        }
-        $triplet_types = ['CONCISE', 'RAW'];
-        foreach ($triplet_types as $type) {
-            if (!empty($triplets['subject'][$type])&&
-                !empty($triplets['predicate'][$type])) {
-                $generated_questions[$type][] =
-                    trim($triplets['subject'][$type]) .
-                    " " . trim($triplets['predicate'][$type]) .
-                    " " . $question_marker;
-                $generated_questions[$type][] = $question_marker.
-                    " " . trim($triplets['predicate'][$type]) .
-                    " " . trim($triplets['subject'][$type]);
-            }
-        }
-        return $generated_questions;
-    }
-}
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index d8b430b..eeee5b3 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -74,6 +74,12 @@ class Tokenizer
         'bruce schnier' => 'bruce schneier',
     ];
     /**
+     * Any unique identifier corresponding to the component of a triplet which
+     * can be answered using a question answer list
+     * @string
+     */
+    public static $question_marker = "qqq";
+    /**
      * @array
      */
     public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP",
@@ -519,39 +525,37 @@ class Tokenizer
         $result = self::numberSix($result);
         return $result;
     }
-    // private methods for stemming
     /**
-     * Checks to see if the ith character in the buffer is a consonant
+     * Takes a triplets array with subject, predicate, object fields with
+     * CONCISE and RAW subfields and rearranges it to have two fields CONCISE
+     * and RAW with subject, predicate, object, and QUESTION_ANSWER_LIST
+     * subfields
      *
-     * @param int $i the character to check
-     * @return if the ith character is a constant
+     * @param array $sub_pred_obj_triplets in format described above
+     * @param string $lang locale tag for stem chargramming and segmenting
+     * @return array $processed_triplets in format described above
      */
-    private static function cons($i)
+    public static function rearrangeTripletsByType($sub_pred_obj_triplets,
+        $lang)
     {
-        switch (self::$buffer[$i]) {
-            case 'a':
-                // no break
-            case 'e':
-            case 'i':
-            case 'o':
-            case 'u':
-                return false;
-            case 'y':
-                return ($i== 0 ) ? true : !self::cons($i - 1);
-            default:
-                return true;
-        }
+        // Keyed by granularity; both entries come from extractTripletByType.
+        $processed_triplets = [];
+        $processed_triplets['CONCISE'] = self::extractTripletByType(
+            $sub_pred_obj_triplets, "CONCISE", $lang);
+        $processed_triplets['RAW'] = self::extractTripletByType(
+            $sub_pred_obj_triplets, "RAW", $lang);
+        return $processed_triplets;
     }
-            /**
+    /**
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a determiner if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "DT" a subarray with a token node for the determiner that was
      *      parsed
@@ -571,12 +575,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for an adjective if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "JJ" a subarray with a token node for the adjective that was
      *      parsed
@@ -601,12 +605,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a noun if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "NN" a subarray with a token node for the noun string that was
      *      parsed
@@ -633,12 +637,12 @@ class Tokenizer
      * parse-from position and builds a parse tree for a sequence of
      * prepositional phrases if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      parsed followed by additional possible fields (here i
      *      represents the ith clause found):
@@ -700,12 +704,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a noun phrase if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "NP" a subarray with possible fields
      *      "DT" with value a determiner subtree
@@ -745,15 +749,15 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a verb if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "VB" a subarray with a token node for the verb string that was
-     *      parsed
+     *      parsed 
      */
     public static function extractVerb($tagged_phrase, $tree)
     {
@@ -781,12 +785,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a verb phrase if possible
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase]
-     * @return array has fields
+     *      current parse position in $tagged_phrase] 
+     * @return array has fields 
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "VP" a subarray with possible fields
      *      "VB" with value a verb subtree
@@ -833,9 +837,9 @@ class Tokenizer
      * Given a part-of-speeech tagged phrase array generates a parse tree
      * for the phrase using a recursive descent parser.
      *
-     * @param array $tagged_phrase
+     * @param array $tagged_phrase 
      *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term)
+     *     "tag"=> part_of_speech_tag_for_term) 
      * @return array used to represent a tree. The array has up to three fields
      *      $tree["cur_node"] index of how far we parsed our$tagged_phrase
      *      $tree["NP"] contains a subtree for a noun phrase
@@ -858,7 +862,364 @@ class Tokenizer
         $tree['VP'] = $tree_vp['VP'];
         return $tree;
     }
-    //private methods for stemming
+    /**
+     * Takes a parse tree of a phrase and computes subject, predicate, and
+     * object arrays. Each of these array consists of two components CONCISE and
+     * RAW, CONCISE corresponding to something more similar to the words in the
+     * original phrase and RAW to the case where extraneous words have been
+     * removed
+     *
+     * @param array $tree a parse tree for a sentence
+     * @return array triplet array
+     */
+    public static function extractTripletsParseTree($tree)
+    {
+        $triplets = [];
+        $triplets['subject'] = self::extractSubjectParseTree($tree);
+        $triplets['predicate'] = self::extractPredicateParseTree($tree);
+        $triplets['object'] = self::extractObjectParseTree($tree);
+        return $triplets;
+    }
+    /**
+     * Scans a word list for phrases. For phrases found generate
+     * a list of question and answer pairs at two levels of granularity:
+     * CONCISE (using all terms in original phrase) and RAW (removing
+     * adjectives, etc).
+     *
+     * @param array $word_and_phrase_list of statements
+     * @param string $lang locale tag to tag parts of speech in phrase
+     * @return array with two fields: QUESTION_LIST consisting of triplets
+     *      (SUBJECT, PREDICATES, OBJECT) where one of the components has been
+     *      replaced with a question marker.
+     */
+    public static function extractTripletsPhrases($word_and_phrase_list,
+        $lang)
+    {
+        $triplets_list = [];
+        $question_list = [];
+        $question_answer_list = [];
+        $word_and_phrase_list = array_filter($word_and_phrase_list,
+            function ($key) { 
+                return str_word_count($key) >= C\PHRASE_THRESHOLD;
+            }, \ARRAY_FILTER_USE_KEY );
+        $triplet_types = ['CONCISE', 'RAW'];
+        $triplet_parts = ['subject', 'predicate', 'object'];
+        $tokenizer = PhraseParser::getTokenizer($lang);
+        foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
+            $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase);
+            $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase);
+            $triplets = self::extractTripletsParseTree($parse_tree);
+            $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang);
+
+            foreach ($triplet_types as $type) {
+                if (!empty($extracted_triplets[$type])) {
+                    $triplet = $extracted_triplets[$type];
+                    foreach ($triplet_parts as $part) {
+                        if(!empty($triplet[$part])) {
+                            $question_list[$triplet[$part]] = $position_list;
+                        }
+                    }
+                    $question_answer_list = array_merge($question_answer_list,
+                        $triplet['QUESTION_ANSWER_LIST']);
+                }
+            }
+        }
+        $triplets_list['QUESTION_LIST'] = $question_list;
+        $triplets_list['QUESTION_ANSWER_LIST'] = $question_answer_list;
+        return $triplets_list;
+    }
+    /**
+     * Takes phrase tree $tree and a part-of-speech $pos returns
+     * the deepest $pos only path in tree.
+     *
+     * @param $tree phrase to extract type from
+     * @return string deepest $pos phrase found ("" if there is none)
+     */
+    public static function extractDeepestSpeechPartPhrase($tree, $pos)
+    {
+        $extract = "";
+        if (!empty($tree[$pos])) {
+            $extract = self::extractDeepestSpeechPartPhrase($tree[$pos], $pos);
+        }
+        if (!$extract && !empty($tree[$pos]) && !empty($tree[$pos][$pos])) {
+            $extract = $tree[$pos][$pos];
+        }
+        return $extract;
+    }
+    /**
+     * Takes a parse tree of a phrase or statement and returns an array
+     * with two fields CONCISE and RAW the former having the object of
+     * the original phrase (as a string) the latter having the important
+     * parts of the object
+     *
+     * @param array representation of a parse tree of a phrase
+     * @return array with two fields CONCISE and RAW as described above
+     */
+    public static function extractObjectParseTree($tree)
+    {
+        $object = [];
+        if (!empty($tree['VP'])) {
+            $tree_vp = $tree['VP'];
+            if (!empty($tree_vp['NP'])) {
+                $nb = $tree_vp['NP'];
+                $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb,
+                    "NN");
+                $raw_object = "";
+                $it = new \RecursiveIteratorIterator(
+                    new \RecursiveArrayIterator($nb));
+                foreach ($it as $v) {
+                    $raw_object .= $v . " ";
+                }
+                $object['RAW'] = $raw_object;
+            } else {
+                $object['CONCISE'] = "";
+                $object['RAW'] = "";
+            }
+        } else {
+            $object['CONCISE'] = "";
+            $object['RAW'] = "";
+        }
+        return $object;
+    }
+    /**
+     * Takes a parse tree of a phrase or statement and returns an array
+     * with two fields CONCISE and RAW the former having the predicate of
+     * the original phrase (as a string) the latter having the important
+     * parts of the predicate
+     *
+     * @param array representation of a parse tree of a phrase
+     * @return array with two fields CONCISE and RAW as described above
+     */
+    public static function extractPredicateParseTree($tree)
+    {
+        $predicate = [];
+        if (!empty($tree['VP'])) {
+            $tree_vp = $tree['VP'];
+            $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase(
+                $tree_vp, "VB");
+            $raw_predicate = "";
+            if (!empty($tree_vp['VB'])) {
+                $tree_vb = $tree_vp['VB'];
+                $it = new \RecursiveIteratorIterator(
+                    new \RecursiveArrayIterator($tree_vb));
+                foreach ($it as $v) {
+                    $raw_predicate .= $v . " ";
+                }
+                $predicate['RAW'] = $raw_predicate;
+            }
+        } else {
+            $predicate['CONCISE'] = "";
+            $predicate['RAW'] = "";
+        }
+        return $predicate;
+    }
+    /**
+     * Takes a parse tree of a phrase or statement and returns an array
+     * with two fields CONCISE and RAW the former having the subject of
+     * the original phrase (as a string) the latter having the important
+     * parts of the subject
+     *
+     * @param array representation of a parse tree of a phrase
+     * @return array with two fields CONCISE and RAW as described above
+     */
+    public static function extractSubjectParseTree($tree)
+    {
+        $subject = [];
+        if (!empty($tree['NP'])) {
+            $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase(
+                $tree['NP'], "NN");
+            $raw_subject = "";
+            $it = new \RecursiveIteratorIterator(
+                new \RecursiveArrayIterator($tree['NP']));
+            foreach ($it as $v) {
+                $raw_subject .= $v . " ";
+            }
+            $subject['RAW'] = $raw_subject;
+        } else {
+            $subject['CONCISE'] = "";
+            $subject['RAW'] = "";
+        }
+        return $subject;
+    }
+    /**
+     * Takes a tagged question that starts with Who
+     * and returns question triplet from the question string
+     *
+     * @param array $tagged_question part-of-speech tagged question
+     * @param int $index current index in statement
+     * @return array parsed triplet
+     */
+    public static function parseWhoQuestion($tagged_question, $index)
+    {
+        $generated_questions = [];
+        $question_marker = self::getQuestionMarker();
+        $tree = ["cur_node" => $index];
+        $tree['NP'] = "WHO";
+        $triplets = [];
+        $tree_vp = self::extractVerbPhrase($tagged_question, $tree);
+        $triplets['predicate'] = self::extractPredicateParseTree(
+            $tree_vp);
+        $triplets['object'] = self::extractObjectParseTree(
+            $tree_vp);
+        $triplet_types = ['CONCISE', 'RAW'];
+        foreach ($triplet_types as $type) {
+            if (!empty($triplets['object'][$type])
+                && !empty($triplets['predicate'][$type])) {
+                $generated_questions[$type][] =
+                    trim($triplets['object'][$type]) .
+                    " " . trim($triplets['predicate'][$type]) . " " .
+                    $question_marker;
+                $generated_questions[$type][] = $question_marker .
+                    " " . trim($triplets['predicate'][$type]) .
+                    " " . trim($triplets['object'][$type]);
+            }
+        }
+        return $generated_questions;
+    }
+    /**
+     * Takes a tagged question that starts with Wh+ (other than Who)
+     * and returns question triplet from the question string
+     * Unlike the WHO case, here we assume there is an auxiliary verb
+     * followed by a noun phrase then the rest of the verb phrase. For example,
+     * Where is soccer played?
+     *
+     * @param array $tagged_question part-of-speech tagged question
+     * @param int $index current index in statement
+     * @return array parsed triplet suitable for query look-up
+     */
+    public static function parseWHPlusQuestion($tagged_question, $index)
+    {
+        $generated_questions = [];
+        $aux_verb = "";
+        $question_marker = self::getQuestionMarker();
+        while (isset($tagged_question[$index]) &&
+            in_array(trim($tagged_question[$index]['tag']),
+            self::$verb_phrases)) {
+            $token = trim($tagged_question[$index]['token']);
+            $aux_verb .= " " . $token;
+            $index++;
+        }
+        $tree = ["cur_node" => $index];
+        $tree['NP'] = "WHPlus";
+        $triplets = [];
+        $tree_np = self::extractNounPhrase($tagged_question, $tree);
+        $triplets['subject'] = self::extractSubjectParseTree($tree_np);
+        $tree_vp = self::extractVerbPhrase($tagged_question, $tree_np);
+        $triplets['predicate'] = self::extractPredicateParseTree($tree_vp);
+        if (!empty($aux_verb)) {
+            $triplets['predicate']['CONCISE'] = trim($aux_verb) .
+                " " . $triplets['predicate']['CONCISE'];
+            if (!isset($triplets['predicate']['RAW'])) {
+                $triplets['predicate']['RAW'] = "";
+            }
+            $triplets['predicate']['RAW'] = trim($aux_verb) .
+                " " . $triplets['predicate']['RAW'];
+        }
+        $triplet_types = ['CONCISE', 'RAW'];
+        foreach ($triplet_types as $type) {
+            if (!empty($triplets['subject'][$type])&&
+                !empty($triplets['predicate'][$type])) {
+                $generated_questions[$type][] =
+                    trim($triplets['subject'][$type]) .
+                    " " . trim($triplets['predicate'][$type]) .
+                    " " . $question_marker;
+                $generated_questions[$type][] = $question_marker.
+                    " " . trim($triplets['predicate'][$type]) .
+                    " " . trim($triplets['subject'][$type]);
+            }
+        }
+        return $generated_questions;
+    }
+    /**
+     * Takes any question that starts with a WH word and returns the
+     * triplet from the question
+     *
+     * @param string $question question to parse
+     * @param string $lang locale tag to tag parts of speech in phrase
+     * @return array question triplet
+     */
+    public static function questionParser($question, $lang)
+    {
+        $tokenizer = PhraseParser::getTokenizer($lang);
+        $tagged_question = $tokenizer->tagTokenizePartOfSpeech($question);
+        $generated_question_array = [];
+        if (isset($tagged_question[0])) {
+            if (in_array(trim($tagged_question[0]['tag']),
+                ["WRB", "WP"])) {
+                $token = strtoupper(trim($tagged_question[0]['token']));
+                if ($token == "WHO") {
+                    $generated_question_array = self::parseWhoQuestion(
+                        $tagged_question, 1);
+                } else if (in_array($token, ["WHERE", "WHEN", "WHAT"])) {
+                    $generated_question_array = self::parseWHPlusQuestion(
+                        $tagged_question, 1);
+                }
+            }
+        }
+        return $generated_question_array;
+    }
+    /**
+     * Takes a triplets array with subject, predicate, object fields with
+     * CONCISE, RAW subfields and produces triplets with $type subfield (where
+     * $type is one of CONCISE and RAW) and with subject, predicate, object,
+     * and QUESTION_ANSWER_LIST subfields
+     *
+     * @param array $sub_pred_obj_triplets  in format described above
+     * @param string $lang locale tag for stem chargramming and segmenting
+     * @return array $triplets in format described above
+     */
+    public static function extractTripletByType($sub_pred_obj_triplets, $type,
+        $lang)
+    {
+        $triplets = [];
+        if (!empty($sub_pred_obj_triplets['subject'][$type])
+            && !empty($sub_pred_obj_triplets['predicate'][$type])
+            && !empty($sub_pred_obj_triplets['object'][$type])) {
+            $question_answer_triplets = [];
+            $question_marker = self::$question_marker;
+            $tokenizer = PhraseParser::getTokenizer($lang);
+            $sentence = [ trim($sub_pred_obj_triplets['subject'][$type]),
+                trim($sub_pred_obj_triplets['predicate'][$type]),
+                trim($sub_pred_obj_triplets['object'][$type])];
+            $parts = ['subject', 'predicate', 'object'];
+            for ($i = 0; $i < 3; $i++) {
+                $q_sentence = $sentence;
+                $q_sentence[$i] = $question_marker;
+                $q_sentence_string = implode(" ", $q_sentence);
+                $q_sentence_string = PhraseParser::stemCharGramSegment(
+                    $q_sentence_string, $lang, true);
+                $triplets[$parts[$i]] = $q_sentence_string;
+                $question_answer_triplets[$q_sentence_string] =
+                    PhraseParser::stemCharGramSegment($sentence[$i], $lang,
+                    true);
+            }
+            $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets;
+        }
+        return $triplets;
+    }
+    // private methods for stemming
+    /**
+     * Checks to see if the ith character in the buffer is a consonant
+     *
+     * @param int $i the character to check
+     * @return bool whether the ith character is a consonant
+     */
+    private static function cons($i)
+    {
+        switch (self::$buffer[$i]) {
+            case 'a':
+                // no break
+            case 'e':
+            case 'i':
+            case 'o':
+            case 'u':
+                return false;
+            case 'y':
+                return ($i== 0 ) ? true : !self::cons($i - 1);
+            default:
+                return true;
+        }
+    }
     /**
      * m() measures the number of consonant sequences between 0 and j. if c is
      * a consonant sequence and v a vowel sequence, and [.] indicates arbitrary
@@ -1217,6 +1578,15 @@ class Tokenizer
         }
         return $tagged_phrase;
     }
+    /**
+     * The function returns the question marker for the locale
+     *
+     * @return the question marker
+     */
+    public static function getQuestionMarker() 
+    {
+        return self::$question_marker;
+    }
     //private methods for sentence compression
     /**
      * From Back to Basics: CLASSY 2006 page 3:
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 1af65c4..81df13f 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -38,7 +38,6 @@ use seekquarry\yioop\library\IndexManager;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\Thesaurus;
 use seekquarry\yioop\library\index_bundle_iterators as I;
-use seekquarry\yioop\library\QuestionAnswerExtractor;
 
 /**
  * logging is done during crawl not through web,
@@ -917,7 +916,7 @@ class PhraseModel extends ParallelModel
         if (!empty($tokenizer) && method_exists($tokenizer, "isQuestion") &&
             method_exists($tokenizer, "tagTokenizePartOfSpeech") &&
             $tokenizer->isQuestion($phrase)) {
-            $generated_question = QuestionAnswerExtractor::questionParser(
+            $generated_question = $tokenizer->questionParser(
                 $phrase, $tag);
             if (!empty($generated_question['CONCISE'])) {
                 $phrase = $generated_question['CONCISE'][0];
-- 
2.10.0.windows.1


From 8284eaa91491a3ecd51509fb8ae7352b359b2793 Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Tue, 13 Dec 2016 11:29:21 -0800
Subject: [PATCH 3/3] Code Refactor Part 3: Ran Cleanup Utilities

---
 src/library/PhraseParser.php             |  4 +--
 src/locale/en_US/resources/Tokenizer.php | 62 ++++++++++++++++----------------
 src/models/PhraseModel.php               |  8 ++---
 3 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index dad8b7b..2d23f76 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -708,7 +708,7 @@ class PhraseParser
         return self::getNGramsTerm($terms, $n);
     }
     /**
-     * Returns the characters n-grams for the given terms where n is the 
+     * Returns the characters n-grams for the given terms where n is the
      * length.
      *
      * @param array $terms the terms to make n-grams for
@@ -1360,4 +1360,4 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け
         }
         return $result;
     }
-}
+}
\ No newline at end of file
diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php
index eeee5b3..e13594f 100755
--- a/src/locale/en_US/resources/Tokenizer.php
+++ b/src/locale/en_US/resources/Tokenizer.php
@@ -550,12 +550,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a determiner if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "DT" a subarray with a token node for the determiner that was
      *      parsed
@@ -575,12 +575,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for an adjective if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "JJ" a subarray with a token node for the adjective that was
      *      parsed
@@ -605,12 +605,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a noun if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "NN" a subarray with a token node for the noun string that was
      *      parsed
@@ -637,12 +637,12 @@ class Tokenizer
      * parse-from position and builds a parse tree for a sequence of
      * prepositional phrases if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["cur_node" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      parsed followed by additional possible fields (here i
      *      represents the ith clause found):
@@ -704,12 +704,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a noun phrase if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "NP" a subarray with possible fields
      *      "DT" with value a determiner subtree
@@ -749,15 +749,15 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a verb if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "VB" a subarray with a token node for the verb string that was
-     *      parsed 
+     *      parsed
      */
     public static function extractVerb($tagged_phrase, $tree)
     {
@@ -785,12 +785,12 @@ class Tokenizer
      * Takes a part-of-speech tagged phrase and pre-tree with a
      * parse-from position and builds a parse tree for a verb phrase if possible
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
      *     "tag"=> part_of_speech_tag_for_term)
      * @param array $tree that consists of ["curnode" =>
-     *      current parse position in $tagged_phrase] 
-     * @return array has fields 
+     *      current parse position in $tagged_phrase]
+     * @return array has fields
      *      "cur_node" index of how far we parsed $tagged_phrase
      *      "VP" a subarray with possible fields
      *      "VB" with value a verb subtree
@@ -837,9 +837,9 @@ class Tokenizer
      * Given a part-of-speeech tagged phrase array generates a parse tree
      * for the phrase using a recursive descent parser.
      *
-     * @param array $tagged_phrase 
+     * @param array $tagged_phrase
      *      an array of pairs of the form ("token" => token_for_term,
-     *     "tag"=> part_of_speech_tag_for_term) 
+     *     "tag"=> part_of_speech_tag_for_term)
      * @return array used to represent a tree. The array has up to three fields
      *      $tree["cur_node"] index of how far we parsed our$tagged_phrase
      *      $tree["NP"] contains a subtree for a noun phrase
@@ -899,23 +899,25 @@ class Tokenizer
         $question_list = [];
         $question_answer_list = [];
         $word_and_phrase_list = array_filter($word_and_phrase_list,
-            function ($key) { 
+            function ($key) {
                 return str_word_count($key) >= C\PHRASE_THRESHOLD;
             }, \ARRAY_FILTER_USE_KEY );
         $triplet_types = ['CONCISE', 'RAW'];
         $triplet_parts = ['subject', 'predicate', 'object'];
         $tokenizer = PhraseParser::getTokenizer($lang);
         foreach ($word_and_phrase_list as $word_and_phrase => $position_list) {
-            $tagged_phrase = $tokenizer->tagTokenizePartOfSpeech($word_and_phrase);
+            $tagged_phrase = 
+                $tokenizer->tagTokenizePartOfSpeech($word_and_phrase);
             $parse_tree = $tokenizer->generatePhraseParseTree($tagged_phrase);
             $triplets = self::extractTripletsParseTree($parse_tree);
-            $extracted_triplets = self::rearrangeTripletsByType($triplets, $lang);
+            $extracted_triplets = 
+                self::rearrangeTripletsByType($triplets, $lang);
 
             foreach ($triplet_types as $type) {
                 if (!empty($extracted_triplets[$type])) {
                     $triplet = $extracted_triplets[$type];
                     foreach ($triplet_parts as $part) {
-                        if(!empty($triplet[$part])) {
+                        if (!empty($triplet[$part])) {
                             $question_list[$triplet[$part]] = $position_list;
                         }
                     }
@@ -1583,7 +1585,7 @@ class Tokenizer
      *
      * @return the question marker
      */
-    public static function getQuestionMarker() 
+    public static function getQuestionMarker()
     {
         return self::$question_marker;
     }
@@ -1655,4 +1657,4 @@ class Tokenizer
             "(,\s?when[^,]*,)|(,\s?where[^,]*,)/i", "", $result);
         return $result;
     }
-}
+}
\ No newline at end of file
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index 81df13f..16b4399 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -649,7 +649,7 @@ class PhraseModel extends ParallelModel
                     $tmp_hash = (is_array($tmp_hash)) ? $tmp_hash : [$tmp_hash];
                     $test =  array_merge($tmp_hash, [L\crawlHash($word)]);
                 } else {
-                    if(in_array($word, $found_materialized_metas) &&
+                    if (in_array($word, $found_materialized_metas) &&
                         !$metas_accounted) {
                         $meta_keys[] = $tmp_hash;
                     } else {
@@ -657,7 +657,7 @@ class PhraseModel extends ParallelModel
                     }
                 }
             }
-            if(!$metas_accounted) {
+            if (!$metas_accounted) {
                 $word_keys = array_merge($word_keys, $meta_keys);
             }
             if (count($word_keys) == 0) {
@@ -810,7 +810,7 @@ class PhraseModel extends ParallelModel
         }
         $found_metas = array_unique($found_metas);
         $found_materialized_metas = array_unique($found_materialized_metas);
-        if(empty(trim($phrase_string)) && count($found_metas) == 2 
+        if (empty(trim($phrase_string)) && count($found_metas) == 2
             && (in_array("site:doc", $found_metas)
             || in_array("site:any", $found_metas))) {
             /*site:doc and site:any doesn't work with materialized metas by
@@ -1813,4 +1813,4 @@ class PhraseModel extends ParallelModel
         }
         return $group_iterator;
     }
-}
+}
\ No newline at end of file
-- 
2.10.0.windows.1