From de8c8150d5fda9ea80c94110e51321dbef959b89 Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Sun, 9 Oct 2016 16:10:42 -0700
Subject: [PATCH 1/5] Integrating the question answer patch with the latest
 version of yioop

---
 src/controllers/components/CrawlComponent.php |   15 +-
 src/executables/ArcTool.php                   |    3 +-
 src/executables/Fetcher.php                   |   31 +-
 src/library/CrawlConstants.php                |    1 +
 src/library/PhraseParser.php                  |   24 +-
 src/library/TripletExtractor.php              | 1062 +++++++++++++++++++++++++
 src/models/PhraseModel.php                    |   41 +
 src/views/SearchView.php                      |   17 +-
 src/views/elements/PageoptionsElement.php     |    5 +
 9 files changed, 1172 insertions(+), 27 deletions(-)
 create mode 100644 src/library/TripletExtractor.php

diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index f7f5f9e..1efafea 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1346,7 +1346,7 @@ class CrawlComponent extends Component implements CrawlConstants
                     PhraseParser::extractPhrasesInLists($phrase_string,
                         $lang);
                 $len = strlen($phrase_string);
-                if (PhraseParser::computeSafeSearchScore($word_lists, $len) <
+                if (PhraseParser::computeSafeSearchScore($word_lists['WORD_LIST'], $len) <
                     0.012) {
                     $meta_ids[] = "safe:true";
                     $safe = true;
@@ -1355,13 +1355,20 @@ class CrawlComponent extends Component implements CrawlConstants
                     $safe = false;
                 }
             }
-            if (!isset($word_lists)) {
-                $word_lists = [];
+            if (!isset($word_lists['WORD_LIST'])) {
+                $word_lists['WORD_LIST'] = [];
             }
+
+            if (!isset($word_lists['QUESTION_ANSWER_LIST'])) {
+                $word_lists['QUESTION_ANSWER_LIST'] = [];
+            }
+
             $data["EXTRACTED_WORDS"] = wordwrap($parent->clean(
-                print_r($word_lists, true), "string"), 75, "\n", true);;
+                print_r($word_lists, true), "string"), 75, "\n", true);
             $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(
                 print_r($meta_ids, true), "string"), 75, "\n", true);
+            $data["QUESTIONS_TRIPLET"] = wordwrap($parent->clean(
+                print_r($word_lists['QUESTION_ANSWER_LIST'], true), "string"), 75, "\n", true);
         }
         return $data;
     }
diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php
index 6b93c0b..b6049cf 100755
--- a/src/executables/ArcTool.php
+++ b/src/executables/ArcTool.php
@@ -925,9 +925,10 @@ class ArcTool implements CrawlConstants
                                 mb_substr($site[self::DESCRIPTION], 0,
                                 C\AD_HOC_TITLE_LENGTH), $site[self::LANG]);
                         }
-                        $word_lists =
+                        $triplet_lists = 
                             PhraseParser::extractPhrasesInLists($phrase_string,
                                 $lang);
+                        $word_lists = $triplet_lists['WORD_LIST'];
                         $len = strlen($phrase_string);
                         if (PhraseParser::computeSafeSearchScore($word_lists,
                             $len) < 0.012) {
diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 25bbf7f..963f982 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -44,6 +44,7 @@ use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\processors\PageProcessor;
 use seekquarry\yioop\library\UrlParser;
 use seekquarry\yioop\library\WebArchiveBundle;
+use seekquarry\yioop\library\TripletExtractor;
 
 if (php_sapi_name() != 'cli') {echo "BAD REQUEST"; exit();}
 ini_set("memory_limit", "1200M"); //so have enough memory to crawl sitemaps
@@ -558,7 +559,6 @@ class Fetcher implements CrawlConstants
         $local_archives = [""];
         while (CrawlDaemon::processHandler()) {
             $start_time = microtime(true);
-            $info = [];
             $fetcher_message_file = C\CRAWL_DIR.
                 "/schedules/{$prefix}FetcherMessages.txt";
             if (file_exists($fetcher_message_file)) {
@@ -577,8 +577,6 @@ class Fetcher implements CrawlConstants
                 if ($info[self::CRAWL_TIME] == 0) {
                     $info[self::STATUS] = self::NO_DATA_STATE;
                     $this->to_crawl = [];
-                } else {
-                    L\crawlLog("Crawl time is now " . $this->crawl_time);
                 }
             } else if ($this->crawl_type == self::ARCHIVE_CRAWL &&
                     $this->arc_type != "WebArchiveBundle" &&
@@ -657,7 +655,6 @@ class Fetcher implements CrawlConstants
                 $this->to_crawl_again = [];
                 $this->found_sites = [];
                 gc_collect_cycles();
-                
                 $this->web_archive = new WebArchiveBundle($tmp_base_name,
                     false);
                 $this->crawl_time = $info[self::CRAWL_TIME];
@@ -665,7 +662,7 @@ class Fetcher implements CrawlConstants
                 $this->sum_seen_description_length = 0;
                 $this->sum_seen_site_link_length = 0;
                 $this->num_seen_sites = 0;
-                L\crawlLog("New name: " . $this->web_archive->dir_name);
+                L\crawlLog("New name: ".$this->web_archive->dir_name);
                 L\crawlLog("Switching archive...");
                 if (!isset($info[self::ARC_DATA])) {
                     continue;
@@ -984,11 +981,6 @@ class Fetcher implements CrawlConstants
         if (isset($info[self::CRAWL_TIME])
             && ($info[self::CRAWL_TIME] != $this->crawl_time
             || $info[self::CRAWL_TIME] == 0)) {
-            if ($info[self::CRAWL_TIME] > 0) {
-                L\crawlLog("New Crawl Time Found: {$info[self::CRAWL_TIME]}");
-            } else {
-                L\crawlLog("Crawl Time Changing to 0");
-            }
             $dir = C\CRAWL_DIR."/schedules";
             $time_change = true;
             /* Zero out the crawl. If haven't done crawl before, then scheduler
@@ -1041,8 +1033,6 @@ class Fetcher implements CrawlConstants
                 "{$this->crawl_time}.txt") && file_exists(
                 "$dir/$prefix".self::fetch_batch_name.
                     "{$this->crawl_time}.txt")) {
-                L\crawlLog("Loading old batches for ".
-                    "{$this->crawl_time}.");
                 $info = unserialize(file_get_contents(
                     "$dir/$prefix".self::fetch_crawl_info.
                         "{$this->crawl_time}.txt"));
@@ -1486,6 +1476,7 @@ class Fetcher implements CrawlConstants
      */
     public function getFetchSites()
     {
+        $web_archive = $this->web_archive;
         $start_time = microtime(true);
         $seeds = [];
         $delete_indices = [];
@@ -1772,9 +1763,6 @@ class Fetcher implements CrawlConstants
                 }
                 $doc_info = $processor->handle($site[self::PAGE],
                     $site[self::URL]);
-                if (C\FETCHER_PROCESS_DELAY > 0 ) {
-                    usleep(C\FETCHER_PROCESS_DELAY);
-                }
                 if (isset($site[self::REPOSITORY_TYPE]) &&
                     $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                     $site[self::URL] = $tmp_url_store;
@@ -2705,6 +2693,7 @@ class Fetcher implements CrawlConstants
                     $this->video_sources);
             }
             $word_lists = [];
+            $triplet_lists = [];
             /*
                 self::JUST_METAS check to avoid getting sitemaps in results for
                 popular words
@@ -2737,9 +2726,10 @@ class Fetcher implements CrawlConstants
                 $lang = L\guessLocaleFromString(
                     mb_substr($site[self::DESCRIPTION], 0,
                     C\AD_HOC_TITLE_LENGTH), $site[self::LANG]);
-                $word_lists =
+                $triplet_lists =
                     PhraseParser::extractPhrasesInLists($phrase_string,
                         $lang);
+                $word_lists = $triplet_lists['WORD_LIST'];
                 $len = strlen($phrase_string);
                 if (isset($this->programming_language_extension[$lang]) ||
                     PhraseParser::computeSafeSearchScore($word_lists, $len) <
@@ -2780,6 +2770,12 @@ class Fetcher implements CrawlConstants
                 ]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG,
                 $word_lists, $meta_ids, PhraseParser::$materialized_metas,
                 true, $doc_rank);
+
+            if(isset($triplet_lists['QUESTION_ANSWER_LIST'])) {
+                $question_list = $triplet_lists['QUESTION_ANSWER_LIST'];
+                $site[self::QUESTION_TRIPLETS] = $question_list;
+                $this->found_sites[self::SEEN_URLS][$i] = $site;
+            }
             /*
                 $this->no_process_links is set when doing things like
                 mix recrawls. In this case links likely already will appear
@@ -2831,9 +2827,10 @@ class Fetcher implements CrawlConstants
                     $summary[self::LANG] = $lang;
                     $this->found_sites[self::LINK_SEEN_URLS][$part_num][] =
                         $summary;
-                    $link_word_lists =
+                    $link_lists =
                         PhraseParser::extractPhrasesInLists($link_text,
                         $lang);
+                    $link_word_lists = $link_lists['WORD_LIST'];
                     $link_meta_ids = PhraseParser::calculateLinkMetas($url,
                         $link_host, $link_text, $site_url);
                     if (!isset($this->found_sites[self::INVERTED_INDEX][
diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php
index dab8b7b..38bf0b9 100755
--- a/src/library/CrawlConstants.php
+++ b/src/library/CrawlConstants.php
@@ -231,4 +231,5 @@ interface CrawlConstants
     const CENTROID_WEIGHTED_SUMMARIZER = 'dt';
     const SCRAPER_LABEL = 'du';
     const SCRAPERS = 'dv';
+    const QUESTION_TRIPLETS = 'dw';
 }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 128f53f..8f866e3 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -33,6 +33,7 @@ namespace seekquarry\yioop\library;
 use seekquarry\yioop\configs as C;
 use seekquarry\yioop\models\LocaleModel;
 use seekquarry\yioop\library\processors\PageProcessor;
+use seekquarry\yioop\library\TripletExtractor;
 
 /**
  * For crawlHash
@@ -210,7 +211,8 @@ class PhraseParser
      */
     public static function extractPhrasesAndCount($string, $lang = null)
     {
-        $phrases = self::extractPhrasesInLists($string, $lang);
+        $triplet_list = self::extractPhrasesInLists($string, $lang);
+        $phrases = $triplet_list['WORD_LIST'];
         $phrase_counts = [];
         foreach ($phrases as $term => $positions) {
             $phrase_counts[$term] = count($positions);
@@ -232,7 +234,20 @@ class PhraseParser
         if (!isset(self::$programming_language_map[$lang])) {
             self::canonicalizePunctuatedTerms($string, $lang);
         }
-        return self::extractMaximalTermsAndFilterPhrases($string, $lang);
+        $phrase_list = [];
+        $word_lists = self::extractMaximalTermsAndFilterPhrases(
+            $string, $lang);
+        // Comment out this if-block to disable the question answering system
+        if (isset($word_lists)) {
+            $qa = TripletExtractor::storeStatementArraysAsTriplet($word_lists);
+            /* array_replace rather than array_merge: array_merge renumbers
+               integer keys, and PHP stores numeric terms such as "2016" as
+               integer keys, so merging would lose those terms. */
+            $word_lists = array_replace($word_lists, $qa['QUESTION_LIST']);
+            $phrase_list['QUESTION_ANSWER_LIST'] = $qa['QUESTION_ANSWER_LIST'];
+        }
+        $phrase_list['WORD_LIST'] = $word_lists;
+        return $phrase_list;
     }
     /**
      * This functions tries to convert acronyms, e-mail, urls, etc into
@@ -1254,8 +1269,9 @@ vaffanculo fok hoer kut lul やりまん 打っ掛け
             return 0;
         }
         if ($unsafe_terms == []) {
-            $unsafe_lists = PhraseParser::extractPhrasesInLists($unsafe_phrase,
-                "en-US");
+             $triplet_list = PhraseParser::extractPhrasesInLists($unsafe_phrase,
+                 "en-US");
+            $unsafe_lists = $triplet_list['WORD_LIST'];
             $unsafe_terms = array_keys($unsafe_lists);
         }
         $num_unsafe_terms = 0;
diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php
new file mode 100644
index 0000000..2dd8bbc
--- /dev/null
+++ b/src/library/TripletExtractor.php
@@ -0,0 +1,1062 @@
+<?php
+/**
+ * SeekQuarry/Yioop --
+ * Open Source Pure PHP Search Engine, Crawler, and Indexer
+ *
+ * Copyright (C) 2009 - 2015  Chris Pollett chris@pollett.org
+ *
+ * LICENSE:
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * @author Chris Pollett chris@pollett.org
+ * @license http://www.gnu.org/licenses/ GPL3
+ * @link http://www.seekquarry.com/
+ * @copyright 2009 - 2015
+ * @filesource
+ */
+
+namespace seekquarry\yioop\library;
+
+use seekquarry\yioop\configs as C;
+use seekquarry\yioop\library as L;
+
+/**
+ * Part-of-speech tagging and phrase-tree helpers for question answering
+ * @author Nirav Patel
+ * @package seekquarry\yioop\library
+ */
+class TripletExtractor
+{
+
+    /**
+     * Unique identifier used to mark the question word
+     * @var string
+     */
+    public static $question_word = "qqque";
+
+    /**
+     * Takes a phrase and tags each term in it with its part of speech.
+     * So each term in the original phrase gets mapped to term~part_of_speech
+     * This tagger is based on a Brill tagger. It makes use of a lexicon
+     * consisting of words from the Brown corpus together with a list of
+     * part of speech tags that each word had in the Brown Corpus. These are
+     * used to get an initial part of speech (if a word is not present then
+     * we assume it is a noun). From this a fixed set of rules is used to
+     * modify the initial tag if necessary.
+     *
+     * @param string $phrase text to add parts of speech tags to
+     * @return string $tagged_phrase phrase where each term has ~part_of_speech
+     *     appended
+     */
+    public static function tagPartsOfSpeechPhrase($phrase)
+    {
+        // tagTokenizePartOfSpeech() does its own tokenizing of $phrase
+        $tagged_tokens = self::tagTokenizePartOfSpeech($phrase);
+        $tagged_phrase = self::taggedPartOfSpeechTokensToString(
+            $tagged_tokens);
+        return $tagged_phrase;
+    }
+
+    /**
+     * Splits input text into terms and outputs an array with one element
+     * per term holding the term token and its part of speech tag. Tokens
+     * not in lexicon.txt.gz (opened by relative path) start out tagged NN.
+     *
+     * @param string $text string to tag and tokenize
+     * @return array of pairs of the form ("token" => token_for_term,
+     *     "tag" => part_of_speech_tag_for_term), one for each token in $text
+     */
+    public static function tagTokenizePartOfSpeech($text)
+    {
+        static $lex_string = null;
+        if (!$lex_string) {
+            $lex_string = gzdecode(file_get_contents("lexicon.txt.gz"));
+        }
+        preg_match_all("/[\w\d]+/", $text, $matches);
+        $tokens = $matches[0];
+        $nouns = array('NN', 'NNS', 'NNP');
+        $verbs = array('VBD', 'VBP', 'VB');
+        $result = array();
+        $previous = array('token' => -1, 'tag' => -1);
+        $previous_token = -1;
+        sort($tokens);
+        $dictionary = array();
+        /*
+            Lookups start at $cur_pos, which moves forward past each matched
+            lexicon line, so tokens must arrive in lexicon order.
+            NOTE(review): sort() ran before strtolower() - confirm order matches
+         */
+        $cur_pos = 0;
+        foreach ($tokens as $token) {
+            $token = strtolower(rtrim($token, "."));
+            $token_pos = stripos($lex_string, "\n" . $token . " ", $cur_pos);
+            if ($token_pos !== false) {
+                $token_pos++;
+                $cur_pos = stripos($lex_string, "\n", $token_pos);
+                $line = trim(substr($lex_string, $token_pos,
+                    $cur_pos - $token_pos));
+                $tag_list = explode(' ', $line);
+                $dictionary[strtolower(rtrim($token, "."))] =
+                    array_slice($tag_list, 1);
+                $cur_pos++;
+            }
+        }
+        // now using our dictionary we tag
+        $i = 0;
+        $tag_list = array();
+        foreach ($matches[0] as $token) {
+            $prev_tag_list = $tag_list;
+            $tag_list = array();
+            // default to a common noun
+            $current = array('token' => $token, 'tag' => 'NN');
+            // lower-case for lookup (the token regex never captures a '.')
+            $token = strtolower(rtrim($token, "."));
+            if (isset($dictionary[$token])) {
+                $tag_list = $dictionary[$token];
+                $current['tag'] = $tag_list[0];
+            }
+            // Converts verbs after a determiner (DT) to nouns
+            if ($previous['tag'] == 'DT' && in_array($current['tag'], $verbs)) {
+                $current['tag'] = 'NN';
+            }
+            // Noun to number if '.' appears. NOTE(review): regex excludes '.'
+            if ($current['tag'][0] == 'N' && strpos($token, '.') !== false) {
+                $current['tag'] = 'CD';
+            }
+            $ends_with = substr($token, -2);
+            switch ($ends_with) {
+                case 'ed':
+                    // Convert noun to past participle if ends with 'ed'
+                    if ($current['tag'][0] == 'N') {
+                        $current['tag'] = 'VBN';
+                    }
+                    break;
+                case 'ly':
+                    // Anything that ends 'ly' is an adverb
+                    $current['tag'] = 'RB';
+                    break;
+                case 'al':
+                    // Common noun to adjective if it ends with al
+                    if (in_array($current['tag'], $nouns)) {
+                        $current['tag'] = 'JJ';
+                    }
+                    break;
+            }
+            // Noun to verb if the word before is 'would'
+            if ($current['tag'] == 'NN' && $previous_token == 'would') {
+                $current['tag'] = 'VB';
+            }
+            // Convert common noun to gerund
+            if (in_array($current['tag'], $nouns) &&
+                substr($token, -3) == 'ing'
+            ) {
+                $current['tag'] = 'VBG';
+            }
+            // noun followed by adjective: swap the tags if previous can be JJ
+            if (in_array($previous['tag'], $nouns) &&
+                $current['tag'] == 'JJ' && in_array('JJ', $prev_tag_list)
+            ) {
+                $result[$i - 1]['tag'] = 'JJ';
+                $current['tag'] = 'NN';
+            }
+            /* If we get noun noun, and the second can be a verb,
+             * convert to verb; if noun noun and previous could be an
+             * adjective convert to adjective
+             */
+            if (in_array($previous['tag'], $nouns) &&
+                in_array($current['tag'], $nouns)
+            ) {
+                if (in_array('VBN', $tag_list)) {
+                    $current['tag'] = 'VBN';
+                } else {
+                    if (in_array('VBZ', $tag_list)) {
+                        $current['tag'] = 'VBZ';
+                    } else {
+                        if (in_array('JJ', $prev_tag_list)) {
+                            $result[$i - 1]['tag'] = 'JJ';
+                        }
+                    }
+                }
+            }
+            $result[$i] = $current;
+            $i++;
+            $previous = $current;
+            $previous_token = $token;
+        }
+        return $result;
+    }
+
+    /**
+     * Joins (token, tag) pairs into one phrase of token~tag terms; detailed
+     * tags are first collapsed to NN, VB, AJ or AV when a mapping exists.
+     *
+     * @param array $tagged_tokens array of pairs as might come from tagTokenize
+     * @return string $tagged_phrase a phrase with terms in the format token~tag
+     */
+    public static function taggedPartOfSpeechTokensToString($tagged_tokens)
+    {
+        // detailed tag => coarse class; unlisted tags are kept unchanged
+        $coarse_tags = [
+            "NN" => "NN",
+            "NNS" => "NN",
+            "NNP" => "NN",
+            "NNPS" => "NN",
+            "PRP" => "NN",
+            'PRP$' => "NN",
+            "WP" => "NN",
+            "VB" => "VB",
+            "VBD" => "VB",
+            "VBN" => "VB",
+            "VBP" => "VB",
+            "VBZ" => "VB",
+            "JJ" => "AJ",
+            "JJR" => "AJ",
+            "JJS" => "AJ",
+            "RB" => "AV",
+            "RBR" => "AV",
+            "RBS" => "AV",
+            "WRB" => "AV"
+        ];
+        $terms = "";
+        foreach ($tagged_tokens as $pair) {
+            $tag = trim($pair['tag']);
+            $terms .= $pair['token'] . "~" .
+                (isset($coarse_tags[$tag]) ? $coarse_tags[$tag] : $tag) . " ";
+        }
+        return $terms;
+    }
+    /**
+     * Tags each word of a statement with a part of speech using the en_US
+     * lexicon file (cached after first load) plus Brill-style fix-up rules
+     *
+     * @param string $text any statement
+     * @return array list of ['token' => word, 'tag' => POS tag] pairs
+     */
+    public static function partOfSpeechTagger_Brill($text)
+    {
+        static $dict = null;
+        $lexicon = C\LOCALE_DIR . "/en_US/resources/lexicon.txt";
+        if (!$dict) {
+            $fh = fopen($lexicon, 'r');
+            // an unreadable lexicon leaves $dict empty; defaults still apply
+            if ($fh !== false) {
+                while ($line = fgets($fh)) {
+                    $tags = explode(' ', $line);
+                    // trim: the last tag of a line still ends in a newline
+                    $dict[strtolower(array_shift($tags))] =
+                        array_map('trim', $tags);
+                }
+                fclose($fh);
+            }
+        }
+        preg_match_all("/[\w\d\.]+/", $text, $matches);
+        $nouns = array('NN', 'NNS');
+        $return = array();
+        $i = 0;
+        foreach ($matches[0] as $token) {
+            // default to a common noun
+            $return[$i] = array('token' => $token, 'tag' => 'NN');
+            // remove trailing full stops
+            $token = rtrim($token, '.');
+            // get from dict if set
+            if (isset($dict[strtolower($token)])) {
+                $return[$i]['tag'] = trim($dict[strtolower($token)][0]);
+            }
+            // Converts verbs after a determiner to nouns
+            if ($i > 0) {
+                if ($return[$i - 1]['tag'] == 'DT' &&
+                    in_array($return[$i]['tag'],
+                        array('VBD', 'VBP', 'VB'))
+                ) {
+                    $return[$i]['tag'] = 'NN';
+                }
+            }
+            // Convert noun to number if . appears
+            if ($return[$i]['tag'][0] == 'N' && strpos($token, '.') !== false) {
+                $return[$i]['tag'] = 'CD';
+            }
+            // Convert noun to past participle if ends with 'ed'
+            if ($return[$i]['tag'][0] == 'N' && substr($token, -2) == 'ed') {
+                $return[$i]['tag'] = 'VBN';
+            }
+            // Anything that ends 'ly' is an adverb
+            if (substr($token, -2) == 'ly') {
+                $return[$i]['tag'] = 'RB';
+            }
+            // Common noun to adjective if it ends with al
+            if (in_array($return[$i]['tag'], $nouns)
+                && substr($token, -2) == 'al'
+            ) {
+                $return[$i]['tag'] = 'JJ';
+            }
+            // Noun to verb if the word before is 'would'
+            if ($i > 0) {
+                if ($return[$i]['tag'] == 'NN'
+                    && strtolower($return[$i - 1]['token']) == 'would'
+                ) {
+                    $return[$i]['tag'] = 'VB';
+                }
+            }
+            // Convert noun to plural if it ends with an s
+            if ($return[$i]['tag'] == 'NN' && substr($token, -1) == 's') {
+                $return[$i]['tag'] = 'NNS';
+            }
+            // Convert common noun to gerund
+            if (in_array($return[$i]['tag'], $nouns)
+                && substr($token, -3) == 'ing'
+            ) {
+                $return[$i]['tag'] = 'VBG';
+            }
+            // noun noun where the second can be a verb: convert it to a verb
+            if ($i > 0) {
+                if (in_array($return[$i]['tag'], $nouns)
+                    && in_array($return[$i - 1]['tag'], $nouns)
+                    && isset($dict[strtolower($token)])
+                ) {
+                    if (in_array('VBN', $dict[strtolower($token)])) {
+                        $return[$i]['tag'] = 'VBN';
+                    } else if (in_array('VBZ', $dict[strtolower($token)])) {
+                        $return[$i]['tag'] = 'VBZ';
+                    }
+                }
+            }
+            $i++;
+        }
+        return $return;
+    }
+
+    /**
+     * Builds a parse tree for a tagged statement: a noun phrase is read from
+     * the first token on, then a verb phrase from where the noun phrase ended
+     *
+     * @param array $tagger_array tagged array from part of speech tagger
+     * @return array with cur_node, NP and VP; only cur_node if no VP was found
+     */
+    public static function generateParseTreeUsingRDP($tagger_array)
+    {
+        $start = ["cur_node" => 0];
+        $noun_part = self::extractNPUsingRDP($tagger_array, $start);
+        $after_np = ["cur_node" => $noun_part['cur_node']];
+        $verb_part = self::extractVPUsingRDP($tagger_array, $after_np);
+        // an unchanged result means no verb phrase was found
+        if ($after_np == $verb_part) {
+            return $after_np;
+        }
+        // key order of the result: cur_node, NP, VP
+        return [
+            'cur_node' => $verb_part['cur_node'],
+            'NP' => $noun_part['NP'],
+            'VP' => $verb_part['VP'],
+        ];
+    }
+
+    /**
+     * Parses a noun phrase (determiner, adjectives, nouns) that starts at
+     * $tree['cur_node']; NP is "" and $tree is returned if nothing matches
+     *
+     * @param array $tagger_array part of speech tagged tokens
+     * @param array $tree current tree; only its cur_node is read on a match
+     * @return array tree with cur_node moved past the phrase and an NP node
+     */
+    public static function extractNPUsingRDP($tagger_array, $tree)
+    {
+        $start = $tree['cur_node'];
+        $det = self::extractDTUsingRDP($tagger_array,
+            ['cur_node' => $start]);
+        $adj = self::extractJJUsingRDP($tagger_array,
+            ['cur_node' => $det['cur_node']]);
+        $noun = self::extractNNUsingRDP($tagger_array,
+            ['cur_node' => $adj['cur_node']]);
+        if ($start == $noun['cur_node']) {
+            // nothing was consumed: hand back the input with an empty NP
+            $tree['NP'] = "";
+            return $tree;
+        }
+        $end = $noun['cur_node'];
+        unset($det['cur_node'], $adj['cur_node'], $noun['cur_node']);
+        return [
+            'cur_node' => $end,
+            'NP' => [
+                'DT' => $det,
+                'JJ' => $adj,
+                'NN' => $noun,
+            ],
+        ];
+    }
+
+    /**
+     * Consumes the token at $tree['cur_node'] when it is tagged DT, storing
+     * it under 'DT' and advancing cur_node; otherwise $tree is unchanged
+     *
+     * @param array $tagger_array part of speech tagged tokens
+     * @param array $tree current tree, must contain cur_node
+     * @return array tree with the determiner added when one was found
+     */
+    public static function extractDTUsingRDP($tagger_array, $tree)
+    {
+        $index = $tree['cur_node'];
+        $is_determiner = isset($tagger_array[$index]['tag']) &&
+            "DT" == $tagger_array[$index]['tag'];
+        if ($is_determiner) {
+            $tree['DT'] = $tagger_array[$index]['token'];
+            $tree['cur_node'] = $index + 1;
+        }
+        return $tree;
+    }
+   /**
+     * Collects the run of adjectives (JJ, JJR, JJS) that starts at
+     * $tree['cur_node'] into the 'JJ' entry and moves cur_node past it
+     *
+     * @param array $tagger_array part of speech tagged tokens
+     * @param array $tree current tree, must contain cur_node
+     * @return array tree with a JJ entry when at least one adjective matched
+     */
+    public static function extractJJUsingRDP($tagger_array, $tree)
+    {
+        $adjective_tags = ["JJ", "JJR", "JJS"];
+        $adjectives = "";
+        for ($i = $tree['cur_node']; isset($tagger_array[$i]['tag']); $i++) {
+            if (!in_array($tagger_array[$i]['tag'], $adjective_tags)) {
+                break;
+            }
+            // every adjective is prefixed with a blank, the first one too
+            $adjectives .= " " . $tagger_array[$i]['token'];
+        }
+        $tree['cur_node'] = $i;
+        if ("" != $adjectives) {
+            $tree["JJ"] = $adjectives;
+        }
+        return $tree;
+    }
+   /**
+     * Collects the noun part of a noun phrase starting at $tree['cur_node']
+     * and moves cur_node past everything that was consumed. In order, each
+     * piece being optional, the following entries are added to the tree:
+     *   NN   - a run of tokens tagged NN, NNS, NNP, NNPS or PRP
+     *   IN   - a single token tagged IN (preposition)
+     *   DT_1 - a single token tagged DT
+     *   JJ_1 - a run of tokens tagged JJ, JJR or JJS
+     *   NNP  - a second run of tokens tagged NN, NNS, NNP, NNPS or PRP
+     * Runs are stored as one string in which every token is preceded by a
+     * blank.
+     *
+     * @param array $tagger_array part of speech tagged tokens
+     * @param array $tree current tree, must contain cur_node
+     * @return array tree with the entries above added for whatever matched
+     */
+    public static function extractNNUsingRDP($tagger_array, $tree)
+    {
+        $noun_tags = ["NN", "NNS", "NNP", "NNPS", "PRP"];
+        $adjective_tags = ["JJ", "JJR", "JJS"];
+        // $at walks the tagged tokens; it is written back to cur_node below
+        $at = $tree['cur_node'];
+        // first noun group: consecutive nouns are combined into one string
+        $first_nouns = "";
+        while (isset($tagger_array[$at]['tag']) &&
+            in_array($tagger_array[$at]['tag'], $noun_tags)) {
+            $first_nouns .= " " . $tagger_array[$at]['token'];
+            $at++;
+        }
+        if ("" != $first_nouns) {
+            $tree["NN"] = $first_nouns;
+        }
+        // a preposition, if present, is stored under its own tag
+        if (isset($tagger_array[$at]['tag']) &&
+            "IN" == $tagger_array[$at]['tag']) {
+            $preposition_tag = $tagger_array[$at]['tag'];
+            $tree[$preposition_tag] = $tagger_array[$at]['token'];
+            $at++;
+        }
+        // a determiner is accepted whether or not a preposition was seen
+        if (isset($tagger_array[$at]['tag']) &&
+            "DT" == $tagger_array[$at]['tag']) {
+            $tree['DT_1'] = $tagger_array[$at]['token'];
+            $at++;
+        }
+        // adjectives in front of the second noun group
+        $second_adjectives = "";
+        while (isset($tagger_array[$at]['tag']) &&
+            in_array($tagger_array[$at]['tag'], $adjective_tags)) {
+            $second_adjectives .= " " . $tagger_array[$at]['token'];
+            $at++;
+        }
+        if ("" != $second_adjectives) {
+            $tree["JJ_1"] = $second_adjectives;
+        }
+        // second noun group: kept under NNP whatever the actual noun tags
+        $second_nouns = "";
+        while (isset($tagger_array[$at]['tag']) &&
+            in_array($tagger_array[$at]['tag'], $noun_tags)) {
+            $second_nouns .= " " . $tagger_array[$at]['token'];
+            $at++;
+        }
+        if ("" != $second_nouns) {
+            $tree["NNP"] = $second_nouns;
+        }
+        $tree['cur_node'] = $at;
+        return $tree;
+    }
+
+    /**
+     * @param $tagger_array POS tagged array
+     * @param $tree current tree
+     * @return mixed VP added tree
+     */
+    public static function extractVPUsingRDP($tagger_array, $tree)
+    {
+        $VP = $tree['cur_node'];
+        $tree_vp = ['cur_node' => $VP];
+        $tree_vb = ['cur_node' => $tree_vp['cur_node']];
+        $tree_vb = TripletExtractor::extractVBUsingRDP($tagger_array, $tree_vb);
+        if ($VP == $tree_vb['cur_node']) {
+            return $tree;
+        }
+        $tree_np = ['cur_node' => $tree_vb['cur_node']];
+        $tree_np = TripletExtractor::extractNPUsingRDP($tagger_array, $tree_np);
+        if ($VP == $tree_np['cur_node']) {
+        } else {
+            $cur_node = $tree_np['cur_node'];
+            unset($tree_vb['cur_node']);
+            unset($tree_np['cur_node']);
+            $tree_new_sub['VB'] = $tree_vb;
+            $tree_new_sub['NP'] = $tree_np['NP'];
+            $tree_new['cur_node'] = $cur_node;
+            $tree_new['VP'] = $tree_new_sub;
+            return $tree_new;
+        }
+        return $tree;
+    }
+
+    /**
+     * @param $tagger_array POS tagged tree
+     * @param $tree current tree
+     * @return mixed VB added tree
+     */
+    public static function extractVBUsingRDP($tagger_array, $tree)
+    {
+        while (isset($tagger_array[$tree['cur_node']]['tag']) &&
+            ("VB" != $tagger_array[$tree['cur_node']]['tag'] &&
+            "VBD" != $tagger_array[$tree['cur_node']]['tag'] &&
+            "VBG" != $tagger_array[$tree['cur_node']]['tag'] &&
+            "VBN" != $tagger_array[$tree['cur_node']]['tag'] &&
+            "VBP" != $tagger_array[$tree['cur_node']]['tag'] &&
+            "VBZ" != $tagger_array[$tree['cur_node']]['tag'])) {
+            $tree['cur_node'] = $tree['cur_node'] + 1;
+        }
+        $verb_string = "";
+        while (isset($tagger_array[$tree['cur_node']]['tag']) &&
+            ("VB" == $tagger_array[$tree['cur_node']]['tag'] ||
+            "VBD" == $tagger_array[$tree['cur_node']]['tag'] ||
+            "VBG" == $tagger_array[$tree['cur_node']]['tag'] ||
+            "VBN" == $tagger_array[$tree['cur_node']]['tag'] ||
+            "VBP" == $tagger_array[$tree['cur_node']]['tag'] ||
+            "VBZ" == $tagger_array[$tree['cur_node']]['tag'])) {
+            $verb_string .= " " . $tagger_array[$tree['cur_node']]['token'];
+            $tree['cur_node'] = $tree['cur_node'] + 1;
+        }
+        if ("" != $verb_string) {
+            $tag = "VB";
+            $tree[$tag] = $verb_string;
+        }
+        return $tree;
+   }
+   /**
+    * @param $tree fully generated tree
+    * @return array triplet array
+    */
+   public static function extractTriplet($tree)
+   {
+       $triplet = array();
+       $triplet['subject'] = TripletExtractor::extractSubjectFromTree($tree);
+       $triplet['predicate'] =
+           TripletExtractor::extractPredicateFromTree($tree);
+       $triplet['object'] = TripletExtractor::extractObjectFromTree($tree);
+       return $triplet;
+    }
+
+    /**
+     * @param $triplet_tree any statement
+     * @return array processed triplet
+     */
+    public static function processTripletForStorage($triplet_tree)
+    {
+        $processed_triplet = array();
+        $processed_triplet['RAW'] =
+            TripletExtractor::getRawTripletForStorage($triplet_tree);
+        $processed_triplet['FEATURED'] =
+            TripletExtractor::getFeaturedTripletForStorage($triplet_tree);
+        return $processed_triplet;
+    }
+
+    /**
+     * @param $triplet_tree triplet array
+     * @return array raw triplet array
+     */
+    public static function getRawTripletForStorage($triplet_tree)
+    {
+        $raw_triplet = array();
+        $question_answer_triplet = array();
+        if (isset($triplet_tree['subject']['RAW'])
+            && isset($triplet_tree['predicate']['RAW'])
+            && isset($triplet_tree['object']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['RAW'])
+        ) {
+
+            $SUBJECT = trim($triplet_tree['subject']['RAW']);
+            $PREDICATE = trim($triplet_tree['predicate']['RAW']);
+            $OBJECT = trim($triplet_tree['object']['RAW']);
+
+            $raw_triplet['SUBJECT'] =
+                TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT;
+            $raw_triplet['PREDICATE'] =
+                $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT;
+            $raw_triplet['OBJECT'] =
+                $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word;
+
+            $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT;
+            $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE;
+            $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT;
+            $raw_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet;
+        }
+        return $raw_triplet;
+    }
+
+    /**
+     * @param $triplet_tree triplet array
+     * @return array featured triplet array
+     */
+    public static function getFeaturedTripletForStorage($triplet_tree)
+    {
+        $featured_triplet = array();
+        $question_answer_triplet = array();
+        if (isset($triplet_tree['subject']['FEATURED'])
+            && isset($triplet_tree['predicate']['FEATURED'])
+            && isset($triplet_tree['object']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['FEATURED'])
+        ) {
+            $SUBJECT = trim($triplet_tree['subject']['FEATURED']);
+            $PREDICATE = trim($triplet_tree['predicate']['FEATURED']);
+            $OBJECT = trim($triplet_tree['object']['FEATURED']);
+
+            $featured_triplet['SUBJECT'] =
+                TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT;
+            $featured_triplet['PREDICATE'] =
+                $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT;
+            $featured_triplet['OBJECT'] =
+                $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word;
+
+            $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT;
+            $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE;
+            $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT;
+
+            $featured_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet;
+        }
+        return $featured_triplet;
+    }
+
+    /**
+     * @param $string any string
+     * @return bool true if null or empty string
+     */
+    public static function IsNullOrEmptyString($string)
+    {
+        return (!isset($string) || trim($string) === '');
+    }
+
+    /**
+     * @param $tree generated tree
+     * @return array subject array
+     */
+    public static function extractSubjectFromTree($tree)
+    {
+        $subject = array();
+        if (isset($tree['NP']) && $tree['NP'] != null) {
+            $tree_np = $tree['NP'];
+            $value = TripletExtractor::extractFirstNounFromNPTree($tree_np);
+            $subject['RAW'] = $value;
+            $featured_subject = "";
+            $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_np));
+            foreach ($it as $v) {
+                $featured_subject .= $v . " ";
+            }
+            $subject['FEATURED'] = $featured_subject;
+        } else {
+            $subject['RAW'] = "";
+            $subject['FEATURED'] = "";
+        }
+        return $subject;
+    }
+
+    /**
+     * @param $tree generated tree
+     * @return array predicate array
+     */
+    public static function extractPredicateFromTree($tree)
+    {
+        $predicate = array();
+        if (isset($tree['VP']) && $tree['VP'] != null) {
+            $tree_vp = $tree['VP'];
+            $value = TripletExtractor::extractDeepestVerbFromVBTree($tree_vp);
+            $predicate['RAW'] = $value;
+            $featured_predicate = "";
+            if (isset($tree_vp['VB']) && $tree_vp['VB'] != null) {
+                $tree_vb = $tree_vp['VB'];
+                $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_vb));
+                foreach ($it as $v) {
+                    $featured_predicate .= $v . " ";
+                }
+                $predicate['FEATURED'] = $featured_predicate;
+            }
+        } else {
+            $predicate['RAW'] = "";
+            $predicate['FEATURED'] = "";
+        }
+        return $predicate;
+    }
+
+    /**
+     * @param $tree generated tree
+     * @return array object array
+     */
+    public static function extractObjectFromTree($tree)
+    {
+        $object = array();
+        if (isset($tree['VP']) && $tree['VP'] != null) {
+            $tree_vp = $tree['VP'];
+            if (isset($tree_vp['NP']) && $tree_vp['NP'] != null) {
+                $nb = $tree_vp['NP'];
+                $value = TripletExtractor::extractFirstNounFromNPTree($nb);
+                $object['RAW'] = $value;
+                $featured_object = "";
+                $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($nb));
+                foreach ($it as $v) {
+                    $featured_object .= $v . " ";
+                }
+                $object['FEATURED'] = $featured_object;
+            } else {
+                $object['RAW'] = "";
+                $object['FEATURED'] = "";
+            }
+        } else {
+            $object['RAW'] = "";
+            $object['FEATURED'] = "";
+        }
+        return $object;
+    }
+
+    /**
+     * @param $tree_np noun phrase subtree
+     * @return string first noun
+     */
+    public static function extractFirstNounFromNPTree($tree_np)
+    {
+        if (isset($tree_np['NN']) && $tree_np['NN'] != null) {
+            $nn = $tree_np['NN'];
+            if (isset($nn['NN']) && $nn['NN'] != null) {
+                $nn = $nn['NN'];
+                return $nn;
+            }
+        }
+        return "";
+    }
+
+    /**
+     * @param $tree_vp verb phrase subtree
+     * @return string deepest verb
+     */
+    public static function extractDeepestVerbFromVBTree($tree_vp)
+    {
+        /* isset() on the full path covers both levels at once, so a VB
+           subtree without its own 'VB' entry (or a non-array VB node)
+           yields "" instead of raising an undefined-index notice. */
+        if (isset($tree_vp['VB']['VB']) && $tree_vp['VB']['VB'] != null) {
+            $vb = $tree_vp['VB']['VB'];
+            return $vb;
+        }
+        return "";
+    }
+
+    /**
+     * @param $tree generated tree
+     * @return array attributes array
+     */
+    public static function extractAttributes($tree)
+    {
+        $attribute_map = array();
+        if (isset($tree['JJ']) && count($tree['JJ']) > 0) {
+            $attribute_map['JJ'] = $tree['JJ']['JJ'];
+        }
+
+        if (isset($tree['NN']) && count($tree['NN']) > 0) {
+
+            $nn_tree = $tree['NN'];
+
+            if (isset($nn_tree['IN']) && count($nn_tree['IN']) > 0) {
+                $attribute_map['IN'] = $nn_tree['IN'];
+            }
+
+            if (isset($nn_tree['JJ_1']) && count($nn_tree['JJ_1']) > 0) {
+                $attribute_map['JJ_1'] = $nn_tree['JJ_1'];
+            }
+
+            if (isset($nn_tree['NNP']) && count($nn_tree['NNP']) > 0) {
+                $attribute_map['NNP'] = $nn_tree['NNP'];
+            }
+        }
+        return $attribute_map;
+    }
+
+    /**
+     * @param $statement any statement
+     * @return array processed triplet
+     */
+    public static function storeStatementAsTriplet($statement)
+    {
+        try {
+            $tagged_statement =
+                TripletExtractor::partOfSpeechTagger_Brill($statement);
+            $statement_tree =
+                TripletExtractor::generateParseTreeUsingRDP($tagged_statement);
+            $triplet_tree = TripletExtractor::extractTriplet($statement_tree);
+            return TripletExtractor::processTripletForStorage($triplet_tree);
+        } catch (\Exception $e) {
+
+        }
+    }
+
+    /**
+     * @param $statement_array array of statements
+     * @return array list of triplets
+     */
+    public static function storeStatementArraysAsTriplet($statement_array)
+    {
+        $triplets_list = array();
+        $question_list = array();
+        $question_answer_list = array();
+        foreach ($statement_array as $key => $value) {
+            try {
+                if (str_word_count($key) >= 3) {
+                    $extracted_triplet = TripletExtractor::storeStatementAsTriplet($key);
+
+                    if (isset($extracted_triplet['RAW']) &&
+                        sizeof($extracted_triplet['RAW']) > 0) {
+                        $question_list[$extracted_triplet['RAW']['SUBJECT']] = $value;
+                        $question_list[$extracted_triplet['RAW']['PREDICATE']] = $value;
+                        $question_list[$extracted_triplet['RAW']['OBJECT']] = $value;
+                        $question_answer_list = array_merge($question_answer_list,
+                            $extracted_triplet['RAW']['QUESTION_ANSWER_LIST']);
+                    }
+
+                    if (isset($extracted_triplet['FEATURED']) &&
+                        sizeof($extracted_triplet['FEATURED']) > 0) {
+                        $question_list[$extracted_triplet['FEATURED']['SUBJECT']] = $value;
+                        $question_list[$extracted_triplet['FEATURED']['PREDICATE']] = $value;
+                        $question_list[$extracted_triplet['FEATURED']['OBJECT']] = $value;
+                        $question_answer_list = array_merge($question_answer_list,
+                            $extracted_triplet['FEATURED']['QUESTION_ANSWER_LIST']);
+                    }
+                }
+            } catch (\Exception $e) {
+                echo 'Caught exception: ', $e->getMessage(), "\n";
+                continue;
+            }
+        }
+        $triplets_list['QUESTION_LIST'] = $question_list;
+        $triplets_list['QUESTION_ANSWER_LIST'] = $question_answer_list;
+        return $triplets_list;
+    }
+
+    /**
+     * Takes any question started with WH question and returns the
+     * triplet from the question
+     *
+     * @param $question_string question string
+     * @return array question triplet
+     */
+    public static function questionParser($question_string)
+    {
+        $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill(
+            $question_string);
+        $index = 0;
+        $generated_question_array = array();
+        if (isset($question_string_tagged[$index]) &&
+            ("WRB" == trim($question_string_tagged[$index]['tag']) ||
+                "WP" == trim($question_string_tagged[$index]['tag']))
+        ) {
+            if ("WHO" == strtoupper(
+                    trim($question_string_tagged[$index]['token']))) {
+                $index = $index + 1;
+                $generated_question_array =
+                    TripletExtractor::parseWHOQuestion(
+                        $question_string_tagged, $index);
+            } else {
+                if ("WHERE" == strtoupper(
+                        trim($question_string_tagged[$index]['token'])) ||
+                    "WHEN" == strtoupper(
+                        trim($question_string_tagged[$index]['token'])) ||
+                    "WHAT" == strtoupper(
+                        trim($question_string_tagged[$index]['token']))
+                ) {
+                    $index = $index + 1;
+                    $generated_question_array =
+                        TripletExtractor::parseWHPlusQuestion_New(
+                            $question_string_tagged,
+                        $index);
+                }
+            }
+        }
+        return $generated_question_array;
+    }
+
+    /**
+     * Takes tagged question string starts with Who
+     * and returns question triplet from the question string
+     *
+     * @param $question_string_tagged tagged question statement
+     * @param $index current index in statement
+     * @return array parsed triplet
+     */
+    public static function parseWHOQuestion($question_string_tagged, $index)
+    {
+        $generated_question_array = array();
+        $tree = ["cur_node" => $index];
+        $tree['NP'] = "WHO";
+        $triplet = array();
+        $tree_vp = TripletExtractor::extractVPUsingRDP(
+            $question_string_tagged, $tree);
+        $triplet['predicate'] = TripletExtractor::extractPredicateFromTree(
+            $tree_vp);
+        $triplet['object'] = TripletExtractor::extractObjectFromTree(
+            $tree_vp);
+        if (isset($triplet['object']['RAW'])
+            && isset($triplet['predicate']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['object']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['predicate']['RAW'])
+        ) {
+            $generated_question_array['RAW']['1'] =
+                trim($triplet['object']['RAW']) .
+                " " . trim($triplet['predicate']['RAW']) .
+                " " . trim(TripletExtractor::$question_word);
+            $generated_question_array['RAW']['2'] =
+                trim(TripletExtractor::$question_word) .
+                " " . trim($triplet['predicate']['RAW']) .
+                " " . trim($triplet['object']['RAW']);
+        }
+        if (isset($triplet['object']['FEATURED'])
+            && isset($triplet['predicate']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['object']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['predicate']['FEATURED'])
+        ) {
+            $generated_question_array['FEATURED']['1'] =
+                trim($triplet['object']['FEATURED']) .
+                " " . trim($triplet['predicate']['FEATURED']) .
+                " " . trim(TripletExtractor::$question_word);
+            $generated_question_array['FEATURED']['2'] =
+                trim(TripletExtractor::$question_word) .
+                " " . trim($triplet['predicate']['FEATURED']) .
+                " " . trim($triplet['object']['FEATURED']);
+        }
+        return $generated_question_array;
+    }
+
+    /**
+     * Takes tagged question string starts with Wh+ except Who
+     * and returns question triplet from the question string
+     *
+     * @param $question_string_tagged tagged question statement
+     * @param $index current index in statement
+     * @return array parsed triplet
+     */
+    public static function parseWHPlusQuestion($question_string_tagged, $index)
+    {
+        $generated_question_array = array();
+        $aux_verb = "";
+        while (isset($question_string_tagged[$index]) &&
+            ("VB" == trim($question_string_tagged[$index]['tag']) ||
+                "VBD" == trim($question_string_tagged[$index]['tag']) ||
+                "VBG" == trim($question_string_tagged[$index]['tag']) ||
+                "VBN" == trim($question_string_tagged[$index]['tag']) ||
+                "VBP" == trim($question_string_tagged[$index]['tag']) ||
+                "VBZ" == trim($question_string_tagged[$index]['tag']))) {
+            $aux_verb .= " " . trim($question_string_tagged[$index]['token']);
+            $index = $index + 1;
+        }
+        $tree = ["cur_node" => $index];
+        $tree['NP'] = "WHPlus";
+        $triplet = array();
+        $tree_np = TripletExtractor::extractNPUsingRDP(
+            $question_string_tagged, $tree);
+        $triplet['subject'] = TripletExtractor::extractSubjectFromTree(
+            $tree_np);
+        $tree_vp = TripletExtractor::extractVPUsingRDP(
+            $question_string_tagged, $tree);
+        $triplet['predicate'] = TripletExtractor::extractPredicateFromTree(
+            $tree_vp);
+        $triplet['object'] = TripletExtractor::extractObjectFromTree(
+            $tree_vp);
+        if (isset($aux_verb)
+            && !TripletExtractor::IsNullOrEmptyString($aux_verb)
+        ) {
+            $triplet['predicate']['RAW'] = trim($aux_verb) .
+                " " . $triplet['predicate']['RAW'];
+            if (!isset($triplet['predicate']['FEATURED'])) {
+                $triplet['predicate']['FEATURED'] = "";
+            }
+            $triplet['predicate']['FEATURED'] = trim($aux_verb) .
+                " " . $triplet['predicate']['FEATURED'];
+        }
+        if (isset($triplet['subject']['RAW'])
+            && isset($triplet['predicate']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['subject']['RAW'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['predicate']['RAW'])
+        ) {
+            $generated_question_array['RAW']['1'] =
+                trim($triplet['subject']['RAW']) .
+                " " . trim($triplet['predicate']['RAW']) .
+                " " . trim(TripletExtractor::$question_word);
+            $generated_question_array['RAW']['2'] =
+                trim(TripletExtractor::$question_word) .
+                " " . trim($triplet['predicate']['RAW']) .
+                " " . trim($triplet['subject']['RAW']);
+        }
+        if (isset($triplet['subject']['FEATURED'])
+            && isset($triplet['predicate']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['subject']['FEATURED'])
+            && !TripletExtractor::IsNullOrEmptyString(
+                $triplet['predicate']['FEATURED'])
+        ) {
+            $generated_question_array['FEATURED']['1'] =
+                trim($triplet['subject']['FEATURED']) .
+                " " . trim($triplet['predicate']['FEATURED']) .
+                " " . trim(TripletExtractor::$question_word);
+            $generated_question_array['FEATURED']['2'] =
+                trim(TripletExtractor::$question_word) .
+                " " . trim($triplet['predicate']['FEATURED']) .
+                " " . trim($triplet['subject']['FEATURED']);
+        }
+        return $generated_question_array;
+    }
+}
\ No newline at end of file
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index fd2ab14..c33e644 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -38,6 +38,7 @@ use seekquarry\yioop\library\IndexManager;
 use seekquarry\yioop\library\PhraseParser;
 use seekquarry\yioop\library\Thesaurus;
 use seekquarry\yioop\library\index_bundle_iterators as I;
+use seekquarry\yioop\library\TripletExtractor;
 
 /**
  * logging is done during crawl not through web,
@@ -398,6 +399,21 @@ class PhraseModel extends ParallelModel
                         if (isset($out_results['PAGES'][$out_count])) {
                             $results['PAGES'][$i] =
                                 $out_results['PAGES'][$out_count];
+
+                            if (isset($out_results['PAGES'][$out_count]
+                                [self::QUESTION_TRIPLETS])) {
+                                $triplets_with_answer = 
+                                    $out_results['PAGES'][$out_count]
+                                        [self::QUESTION_TRIPLETS];
+                                $question = trim($phrase);
+
+                                if (isset($triplets_with_answer[$question])) {
+                                   $out_results['PAGES'][$out_count]['ANSWER']
+                                        = $triplets_with_answer[$question];
+                                }
+                            }
+                            $results['PAGES'][$i] =
+                                $out_results['PAGES'][$out_count];
                             $out_count++;
                         }
                     }
@@ -899,8 +915,33 @@ class PhraseModel extends ParallelModel
                 }
             }
         }
+        if ($this->isQuestion($phrase)) {
+            $generated_question = TripletExtractor::questionParser(trim($phrase));
+            if(isset($generated_question['FEATURED'])){
+                $phrase = $generated_question['FEATURED']['1'];
+            }else if(isset($generated_question['RAW'])){
+                $phrase = $generated_question['RAW']['1'];
+            }
+        }
         return $phrase;
     }
+
+    /**
+     * Takes a phrase query entered by user and return true if it is question
+     * and false if not
+     *
+     * @param $phrase any statement
+     * @return bool returns true if statement is question
+     */
+    public function isQuestion($phrase)
+    {
+        $regex_starts_with_que = "/^(who|what|which|where|when|whose|whome|how)(.*)$/";
+        $regex_ends_with_que = "/^(.*)\?$/"; // Not in use
+        if (preg_match($regex_starts_with_que, trim($phrase))) {
+            return true;
+        }
+        return false;
+    }
     /**
      * Matches terms (non white-char strings) in the language $lang_tag in
      * $phrase that begin with  $start_with and don't contain  $not_contain,
diff --git a/src/views/SearchView.php b/src/views/SearchView.php
index abcda33..19df84c 100755
--- a/src/views/SearchView.php
+++ b/src/views/SearchView.php
@@ -334,8 +334,23 @@ class SearchView extends View implements CrawlConstants
                 is_array($page[self::WORD_CLOUD])) { ?>
                 <p><span class="echo-link" <?=$subtitle ?>><?=
                     UrlParser::simplifyUrl($url, 40)." "
-                ?></span><?php
+                ?></span>
+                <?php
+                if(isset($page['ANSWER'])){
+                    $answer = $page['ANSWER'];
+                ?>
+                <span class="echo-link" <?=$subtitle ?>>
+                    <?php e("<span class='word-cloud-spacer'>".
+                    "Possible Answer:"."</span>");?>
+                    <?=$answer." "?>
+                </span>
+                <?php
+                    }
+                ?>
+                <?php
                 $cloud = $page[self::WORD_CLOUD];
+                ?>
+                <?php
                 $i = 1;
                 e("<span class='word-cloud-spacer'>".
                     tl('search_view_word_cloud')."</span>");
diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php
index 16a7ed7..3acfd4c 100644
--- a/src/views/elements/PageoptionsElement.php
+++ b/src/views/elements/PageoptionsElement.php
@@ -403,6 +403,11 @@ class PageOptionsElement extends Element
                 e("<h3>".tl('pageoptions_element_extracted_words')."</h3>");
                 e("<pre>\n{$data['EXTRACTED_WORDS']}\n</pre>");
             }
+            if (isset($data["QUESTIONS_TRIPLET"])) {
+                e("<h3>" . tl('pageoptions_element_extracted_questions') . 
+                    "</h3>");
+                e("<pre>\n{$data['QUESTIONS_TRIPLET']}\n</pre>");
+            }
             if (isset($data["EXTRACTED_META_WORDS"])) {
                 e("<h3>".tl('pageoptions_element_extracted_metas')."</h3>");
                 e("<pre>\n{$data['EXTRACTED_META_WORDS']}\n</pre>");
-- 
2.10.0.windows.1


From b5f477f2e40a02770887101f0e06dd1df0ddb6e4 Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Sat, 22 Oct 2016 15:33:18 -0700
Subject: [PATCH 2/5] Replaced the use of array() by []

---
 src/executables/Fetcher.php      | 15 ++++++++-
 src/library/TripletExtractor.php | 69 ++++++++++++++++++++--------------------
 2 files changed, 48 insertions(+), 36 deletions(-)

diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 963f982..53f2908 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -15,7 +15,7 @@
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
@@ -559,6 +559,7 @@ class Fetcher implements CrawlConstants
         $local_archives = [""];
         while (CrawlDaemon::processHandler()) {
             $start_time = microtime(true);
+            $info = [];
             $fetcher_message_file = C\CRAWL_DIR.
                 "/schedules/{$prefix}FetcherMessages.txt";
             if (file_exists($fetcher_message_file)) {
@@ -577,6 +578,8 @@ class Fetcher implements CrawlConstants
                 if ($info[self::CRAWL_TIME] == 0) {
                     $info[self::STATUS] = self::NO_DATA_STATE;
                     $this->to_crawl = [];
+                } else {
+                   L\crawlLog("Crawl time is now " . $this->crawl_time); 
                 }
             } else if ($this->crawl_type == self::ARCHIVE_CRAWL &&
                     $this->arc_type != "WebArchiveBundle" &&
@@ -981,6 +984,11 @@ class Fetcher implements CrawlConstants
         if (isset($info[self::CRAWL_TIME])
             && ($info[self::CRAWL_TIME] != $this->crawl_time
             || $info[self::CRAWL_TIME] == 0)) {
+            if ($info[self::CRAWL_TIME] > 0) {
+                L\crawlLog("New Crawl Time Found: {$info[self::CRAWL_TIME]}");
+            } else {
+                L\crawlLog("Crawl Time Changing to 0");
+            }
             $dir = C\CRAWL_DIR."/schedules";
             $time_change = true;
             /* Zero out the crawl. If haven't done crawl before, then scheduler
@@ -1033,6 +1041,8 @@ class Fetcher implements CrawlConstants
                 "{$this->crawl_time}.txt") && file_exists(
                 "$dir/$prefix".self::fetch_batch_name.
                     "{$this->crawl_time}.txt")) {
+                L\crawlLog("Loading old batches for ".
+                    "{$this->crawl_time}.");
                 $info = unserialize(file_get_contents(
                     "$dir/$prefix".self::fetch_crawl_info.
                         "{$this->crawl_time}.txt"));
@@ -1763,6 +1773,9 @@ class Fetcher implements CrawlConstants
                 }
                 $doc_info = $processor->handle($site[self::PAGE],
                     $site[self::URL]);
+                if (C\FETCHER_PROCESS_DELAY > 0 ) {
+                    usleep(C\FETCHER_PROCESS_DELAY);
+                }
                 if (isset($site[self::REPOSITORY_TYPE]) &&
                     $site[self::REPOSITORY_TYPE] == self::REPOSITORY_GIT) {
                     $site[self::URL] = $tmp_url_store;
diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php
index 2dd8bbc..6bb3887 100644
--- a/src/library/TripletExtractor.php
+++ b/src/library/TripletExtractor.php
@@ -23,10 +23,9 @@
  * @author Chris Pollett chris@pollett.org
  * @license http://www.gnu.org/licenses/ GPL3
  * @link http://www.seekquarry.com/
- * @copyright 2009 - 2015
+ * @copyright 2009 - 2016
  * @filesource
  */
-
 namespace seekquarry\yioop\library;
 
 use seekquarry\yioop\configs as C;
@@ -49,10 +48,10 @@ class TripletExtractor
     /**
      * Takes a phrase and tags each term in it with its part of speech.
      * So each term in the original phrase gets mapped to term~part_of_speech
-     * This tagger is based on a Brill tagger. It makes uses a lexicon
+     * This tagger is based on a Brill tagger. It uses a lexicon
      * consisting of words from the Brown corpus together with a list of
      * part of speech tags that that word had in the Brown Corpus. These are
-     * used to get an initial part of speech (in word was not present than
+     * used to get an initial part of speech (if word was not present then
      * we assume it is a noun). From this a fixed set of rules is used to modify
      * the initial tag if necessary.
      *
@@ -86,13 +85,13 @@ class TripletExtractor
         }
         preg_match_all("/[\w\d]+/", $text, $matches);
         $tokens = $matches[0];
-        $nouns = array('NN', 'NNS', 'NNP');
-        $verbs = array('VBD', 'VBP', 'VB');
-        $result = array();
-        $previous = array('token' => -1, 'tag' => -1);
+        $nouns = ['NN', 'NNS', 'NNP'];
+        $verbs = ['VBD', 'VBP', 'VB'];
+        $result = [];
+        $previous = ['token' => -1, 'tag' => -1];
         $previous_token = -1;
         sort($tokens);
-        $dictionary = array();
+        $dictionary = [];
         /*
             Notice we sorted the tokens, and notice how we use $cur_pos
             so only advance forward through $lex_string. So the
@@ -118,9 +117,9 @@ class TripletExtractor
         $tag_list = array();
         foreach ($matches[0] as $token) {
             $prev_tag_list = $tag_list;
-            $tag_list = array();
+            $tag_list = [];
             // default to a common noun
-            $current = array('token' => $token, 'tag' => 'NN');
+            $current = ['token' => $token, 'tag' => 'NN'];
             // remove trailing full stops
             $token = strtolower(rtrim($token, "."));
             if (isset($dictionary[$token])) {
@@ -208,7 +207,7 @@ class TripletExtractor
     public static function taggedPartOfSpeechTokensToString($tagged_tokens)
     {
         $tagged_phrase = "";
-        $simplified_parts_of_speech = array(
+        $simplified_parts_of_speech = [
             "NN" => "NN",
             "NNS" => "NN",
             "NNP" => "NN",
@@ -228,7 +227,7 @@ class TripletExtractor
             "RBR" => "AV",
             "RBS" => "AV",
             "WRB" => "AV"
-        );
+        ];
         foreach ($tagged_tokens as $t) {
             $tag = trim($t['tag']);
             $tag = (isset($simplified_parts_of_speech[$tag])) ?
@@ -257,12 +256,12 @@ class TripletExtractor
             fclose($fh);
         }
         preg_match_all("/[\w\d\.]+/", $text, $matches);
-        $nouns = array('NN', 'NNS');
-        $return = array();
+        $nouns = ['NN', 'NNS'];
+        $return = [];
         $i = 0;
         foreach ($matches[0] as $token) {
             // default to a common noun
-            $return[$i] = array('token' => $token, 'tag' => 'NN');
+            $return[$i] = ['token' => $token, 'tag' => 'NN'];
             // remove trailing full stops
             if (substr($token, -1) == '.') {
                 $token = preg_replace('/\.+$/', '', $token);
@@ -275,7 +274,7 @@ class TripletExtractor
             if ($i > 0) {
                 if ($return[$i - 1]['tag'] == 'DT' &&
                     in_array($return[$i]['tag'],
-                        array('VBD', 'VBP', 'VB'))
+                        ['VBD', 'VBP', 'VB'])
                 ) {
                     $return[$i]['tag'] = 'NN';
                 }
@@ -346,7 +345,7 @@ class TripletExtractor
      */
     public static function generateParseTreeUsingRDP($tagger_array)
     {
-        $tree = array();
+        $tree = [];
         $tree = ["cur_node" => 0];
         $tree_np = TripletExtractor::extractNPUsingRDP($tagger_array, $tree);
         $tree = ["cur_node" => $tree_np['cur_node']];
@@ -578,7 +577,7 @@ class TripletExtractor
     */
    public static function extractTriplet($tree)
    {
-       $triplet = array();
+       $triplet = [];
        $triplet['subject'] = TripletExtractor::extractSubjectFromTree($tree);
        $triplet['predicate'] =
            TripletExtractor::extractPredicateFromTree($tree);
@@ -592,7 +591,7 @@ class TripletExtractor
      */
     public static function processTripletForStorage($triplet_tree)
     {
-        $processed_triplet = array();
+        $processed_triplet = [];
         $processed_triplet['RAW'] =
             TripletExtractor::getRawTripletForStorage($triplet_tree);
         $processed_triplet['FEATURED'] =
@@ -606,8 +605,8 @@ class TripletExtractor
      */
     public static function getRawTripletForStorage($triplet_tree)
     {
-        $raw_triplet = array();
-        $question_answer_triplet = array();
+        $raw_triplet = [];
+        $question_answer_triplet = [];
         if (isset($triplet_tree['subject']['RAW'])
             && isset($triplet_tree['predicate']['RAW'])
             && isset($triplet_tree['object']['RAW'])
@@ -641,8 +640,8 @@ class TripletExtractor
      */
     public static function getFeaturedTripletForStorage($triplet_tree)
     {
-        $featured_triplet = array();
-        $question_answer_triplet = array();
+        $featured_triplet = [];
+        $question_answer_triplet = [];
         if (isset($triplet_tree['subject']['FEATURED'])
             && isset($triplet_tree['predicate']['FEATURED'])
             && isset($triplet_tree['object']['FEATURED'])
@@ -685,7 +684,7 @@ class TripletExtractor
      */
     public static function extractSubjectFromTree($tree)
     {
-        $subject = array();
+        $subject = [];
         if (isset($tree['NP']) && $tree['NP'] != null) {
             $tree_np = $tree['NP'];
             $value = TripletExtractor::extractFirstNounFromNPTree($tree_np);
@@ -709,7 +708,7 @@ class TripletExtractor
      */
     public static function extractPredicateFromTree($tree)
     {
-        $predicate = array();
+        $predicate = [];
         if (isset($tree['VP']) && $tree['VP'] != null) {
             $tree_vp = $tree['VP'];
             $value = TripletExtractor::extractDeepestVerbFromVBTree($tree_vp);
@@ -736,7 +735,7 @@ class TripletExtractor
      */
     public static function extractObjectFromTree($tree)
     {
-        $object = array();
+        $object = [];
         if (isset($tree['VP']) && $tree['VP'] != null) {
             $tree_vp = $tree['VP'];
             if (isset($tree_vp['NP']) && $tree_vp['NP'] != null) {
@@ -798,7 +797,7 @@ class TripletExtractor
      */
     public static function extractAttributes($tree)
     {
-        $attribute_map = array();
+        $attribute_map = [];
         if (isset($tree['JJ']) && count($tree['JJ']) > 0) {
             $attribute_map['JJ'] = $tree['JJ']['JJ'];
         }
@@ -846,8 +845,8 @@ class TripletExtractor
      */
     public static function storeStatementArraysAsTriplet($statement_array)
     {
-        $triplets_list = array();
-        $question_list = array();
+        $triplets_list = [];
+        $question_list = [];
         $question_answer_list = array();
         foreach ($statement_array as $key => $value) {
             try {
@@ -894,7 +893,7 @@ class TripletExtractor
         $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill(
             $question_string);
         $index = 0;
-        $generated_question_array = array();
+        $generated_question_array = [];
         if (isset($question_string_tagged[$index]) &&
             ("WRB" == trim($question_string_tagged[$index]['tag']) ||
                 "WP" == trim($question_string_tagged[$index]['tag']))
@@ -934,10 +933,10 @@ class TripletExtractor
      */
     public static function parseWHOQuestion($question_string_tagged, $index)
     {
-        $generated_question_array = array();
+        $generated_question_array = [];
         $tree = ["cur_node" => $index];
         $tree['NP'] = "WHO";
-        $triplet = array();
+        $triplet = [];
         $tree_vp = TripletExtractor::extractVPUsingRDP(
             $question_string_tagged, $tree);
         $triplet['predicate'] = TripletExtractor::extractPredicateFromTree(
@@ -989,7 +988,7 @@ class TripletExtractor
      */
     public static function parseWHPlusQuestion($question_string_tagged, $index)
     {
-        $generated_question_array = array();
+        $generated_question_array = [];
         $aux_verb = "";
         while (isset($question_string_tagged[$index]) &&
             ("VB" == trim($question_string_tagged[$index]['tag']) ||
@@ -1003,7 +1002,7 @@ class TripletExtractor
         }
         $tree = ["cur_node" => $index];
         $tree['NP'] = "WHPlus";
-        $triplet = array();
+        $triplet = [];
         $tree_np = TripletExtractor::extractNPUsingRDP(
             $question_string_tagged, $tree);
         $triplet['subject'] = TripletExtractor::extractSubjectFromTree(
-- 
2.10.0.windows.1


From e6f31b220bc2d531512c8129bdf6a6fc94d0702b Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Sat, 22 Oct 2016 15:42:07 -0700
Subject: [PATCH 3/5] Removing extra characters in Fetcher.php

---
 src/executables/Fetcher.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index 53f2908..9b9161f 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -15,7 +15,7 @@
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.89
+ * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
-- 
2.10.0.windows.1


From 08364c1e03ecf40c1a6264b2ba6920270bbe073a Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Wed, 2 Nov 2016 22:24:55 -0700
Subject: [PATCH 4/5] Implemented review changes 1. Wrapped some lines > 80
 characters. 2. Code formatting, removed some repeated code

---
 src/library/TripletExtractor.php | 238 +++++++++++++++++++++++++++------------
 1 file changed, 163 insertions(+), 75 deletions(-)

diff --git a/src/library/TripletExtractor.php b/src/library/TripletExtractor.php
index 6bb3887..77ff6a1 100644
--- a/src/library/TripletExtractor.php
+++ b/src/library/TripletExtractor.php
@@ -107,17 +107,15 @@ class TripletExtractor
                 $line = trim(substr($lex_string, $token_pos,
                     $cur_pos - $token_pos));
                 $tag_list = explode(' ', $line);
-                $dictionary[strtolower(rtrim($token, "."))] =
-                    array_slice($tag_list, 1);
+                $dictionary[$token] = array_slice($tag_list, 1);
                 $cur_pos++;
             }
         }
         // now using our dictionary we tag
         $i = 0;
-        $tag_list = array();
+        $tag_list = [];
+        $prev_tag_list = [];
         foreach ($matches[0] as $token) {
-            $prev_tag_list = $tag_list;
-            $tag_list = [];
             // default to a common noun
             $current = ['token' => $token, 'tag' => 'NN'];
             // remove trailing full stops
@@ -193,6 +191,8 @@ class TripletExtractor
             $i++;
             $previous = $current;
             $previous_token = $token;
+            $prev_tag_list = $tag_list;
+            $tag_list = [];
         }
         return $result;
     }
@@ -243,7 +243,7 @@ class TripletExtractor
      * @param $text any statement
      * @return array words tagged with POS tags
      */
-    public static function partOfSpeechTagger_Brill($text)
+    public static function partOfSpeechTaggerBrill($text)
     {
         static $dict = null;
         $lexicon = C\LOCALE_DIR . "/en_US/resources/lexicon.txt";
@@ -509,6 +509,10 @@ class TripletExtractor
     }
 
     /**
+     * Takes current tree and returns
+     * tree by adding auxiliary verb
+     * node to it
+     *
      * @param $tagger_array POS tagged array
      * @param $tree current tree
      * @return mixed VP added tree
@@ -539,6 +543,9 @@ class TripletExtractor
     }
 
     /**
+     * Takes current tree and returns
+     * tree by adding Verb node to it.
+     *
      * @param $tagger_array POS tagged tree
      * @param $tree current tree
      * @return mixed VB added tree
@@ -572,6 +579,9 @@ class TripletExtractor
         return $tree;
    }
    /**
+     * Takes current tree and returns
+     * a triplet extracted from the tree.
+     *
     * @param $tree fully generated tree
     * @return array triplet array
     */
@@ -586,6 +596,10 @@ class TripletExtractor
     }
 
     /**
+     * Takes triplet tree and returns
+     * the processed triplet from the
+     * tree.
+     *
      * @param $triplet_tree any statement
      * @return array processed triplet
      */
@@ -600,6 +614,10 @@ class TripletExtractor
     }
 
     /**
+     * Takes triplet tree and returns
+     * an array of raw
+     * triplets.
+     *
      * @param $triplet_tree triplet array
      * @return array raw triplet array
      */
@@ -610,9 +628,12 @@ class TripletExtractor
         if (isset($triplet_tree['subject']['RAW'])
             && isset($triplet_tree['predicate']['RAW'])
             && isset($triplet_tree['object']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['RAW'])
+            && !TripletExtractor::isNullOrEmptyString(
+                                    $triplet_tree['subject']['RAW'])
+            && !TripletExtractor::isNullOrEmptyString(
+                                    $triplet_tree['predicate']['RAW'])
+            && !TripletExtractor::isNullOrEmptyString(
+                                    $triplet_tree['object']['RAW'])
         ) {
 
             $SUBJECT = trim($triplet_tree['subject']['RAW']);
@@ -620,21 +641,31 @@ class TripletExtractor
             $OBJECT = trim($triplet_tree['object']['RAW']);
 
             $raw_triplet['SUBJECT'] =
-                TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT;
+                TripletExtractor::$question_word . " " .
+                                    $PREDICATE . " " . $OBJECT;
             $raw_triplet['PREDICATE'] =
-                $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT;
+                $SUBJECT . " " . TripletExtractor::$question_word . " " .
+                                    $OBJECT;
             $raw_triplet['OBJECT'] =
-                $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word;
+                $SUBJECT . " " . $PREDICATE . " " .
+                                    TripletExtractor::$question_word;
 
-            $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT;
-            $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE;
-            $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT;
+            $question_answer_triplet[TripletExtractor::$question_word . " " .
+                            $PREDICATE . " " . $OBJECT] = $SUBJECT;
+            $question_answer_triplet[$SUBJECT . " " .
+                TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE;
+            $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " .
+                TripletExtractor::$question_word] = $OBJECT;
             $raw_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet;
         }
         return $raw_triplet;
     }
 
     /**
+     * Takes triplet tree and returns
+     * an array of featured
+     * triplets.
+     *
      * @param $triplet_tree triplet array
      * @return array featured triplet array
      */
@@ -645,24 +676,34 @@ class TripletExtractor
         if (isset($triplet_tree['subject']['FEATURED'])
             && isset($triplet_tree['predicate']['FEATURED'])
             && isset($triplet_tree['object']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['subject']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['predicate']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString($triplet_tree['object']['FEATURED'])
+            && !TripletExtractor::isNullOrEmptyString(
+                                    $triplet_tree['subject']['FEATURED'])
+            && !TripletExtractor::isNullOrEmptyString(
+                                    $triplet_tree['predicate']['FEATURED'])
+            && !TripletExtractor::isNullOrEmptyString(
+                                    $triplet_tree['object']['FEATURED'])
         ) {
             $SUBJECT = trim($triplet_tree['subject']['FEATURED']);
             $PREDICATE = trim($triplet_tree['predicate']['FEATURED']);
             $OBJECT = trim($triplet_tree['object']['FEATURED']);
 
             $featured_triplet['SUBJECT'] =
-                TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT;
+                TripletExtractor::$question_word . " " . $PREDICATE .
+                                    " " . $OBJECT;
             $featured_triplet['PREDICATE'] =
-                $SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT;
+                $SUBJECT . " " . TripletExtractor::$question_word .
+                                    " " . $OBJECT;
             $featured_triplet['OBJECT'] =
-                $SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word;
+                $SUBJECT . " " . $PREDICATE . " " .
+                                    TripletExtractor::$question_word;
 
-            $question_answer_triplet[TripletExtractor::$question_word . " " . $PREDICATE . " " . $OBJECT] = $SUBJECT;
-            $question_answer_triplet[$SUBJECT . " " . TripletExtractor::$question_word . " " . $OBJECT] = $PREDICATE;
-            $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " . TripletExtractor::$question_word] = $OBJECT;
+            $question_answer_triplet[TripletExtractor::$question_word . " " .
+                                    $PREDICATE . " " . $OBJECT] = $SUBJECT;
+            $question_answer_triplet[$SUBJECT . " " .
+                                    TripletExtractor::$question_word .
+                                                " " . $OBJECT] = $PREDICATE;
+            $question_answer_triplet[$SUBJECT . " " . $PREDICATE . " " .
+                                    TripletExtractor::$question_word] = $OBJECT;
 
             $featured_triplet['QUESTION_ANSWER_LIST'] = $question_answer_triplet;
         }
@@ -670,15 +711,22 @@ class TripletExtractor
     }
 
     /**
+     * Takes a string and checks if
+     * it is null or empty.
+     *
      * @param $string any string
      * @return bool true if null of empty string
      */
-    public static function IsNullOrEmptyString($string)
+    public static function isNullOrEmptyString($string)
     {
         return (!isset($string) || trim($string) === '');
     }
 
     /**
+     * Takes current tree and returns
+     * the array of text tagged as
+     * Subject.
+     *
      * @param $tree generated tree
      * @return array subject array
      */
@@ -690,7 +738,8 @@ class TripletExtractor
             $value = TripletExtractor::extractFirstNounFromNPTree($tree_np);
             $subject['RAW'] = $value;
             $featured_subject = "";
-            $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_np));
+            $it = new \RecursiveIteratorIterator(
+                                new \RecursiveArrayIterator($tree_np));
             foreach ($it as $v) {
                 $featured_subject .= $v . " ";
             }
@@ -703,6 +752,10 @@ class TripletExtractor
     }
 
     /**
+     * Takes current tree and returns
+     * the array of text tagged as
+     * Predicate.
+     *
      * @param $tree generated tree
      * @return array predicate array
      */
@@ -716,7 +769,8 @@ class TripletExtractor
             $featured_predicate = "";
             if (isset($tree_vp['VB']) && $tree_vp['VB'] != null) {
                 $tree_vb = $tree_vp['VB'];
-                $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($tree_vb));
+                $it = new \RecursiveIteratorIterator(
+                                    new \RecursiveArrayIterator($tree_vb));
                 foreach ($it as $v) {
                     $featured_predicate .= $v . " ";
                 }
@@ -730,6 +784,10 @@ class TripletExtractor
     }
 
     /**
+     * Takes current tree and returns
+     * the array of text tagged as
+     * Object.
+     *
      * @param $tree generated tree
      * @return array object array
      */
@@ -743,7 +801,8 @@ class TripletExtractor
                 $value = TripletExtractor::extractFirstNounFromNPTree($nb);
                 $object['RAW'] = $value;
                 $featured_object = "";
-                $it = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($nb));
+                $it = new \RecursiveIteratorIterator(
+                                    new \RecursiveArrayIterator($nb));
                 foreach ($it as $v) {
                     $featured_object .= $v . " ";
                 }
@@ -760,6 +819,9 @@ class TripletExtractor
     }
 
     /**
+     * Takes noun phrase tree and returns
+     * the first noun from the tree.
+     *
      * @param $tree_np noun phrase subtree
      * @return string first noun
      */
@@ -776,6 +838,9 @@ class TripletExtractor
     }
 
     /**
+     * Takes verb phrase tree and returns
+     * the base form of the verb.
+     *
      * @param $tree_vp verb phrase subtree
      * @return string deepest verb
      */
@@ -792,6 +857,10 @@ class TripletExtractor
     }
 
     /**
+     * Takes current tree and returns
+     * attribute maps for noun, adjectives,
+     * preposition.
+     *
      * @param $tree generated tree
      * @return array attributes array
      */
@@ -822,6 +891,11 @@ class TripletExtractor
     }
 
     /**
+     * Takes the statement, applies
+     * the rules defined in the
+     * lexicon, assigns parts of speech
+     * and generates a triplet tree.
+     *
      * @param $statement any statement
      * @return array processed triplet
      */
@@ -829,7 +903,7 @@ class TripletExtractor
     {
         try {
             $tagged_statement =
-                TripletExtractor::partOfSpeechTagger_Brill($statement);
+                TripletExtractor::partOfSpeechTaggerBrill($statement);
             $statement_tree =
                 TripletExtractor::generateParseTreeUsingRDP($tagged_statement);
             $triplet_tree = TripletExtractor::extractTriplet($statement_tree);
@@ -840,6 +914,11 @@ class TripletExtractor
     }
 
     /**
+     * Process individual statements
+     * from the statement array. Generate
+     * a list of question and answer
+     * pairs.
+     *
      * @param $statement_array array of statements
      * @return array list of triplets
      */
@@ -851,23 +930,32 @@ class TripletExtractor
         foreach ($statement_array as $key => $value) {
             try {
                 if (str_word_count($key) >= 3) {
-                    $extracted_triplet = TripletExtractor::storeStatementAsTriplet($key);
+                    $extracted_triplet =
+                            TripletExtractor::storeStatementAsTriplet($key);
 
                     if (isset($extracted_triplet['RAW']) &&
                         sizeof($extracted_triplet['RAW']) > 0) {
-                        $question_list[$extracted_triplet['RAW']['SUBJECT']] = $value;
-                        $question_list[$extracted_triplet['RAW']['PREDICATE']] = $value;
-                        $question_list[$extracted_triplet['RAW']['OBJECT']] = $value;
-                        $question_answer_list = array_merge($question_answer_list,
+                        $question_list[$extracted_triplet['RAW']['SUBJECT']]
+                                            = $value;
+                        $question_list[$extracted_triplet['RAW']['PREDICATE']]
+                                            = $value;
+                        $question_list[$extracted_triplet['RAW']['OBJECT']]
+                                            = $value;
+                        $question_answer_list =
+                            array_merge($question_answer_list,
                             $extracted_triplet['RAW']['QUESTION_ANSWER_LIST']);
                     }
 
                     if (isset($extracted_triplet['FEATURED']) &&
                         sizeof($extracted_triplet['FEATURED']) > 0) {
-                        $question_list[$extracted_triplet['FEATURED']['SUBJECT']] = $value;
-                        $question_list[$extracted_triplet['FEATURED']['PREDICATE']] = $value;
-                        $question_list[$extracted_triplet['FEATURED']['OBJECT']] = $value;
-                        $question_answer_list = array_merge($question_answer_list,
+                        $question_list[$extracted_triplet['FEATURED']['SUBJECT']]
+                                            = $value;
+                        $question_list[$extracted_triplet['FEATURED']['PREDICATE']]
+                                            = $value;
+                        $question_list[$extracted_triplet['FEATURED']['OBJECT']]
+                                            = $value;
+                        $question_answer_list =
+                            array_merge($question_answer_list,
                             $extracted_triplet['FEATURED']['QUESTION_ANSWER_LIST']);
                     }
                 }
@@ -890,33 +978,30 @@ class TripletExtractor
      */
     public static function questionParser($question_string)
     {
-        $question_string_tagged = TripletExtractor::partOfSpeechTagger_Brill(
+        $question_string_tagged = TripletExtractor::partOfSpeechTaggerBrill(
             $question_string);
         $index = 0;
         $generated_question_array = [];
-        if (isset($question_string_tagged[$index]) &&
-            ("WRB" == trim($question_string_tagged[$index]['tag']) ||
-                "WP" == trim($question_string_tagged[$index]['tag']))
-        ) {
-            if ("WHO" == strtoupper(
-                    trim($question_string_tagged[$index]['token']))) {
-                $index = $index + 1;
-                $generated_question_array =
-                    TripletExtractor::parseWHOQuestion(
-                        $question_string_tagged, $index);
-            } else {
-                if ("WHERE" == strtoupper(
-                        trim($question_string_tagged[$index]['token'])) ||
-                    "WHEN" == strtoupper(
-                        trim($question_string_tagged[$index]['token'])) ||
-                    "WHAT" == strtoupper(
-                        trim($question_string_tagged[$index]['token']))
-                ) {
+        if (isset($question_string_tagged[$index])) {
+            $tag = trim($question_string_tagged[$index]['tag']);
+            if ("WRB" ==  $tag || "WP" == $tag) {
+                $token = strtoupper(
+                    trim($question_string_tagged[$index]['token']));
+                if ("WHO" == $token) {
                     $index = $index + 1;
                     $generated_question_array =
+                        TripletExtractor::parseWHOQuestion(
+                            $question_string_tagged, $index);
+                } else {
+                    if ("WHERE" == $token ||
+                        "WHEN" == $token ||
+                        "WHAT" == $token) {
+                        $index = $index + 1;
+                        $generated_question_array =
                         TripletExtractor::parseWHPlusQuestion_New(
                             $question_string_tagged,
                         $index);
+                    }
                 }
             }
         }
@@ -945,9 +1030,9 @@ class TripletExtractor
             $tree_vp);
         if (isset($triplet['object']['RAW'])
             && isset($triplet['predicate']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['object']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['predicate']['RAW'])
         ) {
             $generated_question_array['RAW']['1'] =
@@ -961,9 +1046,9 @@ class TripletExtractor
         }
         if (isset($triplet['object']['FEATURED'])
             && isset($triplet['predicate']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['object']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['predicate']['FEATURED'])
         ) {
             $generated_question_array['FEATURED']['1'] =
@@ -990,15 +1075,18 @@ class TripletExtractor
     {
         $generated_question_array = [];
         $aux_verb = "";
-        while (isset($question_string_tagged[$index]) &&
-            ("VB" == trim($question_string_tagged[$index]['tag']) ||
-                "VBD" == trim($question_string_tagged[$index]['tag']) ||
-                "VBG" == trim($question_string_tagged[$index]['tag']) ||
-                "VBN" == trim($question_string_tagged[$index]['tag']) ||
-                "VBP" == trim($question_string_tagged[$index]['tag']) ||
-                "VBZ" == trim($question_string_tagged[$index]['tag']))) {
-            $aux_verb .= " " . trim($question_string_tagged[$index]['token']);
-            $index = $index + 1;
+        $verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"];
+        while (isset($question_string_tagged[$index])) {
+            $tag = trim($question_string_tagged[$index]['tag']);
+            if (!in_array($tag, $verb_tags, true)) {
+                /* The first token that is not a verb ends the run of
+                   auxiliary verbs; without this break $index would
+                   never advance and the loop would not terminate. */
+                break;
+            }
+            $token = trim($question_string_tagged[$index]['token']);
+            $aux_verb .= " " . $token;
+            $index = $index + 1;
         }
         $tree = ["cur_node" => $index];
         $tree['NP'] = "WHPlus";
@@ -1014,7 +1102,7 @@ class TripletExtractor
         $triplet['object'] = TripletExtractor::extractObjectFromTree(
             $tree_vp);
         if (isset($aux_verb)
-            && !TripletExtractor::IsNullOrEmptyString($aux_verb)
+            && !TripletExtractor::isNullOrEmptyString($aux_verb)
         ) {
             $triplet['predicate']['RAW'] = trim($aux_verb) .
                 " " . $triplet['predicate']['RAW'];
@@ -1026,9 +1114,9 @@ class TripletExtractor
         }
         if (isset($triplet['subject']['RAW'])
             && isset($triplet['predicate']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['subject']['RAW'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['predicate']['RAW'])
         ) {
             $generated_question_array['RAW']['1'] =
@@ -1042,9 +1130,9 @@ class TripletExtractor
         }
         if (isset($triplet['subject']['FEATURED'])
             && isset($triplet['predicate']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['subject']['FEATURED'])
-            && !TripletExtractor::IsNullOrEmptyString(
+            && !TripletExtractor::isNullOrEmptyString(
                 $triplet['predicate']['FEATURED'])
         ) {
             $generated_question_array['FEATURED']['1'] =
-- 
2.10.0.windows.1


From 7eb3a0a9ca53bf581586414dfa589447fb613857 Mon Sep 17 00:00:00 2001
From: Salil Shenoy <salilshenoy@gmail.com>
Date: Wed, 2 Nov 2016 22:41:20 -0700
Subject: [PATCH 5/5] Wrapping up code lines > 80 columns

---
 src/controllers/components/CrawlComponent.php | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php
index 1efafea..122a48b 100644
--- a/src/controllers/components/CrawlComponent.php
+++ b/src/controllers/components/CrawlComponent.php
@@ -1346,8 +1346,8 @@ class CrawlComponent extends Component implements CrawlConstants
                     PhraseParser::extractPhrasesInLists($phrase_string,
                         $lang);
                 $len = strlen($phrase_string);
-                if (PhraseParser::computeSafeSearchScore($word_lists['WORD_LIST'], $len) <
-                    0.012) {
+                if (PhraseParser::computeSafeSearchScore(
+                    $word_lists['WORD_LIST'], $len) < 0.012) {
                     $meta_ids[] = "safe:true";
                     $safe = true;
                 } else {
@@ -1368,7 +1368,8 @@ class CrawlComponent extends Component implements CrawlConstants
             $data["EXTRACTED_META_WORDS"] = wordwrap($parent->clean(
                 print_r($meta_ids, true), "string"), 75, "\n", true);
             $data["QUESTIONS_TRIPLET"] = wordwrap($parent->clean(
-                print_r($word_lists['QUESTION_ANSWER_LIST'], true), "string"), 75, "\n", true);
+                print_r($word_lists['QUESTION_ANSWER_LIST'], true),
+                "string"), 75, "\n", true);
         }
         return $data;
     }
-- 
2.10.0.windows.1