From 4cc42d91dbe451f4e9c59540cfbaaa37553875c1 Mon Sep 17 00:00:00 2001 From: SalilShenoy Date: Tue, 21 Nov 2017 20:00:59 -0800 Subject: [PATCH] TripletExtraction Improved and Lexicon Table Structure modified --- src/configs/Config.php | 2 + src/configs/Createdb.php | 14 +- src/library/VersionFunctions.php | 53 ++- src/locale/hi/resources/Tokenizer.php | 592 +++++++++++++++------------------- src/models/ProfileModel.php | 4 + 5 files changed, 329 insertions(+), 336 deletions(-) diff --git a/src/configs/Config.php b/src/configs/Config.php index 0a010845..d42a5e7b 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -976,3 +976,5 @@ nsconddefine('AD_LOGO','resources/adv-logo.png'); nsconddefine('SENTENCE_COMPRESSION_ENABLED', false); /** Define cipher to be used in AES */ nsconddefine('AES_256_CBC', 'aes-256-cbc'); +/** The number of rows to be used in bulk insert from Lexicon */ +nsconddefine('NUM_LEX_BULK_INSERTS',100000); diff --git a/src/configs/Createdb.php b/src/configs/Createdb.php index efd9bf31..a2086903 100755 --- a/src/configs/Createdb.php +++ b/src/configs/Createdb.php @@ -264,26 +264,22 @@ foreach ($locales as $locale) { foreach ($lines as $line) { $line = trim($line, " "); $line = explode(" ", $line); - if (empty($line[0]) || empty($line[1])) - continue; $insert_values .= '(\'' . trim($line[0]) . '\',\'' . $locale[0] . '\',\'' . trim($line[1]) . '\'),'; $count++; - if ($count >= 10000) { + if ($count >= C\NUM_LEX_BULK_INSERTS) { $insert_values = rtrim($insert_values, ','); - $query = 'INSERT INTO LEXICON (WORD, LOCALE, POS) VALUES - {$insert_values}'; + $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) + VALUES {$insert_values}"; $db->exec($query); $insert_values = ""; $count = 0; - if ($db->affectedRows() == 0) { - continue; - } } } + if ($count > 0) { $insert_values = rtrim($insert_values, ','); - $query = "INSERT INTO LEXICON (WORD, LOCALE, POS) VALUES + $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) VALUES {$insert_values}"; $db->exec($query); } diff --git a/src/library/VersionFunctions.php b/src/library/VersionFunctions.php index 6131a945..afa22885 100644 --- a/src/library/VersionFunctions.php +++ b/src/library/VersionFunctions.php @@ -1564,6 +1564,55 @@ function upgradeDatabaseVersion54(&$db) */ function upgradeDatabaseVersion55(&$db) { - $db->execute("CREATE TABLE LEXICON(WORD VARCHAR, LOCALE VARCHAR, - POS VARCHAR, PRIMARY KEY(WORD, LOCALE))"); + $db->execute("CREATE TABLE LEXICON( + TERM VARCHAR(". C\LONG_NAME_LEN ."), + LOCALE VARCHAR(" . C\NAME_LEN . "), + PART_OF_SPEECH VARCHAR(16), PRIMARY KEY(TERM, LOCALE))"); + + // Retrieve the locales added to the Locale table + $sql = "SELECT LOCALE_TAG from LOCALE"; + $result = $db->execute($sql); + if ($result) { + $locales = $db->fetchArray($result); + } + /* + * Go through the locales, check of there is a lexicon, + * if present then add it to the Lexicon database. + * as (term, part_of_speech, locale) + */ + foreach ($locales as $locale) { + $folder_name = $locale; + if (strstr($locale, "-")) { + $locale_name = explode("-", $locale); + $folder_name = $locale_name . "_" . $locale_name; + } + $lexicon_file = C\LOCALE_DIR . "/" . $folder_name . + "/resources/lexicon.txt.gz"; + if (file_exists($lexicon_file)) { + $lines = gzfile($lexicon_file); + $insert_values = ""; + $count = 0; + foreach ($lines as $line) { + $line = trim($line, " "); + $line = explode(" ", $line); + $insert_values .= '(' . trim($line[0]) . ',' . $locale[0] . + ',' . trim($line[1]) . '),'; + $count++; + if ($count >= C\NUM_LEX_BULK_INSERTS) { + $insert_values = rtrim($insert_values, ','); + $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) + VALUES {$insert_values}"; + $db->exec($query); + $insert_values = ""; + $count = 0; + } + } + if ($count > 0) { + $insert_values = rtrim($insert_values, ','); + $query = "INSERT INTO LEXICON (TERM, LOCALE, PART_OF_SPEECH) + VALUES {$insert_values}"; + $db->exec($query); + } + } + } } diff --git a/src/locale/hi/resources/Tokenizer.php b/src/locale/hi/resources/Tokenizer.php index f0df8568..08da7bfc 100755 --- a/src/locale/hi/resources/Tokenizer.php +++ b/src/locale/hi/resources/Tokenizer.php @@ -47,12 +47,13 @@ class Tokenizer * List of verb-like parts of speech that might appear in lexicon * @array */ - public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]; + public static $verb_phrases = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", + "RB"]; /** * List of noun-like parts of speech that might appear in lexicon * @array */ - public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "PRP"]; + public static $noun_phrases = ["NN", "NNS", "NNP", "NNPS", "DT"]; /** * List of adjective-like parts of speech that might appear in lexicon * @array @@ -62,8 +63,13 @@ class Tokenizer * List of postpositional-like parts of speech that might appear in lexicon * @array */ - public static $postpositional_phrases = ["inj", "PREP", "proNN", "CONJ", - "INT", "particle", "case", "PSP"]; + public static $postpositional_phrases = ["IN", "inj", "PREP", "proNN", + "CONJ", "INT", "particle", "case", "PSP", "direct_DT", "PRP"]; + /* + * List of questions in Hindi + */ + public static $questions = ["क्या", "कब", "कहा", "क्यों", "कौन", "जिसे", + "जिसका", "कहाँ", "कहां"]; /** * Any unique identifier corresponding to the component of a triplet which * can be answered using a question answer list @@ -88,40 +94,6 @@ class Tokenizer { return $pre_segment; } - /** - * Removes the stop words from the page (used for Word Cloud generation) - * - * @param string $page the page to remove stop words from. - * @return string $page with no stop words - */ - public static function stopwordsRemover($page) - { - $stop_words = [ - "पर ", "इन ", "वह ", "यिह ", "वुह ", "जिन्हें", "जिन्हों", - "तिन्हें", "तिन्हों", "किन्हों", "किन्हें", "इत्यादि", "द्वारा", - "इन्हें", "इन्हों", "उन्हों", "बिलकुल", "निहायत", "ऱ्वासा", - "इन्हीं", "उन्हीं", "उन्हें", "इसमें", "जितना", "दुसरा", - "कितना", "दबारा", "साबुत", "वग़ैरह", "दूसरे", "कौनसा", "लेकिन", - "होता", "करने", "किया", "लिये", "अपने", "नहीं", "दिया", "इसका", - "करना", "वाले", "सकते", "इसके", "सबसे", "होने", "करते", "बहुत", - "वर्ग", "करें", "होती", "अपनी", "उनके", "कहते", "होते", "करता", - "उनकी", "इसकी", "सकता", "रखें", "अपना", "उसके", "जिसे", - "तिसे", "किसे", "किसी", "काफ़ी", "पहले", "नीचे", "बाला", "यहाँ", - "जैसा", "जैसे", "मानो", "अंदर", "भीतर", "पूरा", "सारा", "होना", - "उनको", "वहाँ", "वहीं", "जहाँ", "जीधर","उनका", "इनका", "के", - "हैं", "गया", "बनी", "एवं", "हुआ", "साथ", "बाद", "लिए", "कुछ", - "कहा", "यदि", "हुई", "इसे", "हुए", "अभी", "सभी", "कुल", "रहा", - "रहे", "इसी", "उसे", "जिस", "जिन", "तिस", "तिन", "कौन", "किस", - "कोई", "ऐसे", "तरह", "किर", "साभ", "संग", "यही", "बही", "उसी", - "फिर", "मगर", "का", "एक", "यह", "से", "को", "इस", "कि", "जो", - "कर", "मे", "ने", "तो", "ही", "या", "हो", "था", "तक", "आप", "ये", - "थे", "दो", "वे", "थी", "जा", "ना", "उस", "एस", "पे", "उन", "सो", - "भी", "और", "घर", "तब", "जब", "अत", "व", "न" - ]; - $page = preg_replace('/\b('.implode('|',$stop_words).')\b/u', '', - $page); - return $page; - } /** * Computes the stem of an Hindi word * @@ -130,10 +102,6 @@ class Tokenizer */ public static function stem($word) { - if (in_array($word, self::$no_stem_list)) { - return $word; - } - $word = self::removeSuffix($word); return $word; } /** @@ -144,30 +112,6 @@ class Tokenizer */ private static function removeSuffix($word) { - $length = mb_strlen($word); - if ($length > 5) { - $last_three = mb_substr($word, -3); - if (in_array($last_three, ["िया", "ियो"])) { - $word = mb_substr($word, 0, -3); - return $word; - } - } - if ($length > 4) { - $last_two = mb_substr($word, -2); - if (in_array($last_two, ["ाए", " ाओ", " ुआ", " ुओ", - "ये", " ेन", " ेण", " ीय", "टी", "ार", "ाई"])) { - $word = mb_substr($word, 0, -2); - return $word; - } - } - if ($length > 3) { - $last_one = mb_substr($word, -1); - if (in_array($last_one, [" ा", " े", " ी", " ो", "ि ", - "अ"])) { - $word = mb_substr($word, 0, -1); - return $word; - } - } return $word; } /** @@ -195,7 +139,7 @@ class Tokenizer */ public static function tagTokenizePartofSpeech($text) { - $tokens = preg_split("/[\s]+/", $text); + $tokens = preg_split("/\s+/u", $text); $result = []; $tag_list = []; $i = 0; @@ -204,25 +148,25 @@ class Tokenizer { //Tag the tokens as found in the Lexicon $token = trim($token); - $current = ['token' => $token, 'tag' => 'UNKNOWN']; - $word = $current['token']; - $sql = "SELECT * FROM LEXICON WHERE WORD = '{$word}' - AND LOCALE = 'hi'"; + $current = ["token" => $token, "tag" => "UNKNOWN"]; + $term = $current["token"]; + $sql = "SELECT PART_OF_SPEECH FROM LEXICON WHERE TERM = '{$term}' + AND LOCALE = 'hi'"; $queryResult = @$model->db->execute($sql); if ($queryResult !== false) { $row = $model->db->fetchArray($queryResult); - $current['tag'] = $row['POS']; + $current["tag"] = $row["PART_OF_SPEECH"]; } if (is_numeric($token)) { - $current['tag'] = "NN"; + $current["tag"] = "NN"; } else if (strcmp($token,"है") == 0 || strcmp($token, "हैं") == 0) { - $current['tag'] = "VB"; + $current["tag"] = "VB"; } - if (!isset($current['tag'])) { - $current['tag'] = "UNKNOWN"; + if (!isset($current["tag"])) { + $current["tag"] = "UNKNOWN"; } $result[$i] = $current; @@ -236,65 +180,75 @@ class Tokenizer public static function tagUnknownWords($partiallyTaggedText) { $result = $partiallyTaggedText; - $verbs = ['VBZ','VBD','VBN']; + $verbs = ["VBZ","VBD","VBN"]; $length = count($result); $previous = $result[0]; for ($i = 1; $i < $length; $i++) { $current = $result[$i]; - $current['token'] = trim($current['token']); - $current['tag'] = trim($current['tag']); - if ($current['tag'] == "UNKNOWN" || $previous['tag'] == "UNKNOWN") - { - //RULE 1: If the previous word tagged is a Adjective / Pronoun - // Postposition then the current word is likely to be a noun - if ($previous['tag'] == 'JJ' || - $previous['tag'] == 'PRO_NN' || - $previous['tag'] == 'POST_POS') { - $current['tag'] = 'NN'; + $current["token"] = trim($current["token"]); + $current["tag"] = trim($current["tag"]); + if ($current["tag"] == "UNKNOWN" || $previous["tag"] == "UNKNOWN") { + /** + * RULE 1: If the previous word tagged is a Adjective Pronoun + * Postposition then the current word is likely to be a noun + */ + if ($previous["tag"] == "JJ" || + $previous["tag"] == "PRO_NN" || + $previous["tag"] == "POST_POS") { + $current["tag"] = "NN"; $result[$i] = $current; } - //RULE 2: If the current word is a verb then the previous word is - //likely to be a noun - if (in_array($current['tag'], $verbs)) { - $previous['tag'] = 'NN'; - $result[$i] = $previous; + /** + * RULE 2: If the current word is a verb then the previous + * word is likely to be a noun + */ + if (in_array($current["tag"], $verbs)) { + $previous["tag"] = "NN"; + $result[$i-1] = $previous; } - //PRONOUN IDENTIFICATION - //RULE 3: If the previous word is unknown and cuurent word is a - //noun then the previous word is most likely to be a pronoun - if ($previous['tag'] == 'UNKNOWN' && - $current['tag'] == 'NN') { - $previous['tag'] = 'PRP'; + /** + * PRONOUN IDENTIFICATION + * RULE 3: If the previous word is unknown and cuurent word + * is a noun then the previous word is most likely to be a + * pronoun + */ + if ($previous["tag"] == "UNKNOWN" && + $current["tag"] == "NN") { + $previous["tag"] = "PRP"; $result[$i-1] = $previous; } - //VERB IDENTIFICATION - //RULE 4: If the current word is tagged as Auxilary verb and - //previous word is tagged as Unknown then most likely that the - //previous word is a verb - if ($current['tag'] == 'VAUX' && - $previous['tag'] == 'UNKNOWN') { - $previous['tag'] = 'VB'; + /** + * VERB IDENTIFICATION + * RULE 4: If the current word is tagged as Auxilary verb and + * previous word is tagged as Unknown then most likely that + * the previous word is a verb + */ + if ($current["tag"] == "VAUX" && + $previous["tag"] == "UNKNOWN") { + $previous["tag"] = "VB"; $result[$i-1] = $previous; - } - //ADJECTIVE IDENTIFIATION - //RULE 5: if the currennt word ends with 'तम' or 'इक' or 'िक' - //or 'तर' then the word is an adjective - if(mb_substr($current['token'], -2, 2) == "इक" || - mb_substr($current['token'], -2, 2) == "िक" || - mb_substr($current['token'], -2, 2) == "तर" || - mb_substr($current['token'], -2, 2) == "तम") { - $current['tag'] = 'AJ'; + } + /** + * ADJECTIVE IDENTIFIATION + * RULE 5: if the currennt word ends with "तम" or "इक" or "िक" + * or "तर" then the word is an adjective + */ + if(mb_substr($current["token"], -2, 2) == "इक" || + mb_substr($current["token"], -2, 2) == "िक" || + mb_substr($current["token"], -2, 2) == "तर" || + mb_substr($current["token"], -2, 2) == "तम") { + $current["tag"] = "JJ"; $result[$i] = $current; } - if ($current['tag'] == "UNKNOWN") { - $current['tag'] = 'NN'; + if ($current["tag"] == "UNKNOWN") { + $current["tag"] = "NN"; $result[$i] = $current; } - if ($previous['tag'] == "UNKNOWN"){ - $previous['tag'] = 'NN'; + if ($previous["tag"] == "UNKNOWN"){ + $previous["tag"] = "NN"; $result[$i-1] = $previous; } } @@ -327,10 +281,10 @@ class Tokenizer "direct_DT" => "DT", ]; foreach ($tagged_tokens as $t) { - $tag = trim($t['tag']); + $tag = trim($t["tag"]); $tag = (isset($simplified_parts_of_speech[$tag])) ? $simplified_parts_of_speech[$tag] : $tag; - $token = ($with_tokens) ? $t['token'] . "~" : ""; + $token = ($with_tokens) ? $t["token"] . "~" : ""; $tagged_phrase .= $token . $tag . " "; } return $tagged_phrase; @@ -353,21 +307,21 @@ class Tokenizer { //Combining multiple noun into one $noun_string = ""; - $cur_node = $tree['cur_node']; - while (isset($tagged_phrase[$cur_node]['tag']) && - (in_array(trim($tagged_phrase[$cur_node]['tag']), + $cur_node = $tree["cur_node"]; + while (isset($tagged_phrase[$cur_node]["tag"]) && + (in_array(trim($tagged_phrase[$cur_node]["tag"]), self::$noun_phrases))) { - $noun_string .= " " . $tagged_phrase[$cur_node]['token']; + $noun_string .= " " . $tagged_phrase[$cur_node]["token"]; $cur_node++; } if (!empty($noun_string)) { $tree["NN"] = $noun_string; } - $tree['cur_node'] = $cur_node; + $tree["cur_node"] = $cur_node; return $tree; } /** - * Takes a part-of-speech tagged phrase and pre-tree with a + * Takes a part-of-speech tagged phrase and parse-tree with a * parse-from position and builds a parse tree for a sequence of * postpositional phrases if possible * @@ -379,56 +333,57 @@ class Tokenizer * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase */ - public static function extractPostposition($tagged_phrase, $tree, + public static function extractPostpositionPhrase($tagged_phrase, $tree, $index = 1) { - $cur_node = $tree['cur_node']; - if (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), + $cur_node = $tree["cur_node"]; + $tree_pp["cur_node"] = $tree["cur_node"]; + if (isset ($tagged_phrase[$cur_node]["tag"]) && + in_array($tagged_phrase[$cur_node]["tag"], + self::$postpositional_phrases)) { + $pp_string =""; + while (isset($tagged_phrase[$cur_node]["tag"]) && + in_array($tagged_phrase[$cur_node]["tag"], self::$postpositional_phrases)) { - $preposition_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$postpositional_phrases)) { - $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; + $pp_string .= " " . $tagged_phrase[$cur_node]["token"]; $cur_node++; } - if (!empty($preposition_string)) { - $tree["IN_$index"] = $preposition_string; - } - if (isset($tagged_phrase[$cur_node]['tag']) && - trim($tagged_phrase[$cur_node]['tag']) == "DT") { - $tree['DT_$index'] = $tagged_phrase[$cur_node]['token']; - $cur_node++; + if (!empty($pp_string)) { + $tree_pp["IN_$index"] = $pp_string; } $adjective_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$adjective_phrases)) { - $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; + while (isset($tagged_phrase[$cur_node]["tag"]) && + in_array($tagged_phrase[$cur_node]["tag"], + self::$adjective_phrases)) { + $adjective_string .= " " . + $tagged_phrase[$cur_node]["token"]; $cur_node++; } if (!empty($adjective_string)) { - $tree["JJ_$index"] = $adjective_string; + $tree_pp["JJ_$index"] = $adjective_string; } - $prep_noun_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$noun_phrases)) { - $prep_noun_string .= " ". $tagged_phrase[$cur_node]['token']; + $nn_string = ""; + while (isset($tagged_phrase[$cur_node]["tag"]) && + in_array($tagged_phrase[$cur_node]["tag"], + self::$noun_phrases)) { + $nn_string .= " " . $tagged_phrase[$cur_node]["token"]; $cur_node++; } - if ($prep_noun_string) { - $tree["NP_$index"] = $prep_noun_string; + if (!empty($nn_string)) { + $tree_pp["NN_$index"] = $nn_string; } - $tree_next = self::extractPostposition($tagged_phrase, - ["cur_node" => $cur_node], $index + 1); + $tree_pp["cur_node"] = $cur_node; + $tree_next = self::extractPostpositionPhrase($tagged_phrase, + $tree_pp, $index+1); + $tree_pp = array_merge ($tree_pp, $tree_next); } - $tree['cur_node'] = $cur_node; + $tree["cur_node"] = $tree_pp["cur_node"]; + unset ($tree_pp["cur_node"]); + $tree["POST"] = $tree_pp; return $tree; } /** - * Takes a part-of-speech tagged phrase and pre-tree with a + * Takes a part-of-speech tagged phrase and parse-tree with a * parse-from position and builds a parse tree for a noun phrase if possible * * @param array $tagged_phrase @@ -438,30 +393,26 @@ class Tokenizer * current parse position in $tagged_phrase] * @return array has fields * "cur_node" index of how far we parsed $tagged_phrase - * "JJ" with value an adjective subtree - * "POST" with value a post position subtree + * "JJ" with value an Adjective subtree + * "NN" with value of a Noun Subtree */ public static function extractNounPhrase($tagged_phrase, $tree) { - $cur_node = $tree['cur_node']; - $tree_jj = self::extractAdjective($tagged_phrase, - ['cur_node' => $tree['cur_node']]); - $tree_nn =self::extractNoun($tagged_phrase, - ['cur_node' => $tree_jj['cur_node']]); - $tree_post = self::extractPostposition($tagged_phrase, - ['cur_node' => $tree_nn['cur_node']]); - if ($tree_nn['cur_node'] == $cur_node) { - $tree['NP'] = ""; + $cur_node = $tree["cur_node"]; + $tree_jj = self::extractAdjective($tagged_phrase, + ["cur_node" => $tree["cur_node"]]); + $tree_nn = self::extractNoun($tagged_phrase, + ["cur_node" => $tree_jj["cur_node"]]); + if ($tree_nn["cur_node"] == $cur_node) { + $tree["NP"] = ""; } else { - $cur_node = $tree_post['cur_node']; - unset($tree_jj['cur_node']); - $tree_new_sub['JJ'] = $tree_jj; - unset($tree_nn['cur_node']); - $tree_new_sub['NN'] = $tree_nn; - unset($tree_post['cur_node']); - $tree_new_sub['POST'] = $tree_post; - $tree_new['cur_node'] = $cur_node; - $tree_new['NP'] = $tree_new_sub; + $cur_node = $tree_nn["cur_node"]; + unset($tree_jj["cur_node"]); + $tree_new_sub["JJ"] = $tree_jj; + unset($tree_nn["cur_node"]); + $tree_new_sub["NN"] = $tree_nn; + $tree_new["cur_node"] = $cur_node; + $tree_new["NP"] = $tree_new_sub; return $tree_new; } return $tree; @@ -482,24 +433,18 @@ class Tokenizer */ public static function extractVerb($tagged_phrase, $tree) { - $cur_node = $tree['cur_node']; - // skip stuff before verb (intensifiers and adverbs) - while (isset($tagged_phrase[$cur_node]['tag']) && - !in_array(trim($tagged_phrase[$cur_node]['tag']), - self::$verb_phrases)) { - $cur_node++; - } + $cur_node = $tree["cur_node"]; $verb_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), + while (isset($tagged_phrase[$cur_node]["tag"]) && + in_array(trim($tagged_phrase[$cur_node]["tag"]), self::$verb_phrases)) { - $verb_string .= " " . $tagged_phrase[$cur_node]['token']; + $verb_string .= " " . $tagged_phrase[$cur_node]["token"]; $cur_node++; } if (!empty($verb_string)) { $tree["VB"] = $verb_string; } - $tree['cur_node'] = $cur_node; + $tree["cur_node"] = $cur_node; return $tree; } /** @@ -518,39 +463,40 @@ class Tokenizer */ public static function extractVerbPhrase($tagged_phrase, $tree) { - $cur_node = $tree['cur_node']; - $tree_vb = self::extractVerb($tagged_phrase, ['cur_node' => $cur_node]); - if ($tree_vb['cur_node'] == $cur_node) { + $cur_node = $tree["cur_node"]; + $tree_vb = self::extractVerb($tagged_phrase, ["cur_node" => $cur_node]); + if ($tree_vb["cur_node"] == $cur_node) { + $tree["VP"] = []; return $tree; } - $cur_node = $tree_vb['cur_node']; - $preposition_string = ""; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), + $cur_node = $tree_vb["cur_node"]; + $postposition_string = ""; + while (isset($tagged_phrase[$cur_node]["tag"]) && + in_array(trim($tagged_phrase[$cur_node]["tag"]), self::$postpositional_phrases)) { - $preposition_string .= " ". $tagged_phrase[$cur_node]['token']; + $postposition_string .= " ". $tagged_phrase[$cur_node]["token"]; $cur_node++; } - if (!empty($preposition_string)) { - $tree_vb["IN"] = $preposition_string; + if (!empty($postposition_string)) { + $tree_vb["IN"] = $postposition_string; } $tree_np = self::extractNounPhrase($tagged_phrase, - ['cur_node' => $cur_node]); + ["cur_node" => $cur_node]); $tree_new = []; $tree_new_sub = []; - if ($tree_np['cur_node'] != $cur_node) { - $cur_node = $tree_np['cur_node']; - unset($tree_vb['cur_node'], $tree_np['cur_node']); - $tree_new_sub['VB'] = $tree_vb; - $tree_new_sub['NP'] = $tree_np['NP']; - $tree_new['cur_node'] = $cur_node; - $tree_new['VP'] = $tree_new_sub; + if ($tree_np["cur_node"] != $cur_node) { + $cur_node = $tree_np["cur_node"]; + unset($tree_vb["cur_node"], $tree_np["cur_node"]); + $tree_new_sub["VB"] = $tree_vb; + $tree_new_sub["NP"] = $tree_np["NP"]; + $tree_new["cur_node"] = $cur_node; + $tree_new["VP"] = $tree_new_sub; return $tree_new; } - unset($tree_vb['cur_node']); - $tree_new_sub['VB'] = $tree_vb; - $tree_new['cur_node'] = $cur_node; - $tree_new['VP'] = $tree_new_sub; + unset($tree_vb["cur_node"]); + $tree_new_sub["VB"] = $tree_vb; + $tree_new["cur_node"] = $cur_node; + $tree_new["VP"] = $tree_new_sub; return $tree_new; } /** @@ -570,17 +516,17 @@ class Tokenizer public static function extractAdjective($tagged_phrase, $tree) { $adjective_string = ""; - $cur_node = $tree['cur_node']; - while (isset($tagged_phrase[$cur_node]['tag']) && - in_array(trim($tagged_phrase[$cur_node]['tag']), + $cur_node = $tree["cur_node"]; + while (isset($tagged_phrase[$cur_node]["tag"]) && + in_array(trim($tagged_phrase[$cur_node]["tag"]), self::$adjective_phrases)) { - $adjective_string .= " " . $tagged_phrase[$cur_node]['token']; + $adjective_string .= " " . $tagged_phrase[$cur_node]["token"]; $cur_node++; } if (!empty($adjective_string)) { $tree["JJ"] = $adjective_string; } - $tree['cur_node'] = $cur_node; + $tree["cur_node"] = $cur_node; return $tree; } /** @@ -592,24 +538,23 @@ class Tokenizer * "tag"=> part_of_speech_tag_for_term) * @return array used to represent a tree. The array has up to three fields * $tree["cur_node"] index of how far we parsed our$tagged_phrase - * $tree["NP"] contains a subtree for a noun phrase - * $tree["VP"] contains a subtree for a verb phrase + * $tree["NP"] contains a subtree for a subject phrase + * $tree["POST"] contains a subtree for a object phrase + * $tree["VP"] contains a subtree for a predicate phrase */ public static function generatePhraseParseTree($tagged_phrase) { $tree = []; - $tree_np = self::extractNounPhrase($tagged_phrase,['cur_node' => 0]); - $tree = ["cur_node" => $tree_np['cur_node']]; + $tree_np = self::extractNounPhrase($tagged_phrase,["cur_node" => 0]); + $tree = ["cur_node" => $tree_np["cur_node"]]; + $tree_pp = self::extractPostpositionPhrase($tagged_phrase, $tree); + $tree["cur_node"] = $tree_pp["cur_node"]; $tree_vp = self::extractVerbPhrase($tagged_phrase, $tree); - $tree['cur_node'] = $tree_vp['cur_node']; - if ($tree == $tree_vp) { - unset($tree_np['cur_node'], $tree_vp['cur_node']); - $tree['NP'] = $tree_np['NP']; - return $tree; - } - unset($tree_np['cur_node'], $tree_vp['cur_node']); - $tree['NP'] = $tree_np['NP']; - $tree['VP'] = $tree_vp['VP']; + $tree["cur_node"] = $tree_vp["cur_node"]; + unset($tree_np["cur_node"], $tree_pp["cur_node"], $tree_vp["cur_node"]); + $tree["NP"] = $tree_np["NP"]; + $tree["POST"] = $tree_pp["POST"]; + $tree["VP"] = $tree_vp["VP"]; return $tree; } /** @@ -628,26 +573,29 @@ class Tokenizer $triplets_list = []; $question_list = []; $question_answer_list = []; - $triplet_types = ['CONCISE', 'RAW']; + $triplet_types = ["CONCISE", "RAW"]; foreach ($word_and_phrase_list as $word_and_phrase => $position_list) { - $tagged_phrase = self::tagTokenizePartOfSpeech($word_and_phrase); + $sentence = $word_and_phrase; + $sentence = preg_replace("/\s+/u", " ", $word_and_phrase); + $sentence = trim($sentence); + $tagged_phrase = self::tagTokenizePartOfSpeech($sentence); $parse_tree = self::generatePhraseParseTree($tagged_phrase); $triplets = self::extractTripletsParseTree($parse_tree); $extracted_triplets = self::rearrangeTripletsByType($triplets); foreach ($triplet_types as $type) { if (!empty($extracted_triplets[$type])) { $triplets = $extracted_triplets[$type]; - $questions = $triplets['QUESTION_LIST']; + $questions = $triplets["QUESTION_LIST"]; foreach ($questions as $question) { $question_list[$question] = $position_list; } $question_answer_list = array_merge($question_answer_list, - $triplets['QUESTION_ANSWER_LIST']); + $triplets["QUESTION_ANSWER_LIST"]); } } } - $out_triplets['QUESTION_LIST'] = $question_list; - $out_triplets['QUESTION_ANSWER_LIST'] = $question_answer_list; + $out_triplets["QUESTION_LIST"] = $question_list; + $out_triplets["QUESTION_ANSWER_LIST"] = $question_answer_list; return $out_triplets; } /** @@ -681,19 +629,19 @@ class Tokenizer public static function extractSubjectParseTree($tree) { $subject = []; - if (!empty($tree['NP'])) { - $subject['CONCISE'] = self::extractDeepestSpeechPartPhrase( - $tree['NP'], "NN"); + if (!empty($tree["NP"])) { + $subject["CONCISE"] = self::extractDeepestSpeechPartPhrase( + $tree["NP"], "NN"); $raw_subject = ""; $it = new \RecursiveIteratorIterator( - new \RecursiveArrayIterator($tree['NP'])); + new \RecursiveArrayIterator($tree["NP"])); foreach ($it as $v) { $raw_subject .= $v . " "; } - $subject['RAW']= $raw_subject; + $subject["RAW"]= $raw_subject; } else { - $subject['CONCISE'] = ""; - $subject['RAW'] = ""; + $subject["CONCISE"] = ""; + $subject["RAW"] = ""; } return $subject; } @@ -709,23 +657,23 @@ class Tokenizer public static function extractPredicateParseTree($tree) { $predicate = []; - if (!empty($tree['VP'])) { - $tree_vp = $tree['VP']; - $predicate['CONCISE'] = self::extractDeepestSpeechPartPhrase( + if (!empty($tree["VP"])) { + $tree_vp = $tree["VP"]; + $predicate["CONCISE"] = self::extractDeepestSpeechPartPhrase( $tree_vp, "VB"); $raw_predicate = ""; - if (!empty($tree_vp['VB'])) { - $tree_vb = $tree_vp['VB']; + if (!empty($tree_vp["VB"])) { + $tree_vb = $tree_vp["VB"]; $it = new \RecursiveIteratorIterator( new \RecursiveArrayIterator($tree_vb)); foreach ($it as $v) { $raw_predicate .= $v . " "; } - $predicate['RAW'] = $raw_predicate; + $predicate["RAW"] = $raw_predicate; } } else { - $predicate['CONCISE'] = ""; - $predicate['RAW'] = ""; + $predicate["CONCISE"] = ""; + $predicate["RAW"] = ""; } return $predicate; } @@ -741,26 +689,25 @@ class Tokenizer public static function extractObjectParseTree($tree) { $object = []; - if (!empty($tree['VP'])) { - $tree_vp = $tree['VP']; - if (!empty($tree_vp['NP'])) { - $nb = $tree_vp['NP']; - $object['CONCISE'] = self::extractDeepestSpeechPartPhrase($nb, + if (!empty($tree["POST"])) { + $tree_pp = $tree["POST"]; + if (!empty($tree_pp["NP"])) { + $np = $tree_pp["NP"]; + $object["CONCISE"] = self::extractDeepestSpeechPartPhrase($np, "NN"); - $raw_object = ""; - $it = new \RecursiveIteratorIterator( - new \RecursiveArrayIterator($nb)); - foreach ($it as $v) { - $raw_object .= $v . " "; - } - $object['RAW'] = $raw_object; } else { - $object['CONCISE'] = ""; - $object['RAW'] = ""; + $object["CONCISE"] = ""; } + $raw_object = ""; + $it = new \RecursiveIteratorIterator( + new \RecursiveArrayIterator($tree_pp)); + foreach ($it as $v) { + $raw_object .= $v . " "; + } + $object["RAW"] = $raw_object; } else { - $object['CONCISE'] = ""; - $object['RAW'] = ""; + $object["CONCISE"] = ""; + $object["RAW"] = ""; } return $object; } @@ -771,15 +718,15 @@ class Tokenizer * original phrase and RAW to the case where extraneous words have been * removed * - * @param are $tree a parse tree for a sentence + * @param array $parse_tree a parse tree for a sentence * @return array triplet array */ public static function extractTripletsParseTree($parse_tree) { $triplets = []; - $triplets['subject'] = self::extractSubjectParseTree($parse_tree); - $triplets['object'] = self::extractObjectParseTree($parse_tree); - $triplets['predicate'] = self::extractPredicateParseTree($parse_tree); + $triplets["subject"] = self::extractSubjectParseTree($parse_tree); + $triplets["object"] = self::extractObjectParseTree($parse_tree); + $triplets["predicate"] = self::extractPredicateParseTree($parse_tree); return $triplets; } /** @@ -794,17 +741,17 @@ class Tokenizer public static function rearrangeTripletsByType($sub_pred_obj_triplets) { $processed_triplet = []; - $processed_triplets['CONCISE'] = - self::extractTripletByType($sub_pred_obj_triplets, 'CONCISE'); - $processed_triplets['RAW'] = - self::extractTripletByType($sub_pred_obj_triplets, 'RAW'); + $processed_triplets["CONCISE"] = + self::extractTripletByType($sub_pred_obj_triplets, "CONCISE"); + $processed_triplets["RAW"] = + self::extractTripletByType($sub_pred_obj_triplets, "RAW"); return $processed_triplets; } /** * Takes a triplets array with subject, predicate, object fields with - * CONCISE, RAW subfields and produces a triplits with $type subfield (where - * $type is one of CONCISE and RAW) and with subject, predicate, object, - * and QUESTION_ANSWER_LIST subfields + * CONCISE, RAW subfields and produces triplets with $type subfield + * where $type is one of CONCISE and RAW and with subject, predicate, + * object and QUESTION_ANSWER_LIST subfields * * @param array $sub_pred_obj_triplets in format described above * @param string $type either CONCISE or RAW @@ -813,27 +760,30 @@ class Tokenizer public static function extractTripletByType($sub_pred_obj_triplets, $type) { $triplets = []; - if (!empty($sub_pred_obj_triplets['subject'][$type]) - && !empty($sub_pred_obj_triplets['predicate'][$type]) - && !empty($sub_pred_obj_triplets['object'][$type])) { + if (!empty($sub_pred_obj_triplets["subject"][$type]) + && !empty($sub_pred_obj_triplets["predicate"][$type]) + && !empty($sub_pred_obj_triplets["object"][$type])) { $question_answer_triplets = []; $question_marker = self::$question_marker; - $sentence = [$sub_pred_obj_triplets['subject'][$type], - $sub_pred_obj_triplets['predicate'][$type], - $sub_pred_obj_triplets['object'][$type]]; + $sentence = [$sub_pred_obj_triplets["subject"][$type], + $sub_pred_obj_triplets["object"][$type], + $sub_pred_obj_triplets["predicate"][$type]]; $question_triplets = []; for ($j = 0; $j < 2; $j++) { for ($i = 0; $i < 3; $i++) { - $q_sentence = $sentence; - $q_sentence[$i] = $question_marker; - $q_sentence_string = implode(" ", $q_sentence); - $question_triplets[] = $q_sentence_string; - $question_answer_triplets[$q_sentence_string] = - preg_replace('/\s+/u', ' ',$sentence[$i]); + $question = $sentence; + $question[$i] = $question_marker; + $question_string = implode(" ", $question); + $question_string = trim($question_string); + $question_string = preg_replace("/\s+/u", " ", + $question_string); + $question_triplets[] = $question_string; + $question_answer_triplets[$question_string] = + preg_replace("/\s+/u", " ", $sentence[$i]); } } - $triplets['QUESTION_LIST'] = $question_triplets; - $triplets['QUESTION_ANSWER_LIST'] = $question_answer_triplets; + $triplets["QUESTION_LIST"] = $question_triplets; + $triplets["QUESTION_ANSWER_LIST"] = $question_answer_triplets; } return $triplets; } @@ -845,33 +795,26 @@ class Tokenizer * @param int $index current index in statement * @return array parsed triplet */ - public static function parseWhoQuestion($tagged_question, $index) + public static function parseQuestion($tagged_question, $index) { - $start_pos = 0; - if ($index == 0) - $start_pos = $index + 1; $generated_questions = []; - $question_marker = self::getQuestionMarker(); + $question_marker = trim(self::getQuestionMarker()); $triplets = []; - $tree_np = self::extractNounPhrase($tagged_question, ["cur_node" => - $start_pos]); - $triplets['subject'] = self::extractSubjectParseTree($tree_np); - $tree = ["cur_node" => $index]; - $tree['NP'] = $tagged_question[$index]['token']; - $tree_vp = self::extractVerbPhrase($tagged_question, $tree); - $triplets['predicate'] = self::extractPredicateParseTree($tree_vp); - $triplet_types = ['CONCISE', 'RAW']; + $tree_np = self::extractNounPhrase($tagged_question, + ["cur_node" => 0]); + $triplets["subject"] = self::extractSubjectParseTree($tree_np); + $tree_vp = self::extractVerbPhrase($tagged_question, + ["cur_node" => $index+1]); + $triplets["predicate"] = self::extractPredicateParseTree($tree_vp); + $triplet_types = ["CONCISE", "RAW"]; foreach ($triplet_types as $type) { - if (!empty($triplets['subject'][$type]) - && !empty($triplets['predicate'][$type])) { - $generated_questions[$type][] = - trim($triplets['subject'][$type]) . - " " . trim($triplets['predicate'][$type]) . " " . - $question_marker; - $generated_questions[$type][] = - trim($triplets['subject'][$type]) . + if (!empty($triplets["subject"][$type]) + && !empty($triplets["predicate"][$type])) { + $question = trim (trim($triplets["subject"][$type]) . " " . $question_marker . - " " . trim($triplets['predicate'][$type]); + " " . trim($triplets["predicate"][$type])); + $question = preg_replace("/\s+/u", " ", $question); + $generated_questions[$type][] = $question; } } return $generated_questions; @@ -885,10 +828,11 @@ class Tokenizer */ public function isQuestion($phrase) { - $who_question = "कौन"; $phrase = trim($phrase); - if (mb_strpos($phrase, $who_question) !== false) { - return true; + for ($i = 0; $i < count(self::$questions); $i++) { + if (mb_strpos($phrase, trim(self::$questions[$i])) !== false) { + return true; + } } return false; } @@ -902,27 +846,25 @@ class Tokenizer return self::$question_marker; } /** - * Takes WH questions and returns the triplet from the question + * Takes questions and returns the triplet from the question * * @param string $question question to parse * @return array question triplet */ public static function questionParser($question) { - /* - * Array of 'wh' questions: What, When, Where, Why, Who, Which, Whom, - * Whose - */ - $wh_questions = array( "क्या", "कब", "कहा", "क्यों", "कौन", "जिसे", - "जिसका", "कहाँ"); + $question = trim($question); + $question = preg_replace("/\s+/u", " ", $question); $tagged_question = self::tagTokenizePartOfSpeech($question); $index = -1; foreach ($tagged_question as $i => $term_pos) { - if (in_array($term_pos['token'], $wh_questions)) { + if (in_array($term_pos["token"], self::$questions)) { $index = $i; + $term_pos["tag"] = "p_wh"; + $tagged_question[$i] = $term_pos; break; } } - return self::parseWhoQuestion($tagged_question, $index); + return self::parseQuestion($tagged_question, $index); } } diff --git a/src/models/ProfileModel.php b/src/models/ProfileModel.php index 3b31e613..ac3f19fa 100755 --- a/src/models/ProfileModel.php +++ b/src/models/ProfileModel.php @@ -390,6 +390,10 @@ class ProfileModel extends Model ACCESS_COUNT INTEGER, PRIMARY KEY(ADDRESS, PAGE_NAME))", "VERSION" => "CREATE TABLE VERSION(ID INTEGER PRIMARY KEY)", + "LEXICON" => "CREATE TABLE LEXICON( + TERM VARCHAR(". C\LONG_NAME_LEN ."), + LOCALE VARCHAR(" . C\NAME_LEN . "), + PART_OF_SPEECH VARCHAR(16), PRIMARY KEY(TERM, LOCALE))", ]; } /** -- 2.15.0.windows.1