. * * @author Chris Pollett chris@pollett.org * @license http://www.gnu.org/licenses/ GPL3 * @link http://www.seekquarry.com/ * @copyright 2009 - 2015 * @filesource */ if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();} /** * This class has a collection of methods for Dutch locale specific * tokenization. In particular, it has a stemmer, . * * @author Charles Bocage * @package seek_quarry\locale\nl */ class NlTokenizer { /** * Words we don't want to be stemmed * @var array */ static $no_stem_list = array("abs", "ahs", "aken", "àlle", "als", "are", "allèen", "ate", "aten", "azen", "bse", "cfce", "curaçao", "dègelijk", "dme", "ede", "eden", "eds", "ehs", "ems", "ene", "epe", "eps", "ers", "eten", "ets", "even", "fme", "gedaçht", "ghe", "gve", "hdpe", "hôte", "hpe", "hse", "ibs", "ics", "ile", "ims", "jònge", "kwe", "ldpe", "lldpe", "lme", "lze", "maitres", "mwe", "nme", "ode", "ogen", "oke", "ole", "ons", "ònze", "open", "ops", "oren", "ors", "oss", "oven", "ows", "pre", "pve", "rhône", "ròme", "rwe", "ske", "sme", "spe", "ste", "the", "tje", "uce", "uden", "uien", "uren", "use", "uwe", "vse", "ype"); /** * Stub function which could be used for a word segmenter. 
* Such a segmenter on input thisisabunchofwords would output
     * this is a bunch of words
     *
     * @param string $pre_segment before segmentation
     * @return string should return string with words separated by space
     *      in this case does nothing
     */
    static function segment($pre_segment)
    {
        return $pre_segment;
    }
    /**
     * Boolean that tells the code if the e suffix was removed in step2 or
     * not. stem() resets it to false at the start of every call.
     * @var bool
     */
    static $removedESuffix = false;
    /**
     * Computes the stem of a Dutch word
     *
     * For example, lichamelijk, lichamelijke, lichamelijkheden and lichamen,
     * all have licham as a stem
     *
     * The word is trimmed, lower-cased and stripped of umlauts and acute
     * accents before anything else happens. If the normalised word is
     * listed in self::$no_stem_list it is returned in that normalised form
     * without being stemmed.
     *
     * @param string $word the string to stem
     * @return string the stem of $word; an empty $word is returned unchanged
     */
    static function stem($word)
    {
        self::$removedESuffix = false;
        if (empty($word)) {
            return $word;
        }
        // Lower-case before stripping accents: the accent table only holds
        // lower-case letters, so "É" has to become "é" first.
        $result = trim(mb_strtolower($word));
        $result = self::removeAllUmlautAndAcuteAccents($result);
        if (in_array($result, self::$no_stem_list, true)) {
            return $result;
        }
        $result = self::substituteIAndY($result);
        $R1 = self::getRIndex($result, 1);
        $R2 = self::getRIndex($result, $R1);
        $result = self::step1($result, $R1);
        $result = self::step2($result);
        $result = self::step3a($result, $R2);
        $result = self::step3b($result, $R2);
        $result = self::step4($result);
        // substituteIAndY() put some i's and y's into upper case; undo that.
        return mb_strtolower($result);
    }
    /**
     * Remove all umlaut and acute accents that need to be removed.
     *
     * Maps é ë -> e, á ä -> a, ó ö -> o, ï í -> i and ü ú -> u. Only
     * lower-case letters are handled, and grave accents (such as è) are
     * left untouched.
     *
     * @param string $word the string to remove the umlauts and accents from
     * @return string the string with the umlauts and accents removed
     */
    private static function removeAllUmlautAndAcuteAccents($word)
    {
        $accented = array("é", "ë", "á", "ä", "ó", "ö", "ï", "í", "ü", "ú");
        $plain = array("e", "e", "a", "a", "o", "o", "i", "i", "u", "u");
        return str_replace($accented, $plain, $word);
    }
    /**
     * Put initial y, y after a vowel, and i between vowels into upper case.
     *
     * @param string $word the string to put initial y, y after a vowel, and
     *      i between vowels into upper case.
* @return string the string with an initial y, y after a vowel, and i
     *      between vowels into upper case.
     */
    private static function substituteIAndY($word)
    {
        $letters = preg_split('/(?!^)(?=.)/u', $word);
        $lastPos = count($letters) - 1;
        foreach ($letters as $pos => $letter) {
            if ($letter === 'y') {
                // Neighbours are read from $letters, not from the loop
                // copy, so a letter already upper-cased no longer counts
                // as a vowel.
                if ($pos === 0 || self::isVowel($letters[$pos - 1])) {
                    $letters[$pos] = 'Y';
                }
                continue;
            }
            // An i is only marked when it has a vowel on both sides, so it
            // can be neither the first nor the last letter.
            if ($letter === 'i' && $pos > 0 && $pos < $lastPos
                && self::isVowel($letters[$pos - 1])
                && self::isVowel($letters[$pos + 1])) {
                $letters[$pos] = 'I';
            }
        }
        return implode('', $letters);
    }
    /**
     * Check that the letter is a vowel
     *
     * The vowels are the lower-case letters a, e, i, o, u, y and è; the
     * test is case-sensitive, so an upper-case I or Y is not a vowel.
     *
     * @param string $letter the character to check
     * @return boolean true if it is a vowel, otherwise false
     */
    private static function isVowel($letter)
    {
        return in_array($letter, array('a', 'e', 'i', 'o', 'u', 'y', 'è'),
            true);
    }
    /**
     * Get the R index. The R index is the position just past the first
     * consonant that follows a vowel, searching from the $start index
     *
     * @param string $word the string to search for the R index
     * @param int $start the index to start searching for the R index in the
     *      string
     * @return int the R index if found, otherwise -1
     */
    private static function getRIndex($word, $start)
    {
        $result = -1;
        $wordSplit = preg_split('/(?!^)(?=.)/u', $word);
        for ($i = $start <= 0 ?
1 : $start; $i < count($wordSplit); $i++) { if (!self::isVowel($wordSplit[$i]) && self::isVowel($wordSplit[$i - 1])) { $result = ($i + 1); break; } } return $result; } /** * Define a valid en-ending as a non-vowel, and not gem and remove it * * @param string $word the string to stem * @param int $R1 the int that represents the R index * @return string the string with the valid en-ending as a non-vowel, and * not gem ending removed */ private static function step1($word, $R1) { $result = $word; if ($R1 > -1) { $wordLength = strlen($word); if ($wordLength > 2 && $R1 < $wordLength) { if (self::endsWith($word, "heden")) { $result = self::replace($word, '/heden$/', 'heid', $R1); } else { if (preg_match("/(? 2 && !self::isVowel($wordSplit[$wordSplitLength - 2])) { $wordSplit = array_slice($wordSplit, 0, ($wordSplitLength - 1)); $result = implode('', $wordSplit); $result = self::undouble($result); self::$removedESuffix = true; } } return $result; } /** * Delete the letters heid if in R2 and not preceded by a c, and treat an a * preceding en like in step 1 * * @param string $word the string to delete the letters heid if in R2 and * not preceded by a c, and treat an a preceding en like in step 1 * @param int $R2 the R index * @return string the string with the letters heid if in R2 and not * preceded by a c deleted, and treated an a preceding en like in step 1 */ private static function step3a($word, $R2) { $result = $word; if ($R2 > -1) { if (preg_match("/(? 2) { if ($R2 > -1) { if (preg_match('/(end|ing)$/', $word, $matches, 0, $R2)) { if (preg_match('/eig(end|ing)$/', $word, $matches, 0, $R2)) { $result = self::replace($word, '/(end|ing)$/', '', $R2); } elseif (preg_match('/ig(end|ing)$/', $word, $matches, 0, $R2)) { $result = self::replace($word, '/(igend|iging)$/', '', $R2); $result = self::undouble($result); } else { $result = self::replace($word, '/(end|ing)$/', '', $R2); $result = self::undouble($result); } } elseif (preg_match("/(? mon, weed -> wed). 
* @param string $word the string to check for the CVD combination
     * @return string the string with the CVD combination removed otherwise
     *      the original string
     */
    private static function step4($word)
    {
        $letters = preg_split('/(?!^)(?=.)/u', $word);
        $numLetters = count($letters);
        if ($numLetters <= 3) {
            return $word;
        }
        list($c, $v1, $v2, $d) = array_slice($letters, -4);
        // The word must end non-vowel, doubled vowel, non-vowel; a doubled
        // i and a final upper-case I are excluded.
        $endsInDoubledVowel = !self::isVowel($c) && self::isVowel($v1)
            && self::isVowel($v2) && !self::isVowel($d) && $v1 === $v2
            && $d !== 'I' && $v1 !== 'i';
        if (!$endsInDoubledVowel) {
            return $word;
        }
        // Drop the second of the two identical vowels.
        unset($letters[$numLetters - 2]);
        return implode('', $letters);
    }
    /**
     * Replace a string based on a regex expression
     *
     * When $offset is greater than zero, only the part of $word from
     * $offset onwards is passed to the regex; the part before $offset is
     * kept as is and the two are joined again. $offset is counted in UTF-8
     * characters, not bytes, so that it lines up with the character
     * positions used elsewhere in this class for multibyte letters.
     *
     * @param string $word the string to search for regex replacement
     * @param string $regex the regex to use to find and replacement
     * @param string $replace the string to replace if the pattern is matched
     * @param int $offset the character position to start to look for the
     *      regex replacement; zero or less means the whole word is searched
     * @return string the string with the characters replaced if the regex
     *      matches, otherwise the original string
     */
    private static function replace($word, $regex, $replace, $offset)
    {
        if ($offset <= 0) {
            return preg_replace($regex, $replace, $word);
        }
        $head = mb_substr($word, 0, $offset, 'UTF-8');
        $tail = mb_substr($word, $offset, null, 'UTF-8');
        return $head . preg_replace($regex, $replace, $tail);
    }
    /**
     * Checks to see if a string ends with a certain string
     *
     * @param string $haystack the string to check
     * @param string $needle the string to match at the end
     * @param boolean $case true (the default) for a case-sensitive
     *      comparison, false to ignore case
     * @return boolean true if it ends with $needle, otherwise false
     */
    private static function endsWith($haystack, $needle, $case = true)
    {
        $tail = substr($haystack, strlen($haystack) - strlen($needle));
        if ($case) {
            return strcmp($tail, $needle) === 0;
        }
        return strcasecmp($tail, $needle) === 0;
    }
    /**
     * Undoubles the end of a string. If the string ends in kk, tt, dd remove
     * one of the characters
     *
     * @param string $word the string to undouble
     * @return string the undoubled string, otherwise the original string
     */
    private static function undouble($word)
    {
        foreach (array("kk", "tt", "dd") as $doubled) {
            if (self::endsWith($word, $doubled)) {
                return substr($word, 0, strlen($word) - 1);
            }
        }
        return $word;
    }
}
?>