fork download
  1. <?php
  2. /**
  3.  * Pengakar: Indonesian stemmer
  4.  * (c) 2012–2015 Ivan Lanin <ivanlanin at gmail dot com>
  5.  *
  6.  * This program is free software: you can redistribute it and/or modify
  7.  * it under the terms of the GNU General Public License as published by
  8.  * the Free Software Foundation, either version 3 of the License, or
  9.  * (at your option) any later version.
  10.  *
  11.  * This program is distributed in the hope that it will be useful,
  12.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14.  * GNU General Public License for more details.
  15.  *
  16.  * You should have received a copy of the GNU General Public License
  17.  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18.  */
  19. namespace Kateglo;
  20.  
  21. /**
  22.  * Main class
  23.  */
  24. class Pengakar
  25. {
  26. private $dict;
  27. private $rules;
  28. private $options;
  29.  
  30. /**
  31.   * Konstruktor
  32.   */
  33. public function __construct()
  34. {
  35. // Read dictionary and create associative array
  36. $dict = file_get_contents('./data/kamus.txt');
  37. $tmp = explode("\n", $dict);
  38. foreach ($tmp as $entry) {
  39. $attrib = explode("\t", strtolower($entry)); // 0: lemma; 1: class
  40. $key = str_replace(' ', '', $attrib[0]); // remove space
  41. $this->dict[$key] = array('lemma' => $attrib[0], 'class' => $attrib[1]);
  42. }
  43. // Options
  44. $this->options = array(
  45. 'SORT_INSTANCE' => false, // sort by number of instances
  46. 'NO_NO_MATCH' => false, // hide no match entry
  47. 'NO_DIGIT_ONLY' => true, // hide digit only
  48. 'STRICT_CONFIX' => false, // use strict disallowed_confixes rules
  49. );
  50. // Define rules
  51. $VOWEL = 'a|i|u|e|o'; // vowels
  52. $CONSONANT = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|y|z'; // consonants
  53. $ANY = $VOWEL . '|' . $CONSONANT; // any characters
  54. $this->rules = array(
  55. 'affixes' => array(
  56. array(1, array('kah', 'lah', 'tah', 'pun')),
  57. array(1, array('mu', 'ku', 'nya')),
  58. array(0, array('ku', 'kau')),
  59. array(1, array('i', 'kan', 'an')),
  60. ),
  61. 'prefixes' => array(
  62. array(0, "(di|ke|se)({$ANY})(.+)", ""), // 0
  63. array(0, "(ber|ter)({$ANY})(.+)", ""), // 1, 6 normal
  64. array(0, "(be|te)(r)({$VOWEL})(.+)", ""), // 1, 6 be-rambut
  65. array(0, "(be|te)({$CONSONANT})({$ANY}?)(er)(.+)", ""), // 3, 7 te-bersit, te-percaya
  66. array(0, "(bel|pel)(ajar|unjur)", ""), // ajar, unjur
  67. array(0, "(me|pe)(l|m|n|r|w|y)(.+)", ""), // 10, 20: merawat, pemain
  68. array(0, "(mem|pem)(b|f|v)(.+)", ""), // 11 23: membuat, pembuat
  69. array(0, "(men|pen)(c|d|j|z)(.+)", ""), // 14 27: mencabut, pencabut
  70. array(0, "(meng|peng)(g|h|q|x)(.+)", ""), // 16 29: menggiring, penghasut
  71. array(0, "(meng|peng)({$VOWEL})(.+)", ""), // 17 30 meng-anjurkan, peng-anjur
  72. array(0, "(mem|pem)({$VOWEL})(.+)", "p"), // 13 26: memerkosa, pemerkosa
  73. array(0, "(men|pen)({$VOWEL})(.+)", "t"), // 15 28 menutup, penutup
  74. array(0, "(meng|peng)({$VOWEL})(.+)", "k"), // 17 30 mengalikan, pengali
  75. array(0, "(meny|peny)({$VOWEL})(.+)", "s"), // 18 31 menyucikan, penyucian
  76. array(0, "(mem)(p)({$CONSONANT})(.+)", ""), // memproklamasikan
  77. array(0, "(pem)({$CONSONANT})(.+)", "p"), // pemrogram
  78. array(0, "(men|pen)(t)({$CONSONANT})(.+)", ""), // mentransmisikan pentransmisian
  79. array(0, "(meng|peng)(k)({$CONSONANT})(.+)", ""), // mengkristalkan pengkristalan
  80. array(0, "(men|pen)(s)({$CONSONANT})(.+)", ""), // mensyaratkan pensyaratan
  81. array(0, "(menge|penge)({$CONSONANT})(.+)", ""), // swarabakti: mengepel
  82. array(0, "(mempe)(r)({$VOWEL})(.+)", ""), // 21
  83. array(0, "(memper)({$ANY})(.+)", ""), // 21
  84. array(0, "(pe)({$ANY})(.+)", ""), // 20
  85. array(0, "(per)({$ANY})(.+)", ""), // 21
  86. array(0, "(pel)({$CONSONANT})(.+)", ""), // 32 pelbagai, other?
  87. array(0, "(mem)(punya)", ""), // Exception: mempunya
  88. array(0, "(pen)(yair)", "s"), // Exception: penyair > syair
  89. ),
  90. 'disallowed_confixes' => array(
  91. array('ber-', '-i'),
  92. array('ke-', '-i'),
  93. array('pe-', '-kan'),
  94. array('di-', '-an'),
  95. array('meng-', '-an'),
  96. array('ter-', '-an'),
  97. array('ku-', '-an'),
  98. ),
  99. 'allomorphs' => array(
  100. 'be' => array('be-', 'ber-', 'bel-'),
  101. 'te' => array('te-', 'ter-', 'tel-'),
  102. 'pe' => array('pe-', 'per-', 'pel-', 'pen-', 'pem-', 'peng-', 'peny-', 'penge-'),
  103. 'me' => array('me-', 'men-', 'mem-', 'meng-', 'meny-', 'menge-'),
  104. ),
  105. );
  106.  
  107. }
  108.  
  109. /**
  110.   * Ambil konten
  111.   *
  112.   * @param string $url
  113.   *
  114.   * @return string
  115.   */
  116. public function getContent($url)
  117. {
  118. // Curl
  119. $agent = "Mozilla/5.0 (Windows; U; Windows NT 5.0; en; rv:1.9.0.4) Gecko/2009011913 Firefox/3.0.6";
  120. $domain = 'http://' . parse_url($url, PHP_URL_HOST);
  121. $curl = curl_init();
  122. curl_setopt($curl, CURLOPT_URL, $url);
  123. curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
  124. curl_setopt($curl, CURLOPT_REFERER, $domain); // pseudo referer
  125. curl_setopt($curl, CURLOPT_USERAGENT, $agent); // pseudo agent
  126. $html = curl_exec($curl);
  127. curl_close($curl);
  128. // Process HTML
  129. $ret = $html;
  130. $ret = preg_replace('/<(script|style)\b[^>]*>(.*?)<\/\1>/is', "", $ret); // remove script & style
  131. $ret = preg_replace('/<(br|p)[^>]*>/i', "\n", $ret); // new line for br & p
  132. $ret = trim(strip_tags($ret)); // strip tags
  133. $ret = preg_replace('/^\s*/m', '', $ret); // trim left
  134. $ret = preg_replace('/\s*$/m', '', $ret); // trim right
  135. $ret = preg_replace('/\n+/', "\n\n", $ret); // two new line: readability
  136.  
  137. return $ret;
  138. }
  139.  
  140. /**
  141.   * Ambil hasil API
  142.   *
  143.   * @param string $query
  144.   *
  145.   * @return string
  146.   */
  147. public function getApi($query)
  148. {
  149. $words = $this->stem($query);
  150. if ($query != '') {
  151. $ret = json_encode($words);
  152. } else {
  153. $ret = 'API Pengakar<br /><br />' .
  154. 'Sintaks:<br />' .
  155. '* <a href="./?api=1&q=pengakar">?api=1&q=...</a><br />' .
  156. '* <a href="./?api=1&url=http://i...content-available-to-author-only...n.org/pengakar/">?api=1&url=...</a><br /><br />' .
  157. 'Hasil:<br />' .
  158. 'lemma => { <br />' .
  159. '&nbsp;&nbsp;count,<br />' .
  160. '&nbsp;&nbsp;roots => { <br />' .
  161. '&nbsp;&nbsp;&nbsp;&nbsp;root => lemma, affixes {}, suffixes {}, prefixes {} <br />' .
  162. '&nbsp;&nbsp;} <br />' .
  163. '}' .
  164. '';
  165. }
  166.  
  167. return $ret;
  168. }
  169.  
  170. /**
  171.   * Ambil hasil dalam format HTML
  172.   *
  173.   * @param string $query
  174.   *
  175.   * @return string
  176.   */
  177. public function getHtml($query)
  178. {
  179. $words = $this->stem($query);
  180. $url_template = 'http://k...content-available-to-author-only...o.com/?mod=dict&action=view&phrase=%1$s';
  181.  
  182. // Render display
  183. $word_count = count($words);
  184. foreach ($words as $key => $word) {
  185. $roots = $word['roots'];
  186. $root_count = count($roots);
  187. //if ($root_count <= 1) continue; // display disambig only
  188. if ($word['count'] > 1) {
  189. $instances = ' <span class="instance">x' . $word['count'] . '</span>';
  190. } else {
  191. $instances = '';
  192. }
  193. if ($root_count == 0) { // no match
  194. $lost .= sprintf(
  195. '<li><span class="notfound">%s</span>%s</li>',
  196. $key,
  197. $instances
  198. );
  199. } else {
  200. $i = 0;
  201. unset($components);
  202. foreach ($roots as $lemma => $attrib) {
  203. $i++;
  204. $affixes = $attrib['affixes'];
  205. $url = sprintf($url_template, $attrib['lemma']);
  206. $lemma_url = sprintf('<a href="%s" target="kateglo">%s</a>', $url, $attrib['lemma']);
  207. $components .= $components ? '; ' : '';
  208. if ($key == $lemma && $root_count == 1) { // is baseword
  209. $components .= $lemma_url . $instances . $class;
  210. } else {
  211. // Multiroot
  212. if ($root_count > 1 && $i == 1) {
  213. $components .= $key . $instances . ': ';
  214. }
  215. if ($root_count > 1) {
  216. $components .= "({$i}) ";
  217. }
  218. // Prefix, lemma, & suffix
  219. if (is_array($attrib['prefixes'])) {
  220. $components .= implode('', $attrib['prefixes']);
  221. }
  222. $components .= $lemma_url;
  223. if (is_array($attrib['suffixes'])) {
  224. $components .= implode('', $attrib['suffixes']);
  225. }
  226. // Single root
  227. if ($root_count == 1) {
  228. $components .= $instances;
  229. }
  230. }
  231. }
  232. $found .= sprintf('<li>%s</li>', $components);
  233. }
  234. }
  235. // Render display
  236. if ($word_count >= 10) {
  237. $ret .= '<div style="-webkit-column-count: 3; -moz-column-count: 3;">';
  238. }
  239. $ret .= '<ol style="margin:0;">';
  240. $ret .= $lost;
  241. $ret .= $found;
  242. $ret .= '</ol>';
  243. if ($word_count >= 10) {
  244. $ret .= '</div>';
  245. }
  246.  
  247. return $ret;
  248. }
  249.  
  250. /**
  251.   * Tokenisasi
  252.   *
  253.   * @param string $query
  254.   *
  255.   * @return array
  256.   */
  257. private function stem($query)
  258. {
  259. $words = array();
  260. $raw = preg_split('/[^a-zA-Z0-9\-]/', $query, -1, PREG_SPLIT_NO_EMPTY);
  261. foreach ($raw as $r) {
  262. // Remove all digit "word" if necessary
  263. if ($this->options['NO_DIGIT_ONLY'] && preg_match('/^\d+$/', $r)) {
  264. continue;
  265. }
  266. $key = strtolower($r);
  267. $words[$key]['count']++;
  268. }
  269. foreach ($words as $key => $word) {
  270. $words[$key]['roots'] = $this->stemWord($key);
  271. // If NO_NO_MATCH, remove words that has no root
  272. if (count($words[$key]['roots']) == 0 && $this->options['NO_NO_MATCH']) {
  273. unset($words[$key]);
  274. continue;
  275. }
  276. $instances[$key] = $word['count'];
  277. }
  278. $word_count = count($words);
  279. if ($this->options['SORT_INSTANCE']) {
  280. $keys = array_keys($words);
  281. array_multisort($instances, SORT_DESC, $keys, SORT_ASC, $words);
  282. } else {
  283. ksort($words);
  284. }
  285.  
  286. return $words;
  287. }
  288.  
  289. /**
  290.   * Stem individual word
  291.   *
  292.   * @param string $word
  293.   *
  294.   * @return array
  295.   */
  296. private function stemWord($word)
  297. {
  298. // Preprocess: Create empty affix if original word is in lexicon
  299. $word = trim($word);
  300. $roots = array($word => '');
  301. if (array_key_exists($word, $this->dict)) {
  302. $roots[$word]['affixes'] = array();
  303. }
  304. // Has dash? Try to also find root for each element
  305. if (strpos($word, '-')) {
  306. $dash_parts = explode('-', $word);
  307. foreach ($dash_parts as $dash_part) {
  308. $roots[$dash_part]['affixes'] = array();
  309. }
  310. }
  311.  
  312. // Process: Find suffixes, pronoun prefix, and other prefix (3 times, Asian)
  313. foreach ($this->rules['affixes'] as $group) {
  314. $is_suffix = $group[0];
  315. $affixes = $group[1];
  316. foreach ($affixes as $affix) {
  317. $pattern = $is_suffix ? "(.+)({$affix})" : "({$affix})(.+)";
  318. $this->addRoot($roots, array($is_suffix, $pattern, ''));
  319. }
  320. }
  321. for ($i = 0; $i < 3; $i++) {
  322. foreach ($this->rules['prefixes'] as $rule) {
  323. $this->addRoot($roots, $rule);
  324. }
  325. }
  326.  
  327. // Postprocess 1: Select valid affixes
  328. foreach ($roots as $lemma => $attrib) {
  329. // Not in dictionary? Unset and exit
  330. if (!array_key_exists($lemma, $this->dict)) {
  331. unset($roots[$lemma]);
  332. continue;
  333. }
  334. // Escape if we don't have to check valid confix pairs
  335. if (!$this->options['STRICT_CONFIX']) {
  336. continue;
  337. }
  338. // Check confix pairs
  339. $affixes = $attrib['affixes'];
  340. foreach ($this->rules['disallowed_confixes'] as $pair) {
  341. $prefix = $pair[0];
  342. $suffix = $pair[1];
  343. $prefix_key = substr($prefix, 0, 2);
  344. if (array_key_exists($prefix_key, $this->rules['allomorphs'])) {
  345. foreach ($this->rules['allomorphs'][$prefix_key] as $allomorf) {
  346. if (in_array($allomorf, $affixes) && in_array($suffix, $affixes)) {
  347. unset($roots[$lemma]);
  348. }
  349. }
  350. } elseif (in_array($prefix, $affixes) && in_array($suffix, $affixes)) {
  351. unset($roots[$lemma]);
  352. }
  353. }
  354. }
  355.  
  356. // Postprocess 2: Handle suffixes and prefixes
  357. foreach ($roots as $lemma => $attrib) {
  358. $affixes = $attrib['affixes'];
  359. $attrib['lemma'] = $this->dict[$lemma]['lemma'];
  360. $attrib['class'] = $this->dict[$lemma]['class'];
  361. // Divide affixes into suffixes and prefixes
  362. foreach ($attrib['affixes'] as $affix) {
  363. $type = (substr($affix, 0, 1) == '-') ? 'suffixes' : 'prefixes';
  364. $attrib[$type][] = $affix;
  365. }
  366. // Reverse suffix order
  367. if (is_array($attrib['suffixes'])) {
  368. krsort($attrib['suffixes']);
  369. }
  370. $roots[$lemma] = $attrib;
  371. }
  372.  
  373. return $roots;
  374. }
  375.  
  376. /**
  377.   * Greedy algorithm: add every possible branch
  378.   *
  379.   * @param array $roots
  380.   * @param array $rule
  381.   *
  382.   * @return void
  383.   */
  384. private function addRoot(&$roots, $rule)
  385. {
  386. $is_suffix = $rule[0];
  387. $pattern = '/^' . $rule[1] . '$/i';
  388. $variant = $rule[2];
  389. foreach ($roots as $lemma => $attrib) {
  390. preg_match($pattern, $lemma, $matches);
  391. if (count($matches) > 0) {
  392. unset($new_lemma);
  393. unset($new_affix);
  394. $affix_index = $is_suffix ? 2 : 1;
  395.  
  396. // Lemma
  397. for ($i = 1; $i < count($matches); $i++) {
  398. if ($i != $affix_index) {
  399. $new_lemma .= $matches[$i];
  400. }
  401. }
  402. if ($variant) {
  403. $new_lemma = $variant . $new_lemma;
  404. }
  405.  
  406. // Affix, add - before (suffix), after (prefix)
  407. $new_affix .= $is_suffix ? '-' : '';
  408. $new_affix .= $matches[$affix_index];
  409. $new_affix .= $is_suffix ? '' : '-';
  410. $new_affix = array($new_affix); // make array
  411. if (is_array($attrib['affixes'])) { // merge
  412. $new_affix = array_merge($attrib['affixes'], $new_affix);
  413. }
  414.  
  415. // Push
  416. $roots[$new_lemma] = array('affixes' => $new_affix);
  417. }
  418. }
  419. }
  420. }
  421.  
Success #stdin #stdout 0.02s 52488KB
stdin
Standard input is empty
stdout
Standard output is empty