tweet:2009:0207

DokuWiki 日本語強化

参照URL: http://www.higuchi.com/dokuwiki/dokuwiki:localize

MailのEncodeをISO-2022-jpにする
- 今回はこれを導入しなかった。言語設定が ja だからといってメールのエンコードが ISO-2022-JP だとは言い切れないし、かといって ISO-2022-JP に固定してしまうのも適切とは思えないため。

検索の日本語化

pkgsrc から MeCab を入れる
- /etc/mk.confにMECAB_CHARSET= utf-8 を忘れずに設定すること

inc/indexer.phpに以下の変更を加える

$ diff -c ./indexer.php.orig ./indexer.php
*** ./indexer.php.orig  Sat Feb  7 21:48:34 2009
--- ./indexer.php       Sat Feb  7 22:02:14 2009
***************
*** 6,11 ****
--- 6,15 ----
   * @author     Andreas Gohr <andi@splitbrain.org>
   */
  
+ /* for Japanese index search with Mecab */
+ define ('PRE_TOKENIZER', '/usr/pkg/bin/mecab -O wakati');
+ /* ------------------------------------ */
+ 
  if(!defined('DOKU_INC')) die('meh.');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
***************
*** 55,62 ****
--- 59,70 ----
      $l = strlen($w);
      // If left alone, all chinese "words" will get put into w3.idx
      // So the "length" of a "word" is faked
+ /* for Japanese index search with Mecab */
+ /*
      if(preg_match('/'.IDX_ASIAN2.'/u',$w))
          $l += ord($w) - 0xE1;  // Lead bytes from 0xE2-0xEF
+ */
+ /* ------------------------------------ */
      return $l;
  }
  
***************
*** 220,225 ****
--- 228,257 ----
  
      list($page,$body) = $data;
  
+ /* for Japanese index search with Mecab */
+     if(function_exists(proc_open) && defined('PRE_TOKENIZER')) {
+         $dspec = array(
+             0 => array("pipe", "r"),
+             1 => array("pipe", "w"),
+             2 => array("file", "/dev/null", "w")
+         );
+         $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+         if(is_resource($process)) {
+             stream_set_blocking($pipes[0], FALSE);
+             stream_set_blocking($pipes[1], FALSE);
+             fwrite($pipes[0], $body . "\n");
+             fclose($pipes[0]);
+  
+             $body = '';
+             while(!feof($pipes[1])) {
+                 $body .= fgets($pipes[1], 32768);
+             }
+             fclose($pipes[1]);
+             proc_close($process);
+         }
+     }
+ /* ------------------------------------ */
+ 
      $body   = strtr($body, "\r\n\t", '   ');
      $tokens = explode(' ', $body);
      $tokens = array_count_values($tokens);   // count the frequency of each token
***************
*** 489,495 ****
--- 521,532 ----
              $wild |= 2;
              $wlen -= 1;
          }
+ /* for Japanese index search with Mecab */
+ /*
          if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+ */
+         if (preg_match('/[^0-9A-Za-z]/u', $string) && $wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+ /* ------------------------------------ */
          if(!isset($tokens[$xword])){
              $tokenlength[$wlen][] = $xword;
          }
***************
*** 628,639 ****
--- 665,701 ----
   */
  function idx_tokenizer($string,&$stopwords,$wc=false){
      $words = array();
+ /* for Japanese index search with Mecab */
+     if(function_exists(proc_open) && defined('PRE_TOKENIZER')) {
+         $dspec = array(
+             0 => array("pipe", "r"),
+             1 => array("pipe", "w"),
+             2 => array("file", "/dev/null", "w")
+         );
+         $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+         if(is_resource($process)) {
+             stream_set_blocking($pipes[0], FALSE);
+             stream_set_blocking($pipes[1], FALSE);
+             fwrite($pipes[0], $string . "\n");
+             fclose($pipes[0]);
+             $string = '';
+             while(!feof($pipes[1])) {
+                 $string .= fgets($pipes[1], 32768);
+             }
+             fclose($pipes[1]);
+             proc_close($process);
+         }
+     }
+ /* ------------------------------------ */
      $wc = ($wc) ? '' : $wc = '\*';
  
      if(preg_match('/[^0-9A-Za-z]/u', $string)){
+ /* for Japanese index search with Mecab */
+ /*
          // handle asian chars as single words (may fail on older PHP version)
          $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
          if(!is_null($asia)) $string = $asia; //recover from regexp failure
+ */
  
          $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
          foreach ($arr as $w) {

こんなんでどうだろう…。

目次

DokuWiki 日本語強化