tweet:2009:0207_04
DokuWiki 日本語強化
参照URL: http://www.higuchi.com/dokuwiki/dokuwiki:localize
- MailのEncodeをISO-2022-JPにする
- 今回、これは導入しなかった。だって、ja=ISO-2022-JPって言えないし。だからってISO-2022-JPに固定するのもどうよ?
- 検索の日本語化
- pkgsrcからMeCabを入れる
- /etc/mk.confにMECAB_CHARSET= utf-8 を忘れずに設定すること
- inc/indexer.phpに以下の変更を加える
$ diff -c ./indexer.php.orig ./indexer.php
*** ./indexer.php.orig Sat Feb 7 21:48:34 2009
--- ./indexer.php Sat Feb 7 22:02:14 2009
***************
*** 6,11 ****
--- 6,15 ----
   * @author Andreas Gohr <andi@splitbrain.org>
   */

+ /* for Japanese index search with Mecab */
+ define ('PRE_TOKENIZER', '/usr/pkg/bin/mecab -O wakati');
+ /* ------------------------------------ */
+
  if(!defined('DOKU_INC')) die('meh.');
  require_once(DOKU_INC.'inc/io.php');
  require_once(DOKU_INC.'inc/utf8.php');
***************
*** 55,62 ****
--- 59,70 ----
      $l = strlen($w);
      // If left alone, all chinese "words" will get put into w3.idx
      // So the "length" of a "word" is faked
+ /* for Japanese index search with Mecab */
+ /*
      if(preg_match('/'.IDX_ASIAN2.'/u',$w))
          $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF
+ */
+ /* ------------------------------------ */
      return $l;
  }

***************
*** 220,225 ****
--- 228,257 ----

      list($page,$body) = $data;

+ /* for Japanese index search with Mecab */
+     if(function_exists(proc_open) && defined('PRE_TOKENIZER')) {
+         $dspec = array(
+             0 => array("pipe", "r"),
+             1 => array("pipe", "w"),
+             2 => array("file", "/dev/null", "w")
+         );
+         $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+         if(is_resource($process)) {
+             stream_set_blocking($pipes[0], FALSE);
+             stream_set_blocking($pipes[1], FALSE);
+             fwrite($pipes[0], $body . "\n");
+             fclose($pipes[0]);
+
+             $body = '';
+             while(!feof($pipes[1])) {
+                 $body .= fgets($pipes[1], 32768);
+             }
+             fclose($pipes[1]);
+             proc_close($process);
+         }
+     }
+ /* ------------------------------------ */
+
      $body = strtr($body, "\r\n\t", ' ');
      $tokens = explode(' ', $body);
      $tokens = array_count_values($tokens); // count the frequency of each token
***************
*** 489,495 ****
--- 521,532 ----
                  $wild |= 2;
                  $wlen -= 1;
              }
+ /* for Japanese index search with Mecab */
+ /*
              if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+ */
+             if (preg_match('/[^0-9A-Za-z]/u', $string) && $wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+ /* ------------------------------------ */
              if(!isset($tokens[$xword])){
                  $tokenlength[$wlen][] = $xword;
              }
***************
*** 628,639 ****
--- 665,701 ----
   */
  function idx_tokenizer($string,&$stopwords,$wc=false){
      $words = array();
+ /* for Japanese index search with Mecab */
+     if(function_exists(proc_open) && defined('PRE_TOKENIZER')) {
+         $dspec = array(
+             0 => array("pipe", "r"),
+             1 => array("pipe", "w"),
+             2 => array("file", "/dev/null", "w")
+         );
+         $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+         if(is_resource($process)) {
+             stream_set_blocking($pipes[0], FALSE);
+             stream_set_blocking($pipes[1], FALSE);
+             fwrite($pipes[0], $string . "\n");
+             fclose($pipes[0]);
+             $string = '';
+             while(!feof($pipes[1])) {
+                 $string .= fgets($pipes[1], 32768);
+             }
+             fclose($pipes[1]);
+             proc_close($process);
+         }
+     }
+ /* ------------------------------------ */
      $wc = ($wc) ? '' : $wc = '\*';

      if(preg_match('/[^0-9A-Za-z]/u', $string)){
+ /* for Japanese index search with Mecab */
+ /*
          // handle asian chars as single words (may fail on older PHP version)
          $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
          if(!is_null($asia)) $string = $asia; //recover from regexp failure
+ */

          $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
          foreach ($arr as $w) {
こんなんでどうだろう…。
tweet/2009/0207_04.txt · 最終更新: 2009/02/07 04:00 by 127.0.0.1