php做的簡單中文分詞代碼

2024-05-04 21:48:50

字體：大中小

供稿：網友

對中文搜索引擎來說,中文分詞是整個系統最基礎的部分之一,因為目前基於單字的中文搜索算法並不是太好.當然,本文不是要對中文搜索引擎做研究,而是分享如何用 PHP 做一個站內搜索引擎,本文是這個系統中的一篇.

進行中文分詞的 PHP 類就在下面了,用 proc_open() 函數來執行分詞程序,并通過管道和其交互,輸入要進行分詞的文本,讀取分詞結果.

<?php

/**
 * Thin wrapper around the external `ictclas` Chinese word-segmentation program.
 *
 * Text is converted from UTF-8 to GBK, piped to the program's stdin, and the
 * tagged result is read back from its stdout and converted to UTF-8 again.
 *
 * All methods are static; calling them on an instance also works.
 */
class NLP
{
    /** Directory containing the `ictclas` executable, stored without a trailing '/'. */
    private static $cmd_path;

    /**
     * Set the directory in which the `ictclas` executable lives.
     *
     * @param string $path Directory path. A trailing '/' is tolerated and stripped.
     * @return void
     */
    public static function set_cmd_path($path)
    {
        self::$cmd_path = rtrim($path, '/');
    }

    /**
     * Run `ictclas` on the given text and return its raw output.
     *
     * NOTE: the whole input is written before any output is read, so a very
     * large input combined with a child that streams its output could still
     * fill the pipe buffers. This is fine for sentence/paragraph sized input.
     *
     * @param string $str UTF-8 text to segment.
     * @return string Trimmed UTF-8 output of the program, or '' when the
     *                encoding conversion or the process launch fails.
     */
    private static function cmd($str)
    {
        $input = iconv('utf-8', 'gbk', (string) $str);
        if ($input === false) {
            return '';
        }

        $descriptorspec = array(
            0 => array('pipe', 'r'), // child's stdin: we write to it
            1 => array('pipe', 'w'), // child's stdout: we read from it
        );

        $process = proc_open(self::$cmd_path . '/ictclas', $descriptorspec, $pipes);
        if (!is_resource($process)) {
            return '';
        }

        fwrite($pipes[0], $input);
        // Close stdin BEFORE reading: the child only sees EOF (and finishes)
        // once its input pipe is closed; reading first would block forever.
        fclose($pipes[0]);

        $raw = stream_get_contents($pipes[1]);
        fclose($pipes[1]);
        proc_close($process);

        if ($raw === false) {
            return '';
        }

        $output = iconv('gbk', 'utf-8', trim($raw));

        return $output === false ? '' : $output;
    }

    /**
     * Segment a text into words.
     *
     * The program output is expected to be whitespace-separated `word/tag`
     * items. Each item is split on its LAST '/', so a word that itself
     * contains '/' keeps it; an item without any '/' gets an empty tag.
     *
     * @param string $str UTF-8 text to segment.
     * @return array List of array('seg' => word, 'tag' => tag) items; empty
     *               when the program produced no output.
     */
    public static function tokenize($str)
    {
        $tokens = array();

        $output = self::cmd($str);
        if ($output === '') {
            return $tokens;
        }

        // Explicit ASCII whitespace class: byte-safe for UTF-8 regardless of
        // the current locale (a locale-aware \s may match bytes such as 0xA0).
        $parts = preg_split('/[ \t\n\x0B\f\r]+/', $output, -1, PREG_SPLIT_NO_EMPTY);
        if ($parts === false) {
            return $tokens;
        }

        foreach ($parts as $part) {
            $slash = strrpos($part, '/');
            if ($slash === false) {
                $seg = $part;
                $tag = '';
            } else {
                // (string) cast: substr() returns false instead of '' on older PHP.
                $seg = (string) substr($part, 0, $slash);
                $tag = (string) substr($part, $slash + 1);
            }

            $tokens[] = array(
                'seg' => $seg,
                'tag' => $tag,
            );
        }

        return $tokens;
    }
}

// By default, look for the `ictclas` executable in the directory that holds this file.
NLP::set_cmd_path(__DIR__);

?>