zoukankan      html  css  js  c++  java
  • 纯php分词封装的类

      分享一个纯php分词封装的类

    <?php
    /*
     * 本插件非成品插件,只是封装的一个底层类,可用于各种需要分词的,同义词替换的场合
     */
    
    /**
     * Character-level trie used for dictionary matching, keyword segmentation,
     * word filtering and synonym replacement.
     *
     * Node layout: every node is an array keyed by single UTF-8 characters. The
     * reserved key 'end' marks that the path from the root to that node is a
     * complete entry:
     *  - in the word dictionary ($dict) 'end' holds how many times the word was added;
     *  - in the synonym dictionary ($ty_dict) 'end' holds the replacement text.
     *
     * Dictionary files are read from DT_ROOT.'/api/dtapicom/trie/<name>.txt' and,
     * when a cache object is available in the global $dc, the built tries are
     * cached through its get()/set() methods.
     */
    class trie
    {
        protected $dict;
        protected $dictFile;
        protected $specSymbol; // common specification symbols
        protected $ty_dict;

        /**
         * Creates an empty trie. The dictionary file is chosen later by split_kw().
         */
        public function __construct()
        {
            $this->dict = [];
            $this->ty_dict = [];
            $this->specSymbol = "*|M|m|φ|Φ|st|ST";
        }

        /**
         * Loads the word dictionary for the current dictionary file.
         *
         * The trie is always rebuilt from scratch, so calling this twice never
         * double-counts entries.
         *
         * @param bool $cache when true, try the cache first and store the built trie afterwards
         *
         * @throws InvalidArgumentException when the dictionary file does not exist
         * @throws RuntimeException         when the dictionary file cannot be opened
         */
        public function loadData($cache = true)
        {
            $cacheKey = __CLASS__ . "_" . md5((string) $this->dictFile);

            if ($cache) {
                // Read into a local first: assigning a cache miss (false) to
                // $this->dict would leave the trie in a non-array state.
                $cached = $this->cacheGet($cacheKey);
                if ($cached !== null) {
                    $this->dict = $cached;
                    return;
                }
            }

            $this->dict = [];
            $this->loadDataFromFile();

            if ($cache) {
                $this->cacheSet($cacheKey, $this->dict);
            }
        }

        /**
         * Reads the dictionary file (one entry per line) and adds every entry to
         * the word trie. Blank lines are ignored.
         *
         * @throws InvalidArgumentException when the dictionary file does not exist
         * @throws RuntimeException         when the dictionary file cannot be opened
         */
        public function loadDataFromFile()
        {
            $handle = $this->openDictFile($this->dictFile);

            while (($line = fgets($handle)) !== false) {
                // Trim before testing for emptiness: a bare "\n" is not empty()
                // and would otherwise mark the root node as a complete word.
                $line = trim($line);
                if ($line === '') {
                    continue;
                }
                $this->addWords($line);
            }

            fclose($handle);
        }

        /**
         * Splits text into an array of UTF-8 characters.
         *
         * @param string $str
         *
         * @return string[] the characters; an empty array for empty or invalid UTF-8 input
         */
        protected function splitStr($str)
        {
            $chars = preg_split("//u", (string) $str, -1, PREG_SPLIT_NO_EMPTY);

            // preg_split() returns false when the input is not valid UTF-8.
            return $chars === false ? [] : $chars;
        }

        /**
         * Adds one entry to the word trie.
         *
         * @param string $words the entry to add; an empty string is ignored
         */
        protected function addWords($words)
        {
            $wordArr = $this->splitStr($words);
            if (empty($wordArr)) {
                return;
            }
            if (!is_array($this->dict)) {
                $this->dict = [];
            }

            $curNode = &$this->dict;
            foreach ($wordArr as $char) {
                if (!isset($curNode[$char]) || !is_array($curNode[$char])) {
                    $curNode[$char] = [];
                }
                $curNode = &$curNode[$char];
            }

            // Mark the path to this node as a complete entry and count the insertion.
            $curNode['end'] = isset($curNode['end']) ? $curNode['end'] + 1 : 1;
            unset($curNode);
        }

        /**
         * Replaces every dictionary word found in the text.
         *
         * Starting at each character that begins a dictionary path, the trie is
         * followed as far as possible and the longest complete word on that path
         * is replaced character by character.
         *
         * @param string $str          original text
         * @param string $replace      replacement used for each matched character
         * @param int    $skipDistance strictness: how many non-matching characters
         *                             may be skipped while following one path
         *
         * @return string the filtered text (returned unchanged if it is not valid UTF-8)
         */
        public function filter($str, $replace = '*', $skipDistance = 0)
        {
            $maxDistance = max($skipDistance, 0) + 1;
            $strArr = $this->splitStr($str);
            if (empty($strArr) || !is_array($this->dict)) {
                return (string) $str;
            }

            $length = count($strArr);
            for ($i = 0; $i < $length; $i++) {
                $char = $strArr[$i];
                if (!isset($this->dict[$char])) {
                    continue;
                }

                $curNode = $this->dict[$char];
                $dist = 0;
                $matchIndex = [$i];
                // Number of leading entries of $matchIndex covered by the longest
                // complete word seen so far (0 = none yet).
                $confirmed = isset($curNode['end']) ? 1 : 0;

                for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                    $next = $strArr[$j];
                    if (!isset($curNode[$next])) {
                        $dist++;
                        continue;
                    }

                    $matchIndex[] = $j;
                    $curNode = $curNode[$next];
                    if (isset($curNode['end'])) {
                        $confirmed = count($matchIndex);
                    }
                }

                if ($confirmed > 0) {
                    for ($m = 0; $m < $confirmed; $m++) {
                        $strArr[$matchIndex[$m]] = $replace;
                    }
                    // Resume scanning after the last replaced character.
                    $i = $matchIndex[$confirmed - 1];
                }
            }

            return implode('', $strArr);
        }

        /**
         * Looks up a whole word in the word trie.
         *
         * @param string|string[] $strArr the word, or its characters
         *
         * @return bool|mixed the node's 'end' marker when the word is an entry, false otherwise
         */
        public function isMatch($strArr)
        {
            $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
            $curNode = $this->dict;
            foreach ($chars as $char) {
                if (!is_array($curNode) || !isset($curNode[$char])) {
                    return false;
                }
                $curNode = $curNode[$char];
            }

            return (is_array($curNode) && isset($curNode['end'])) ? $curNode['end'] : false;
        }

        /**
         * Tells whether a word exists in the currently loaded word dictionary.
         *
         * @param string $word
         * @param string $filename not used; kept for backward compatibility
         *
         * @return bool|mixed same result as isMatch()
         */
        public function isType($word,$filename='word'){
            return $this->isMatch($word);
        }


        /**
         * Segments a keyword string sent by the front end and returns the words found.
         *
         * Steps:
         *  1. split on whitespace, ASCII commas and full-width commas;
         *  2. keep the tokens that are dictionary entries as they are;
         *  3. segment the remaining tokens with split_word() and strip what was found;
         *  4. when $ty_file is given, run what is left through synonym replacement.
         * Leftovers are dropped when no synonym file is given.
         *
         * @param string $kw       keyword string from the front end
         * @param string $filename word dictionary file name (without extension)
         * @param string $ty_file  synonym dictionary file name (without extension); '' disables step 4
         *
         * @return string[] the words found
         *
         * @throws InvalidArgumentException when a dictionary file does not exist
         * @throws RuntimeException         when a dictionary file cannot be opened
         */
        public function split_kw($kw,$filename='word',$ty_file=''){
            $this->dictFile = DT_ROOT.'/api/dtapicom/trie/'.$filename.'.txt';
            $this->loadData();

            // Step 1. The /u modifier makes the full-width comma one character
            // instead of three raw bytes that could cut other characters apart.
            $parts = preg_split('/[\s,,]+/u', (string) $kw, -1, PREG_SPLIT_NO_EMPTY);
            $data = ($parts === false) ? [$kw] : $parts;

            $word = []; // words matched so far

            // Step 2: whole-token dictionary matches.
            foreach ($data as $k=>$v){
                if($this->isMatch($v)){
                    $word[] = $v;
                    unset($data[$k]);
                }
            }

            // Step 3: finer segmentation of the tokens that did not match as a whole.
            foreach ($data as $k=>$v){
                $found = $this->split_word($v);
                if(empty($found)){
                    continue;
                }
                foreach ($found as $str){
                    $word[] = $str;
                    $v = str_replace($str,'',$v);
                }
                if(trim($v)==''){
                    // Nothing is left of this token.
                    unset($data[$k]);
                }else{
                    $data[$k] = $v;
                }
            }

            // Step 4: synonym replacement for whatever is left.
            if(!empty($data) && $ty_file){
                $this->load_tongyi($ty_file);
                foreach ($data as $v){
                    $word[] = $this->lookupSynonym($v);
                }
            }
            return $word;
        }


        /**
         * Fine-grained segmentation: returns every dictionary word that occurs in
         * the text, scanning from each start position (overlapping words included).
         *
         * @param string|string[] $strArr the text, or its characters
         *
         * @return string[] the words found, in order of start position
         */
        public function split_word($strArr){
            $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
            return $this->collectWords($chars, $this->dict);
        }


        /**
         * Builds the synonym trie from a synonym file whose lines have the form
         * "<word>=<replacement>". The trie is rebuilt from scratch on every load.
         *
         * @param string $filename synonym file name (without extension)
         * @param bool   $cache    when true, try the cache first and store the built trie afterwards
         *
         * @throws InvalidArgumentException when the synonym file does not exist
         * @throws RuntimeException         when the synonym file cannot be opened
         */
        public function load_tongyi($filename='tongyi',$cache = true){
            $file = DT_ROOT.'/api/dtapicom/trie/'.$filename.'.txt';
            $cacheKey = __CLASS__ . "_" . md5($file);

            if ($cache) {
                $cached = $this->cacheGet($cacheKey);
                if ($cached !== null) {
                    $this->ty_dict = $cached;
                    return;
                }
            }

            // Open first so a missing file leaves the current trie untouched.
            $handle = $this->openDictFile($file);
            $this->ty_dict = [];

            while (($line = fgets($handle)) !== false) {
                $line = trim($line);
                if ($line === '') {
                    continue;
                }
                $this->addTongyi($line);
            }

            fclose($handle);

            if ($cache) {
                $this->cacheSet($cacheKey, $this->ty_dict);
            }
        }

        /**
         * Adds one "<word>=<replacement>" line to the synonym trie.
         * Lines without '=' or with an empty word are ignored.
         *
         * @param string $str one trimmed line of the synonym file
         */
        protected function addTongyi($str)
        {
            $arr = explode('=',$str);
            if (count($arr) < 2) {
                return;
            }
            $oldword = $arr[1];
            $wordArr = $this->splitStr($arr[0]);
            if (empty($wordArr)) {
                return;
            }
            if (!is_array($this->ty_dict)) {
                $this->ty_dict = [];
            }

            $curNode = &$this->ty_dict;
            foreach ($wordArr as $char) {
                if (!isset($curNode[$char]) || !is_array($curNode[$char])) {
                    $curNode[$char] = [];
                }
                $curNode = &$curNode[$char];
            }

            // Mark the path as a complete entry and store its replacement.
            $curNode['end'] = $oldword;
            unset($curNode);
        }

        /**
         * Replaces a word by its synonym.
         *
         * @param string|string[] $strArr  the word, or its characters
         * @param string          $ty_file synonym file name (without extension)
         *
         * @return mixed the replacement text, or $strArr unchanged when there is none
         *
         * @throws InvalidArgumentException when the synonym file does not exist
         * @throws RuntimeException         when the synonym file cannot be opened
         */
        public function tyReplace($strArr,$ty_file='tongyi'){
            $this->load_tongyi($ty_file);
            return $this->lookupSynonym($strArr);
        }

        /**
         * Replaces, inside a text, every word that has an entry in the synonym file.
         *
         * Runs of word characters / CJK ideographs are extracted from the text,
         * segmented against the synonym trie, and each word found is replaced
         * throughout the whole text. The word dictionary is left untouched.
         *
         * NOTE(review): candidates are extracted from the raw text, so words inside
         * markup (tag names, attributes) are candidates too - confirm this is wanted.
         *
         * @param string $text     the text to process
         * @param string $filename synonym file name (without extension)
         *
         * @return string the text with synonyms applied
         *
         * @throws InvalidArgumentException when the synonym file does not exist
         * @throws RuntimeException         when the synonym file cannot be opened
         */
        public function contentReplace($text,$filename='tongyi'){
            $this->load_tongyi($filename);

            if (!preg_match_all('/([\w\x{4e00}-\x{9fa5}]+)/u', (string) $text, $arr)) {
                // No candidate (or the text is not valid UTF-8): nothing to replace.
                return $text;
            }

            foreach ($arr[0] as $v){
                $found = $this->collectWords($this->splitStr($v), $this->ty_dict);
                foreach ($found as $t){
                    $text = str_replace($t, $this->lookupSynonym($t), $text);
                }
            }
            return $text;
        }

        /**
         * Returns every complete entry of $trie that occurs in $chars.
         *
         * @param string[] $chars characters of the text
         * @param mixed    $trie  root node of the trie to search
         *
         * @return string[]
         */
        private function collectWords(array $chars, $trie)
        {
            $words = [];
            if (!is_array($trie)) {
                return $words;
            }

            $chars = array_values($chars);
            $len = count($chars);
            for ($start = 0; $start < $len; $start++) {
                $node = $trie;
                $word = '';
                for ($i = $start; $i < $len; $i++) {
                    $char = $chars[$i];
                    // Stop as soon as the path leaves the trie.
                    if (!isset($node[$char]) || !is_array($node[$char])) {
                        break;
                    }
                    $word .= $char;
                    $node = $node[$char];
                    if (isset($node['end'])) {
                        $words[] = $word;
                    }
                }
            }
            return $words;
        }

        /**
         * Looks a word up in the already loaded synonym trie.
         *
         * @param string|string[] $strArr the word, or its characters
         *
         * @return mixed the replacement text, or $strArr unchanged when there is none
         */
        private function lookupSynonym($strArr)
        {
            $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
            $node = $this->ty_dict;
            foreach ($chars as $char) {
                if (!is_array($node) || !isset($node[$char])) {
                    return $strArr;
                }
                $node = $node[$char];
            }

            return (is_array($node) && !empty($node['end'])) ? $node['end'] : $strArr;
        }

        /**
         * Opens a dictionary file for reading.
         *
         * @param string $file path of the file
         *
         * @return resource
         *
         * @throws InvalidArgumentException when the file does not exist
         * @throws RuntimeException         when the file cannot be opened
         */
        private function openDictFile($file)
        {
            $file = (string) $file;
            if (!file_exists($file)) {
                throw new InvalidArgumentException("字典文件不存在");
            }

            $handle = @fopen($file, "r");
            if (!is_resource($handle)) {
                throw new RuntimeException("字典文件无法打开");
            }
            return $handle;
        }

        /**
         * Fetches a trie from the global cache object $dc.
         *
         * @param string $key
         *
         * @return array|null the cached trie, or null on a miss or when no cache object is available
         */
        private function cacheGet($key)
        {
            global $dc;
            if (!is_object($dc)) {
                return null;
            }
            $value = $dc->get($key);
            return is_array($value) ? $value : null;
        }

        /**
         * Stores a trie in the global cache object $dc, when one is available.
         *
         * @param string $key
         * @param array  $value
         */
        private function cacheSet($key, $value)
        {
            global $dc;
            if (is_object($dc)) {
                $dc->set($key, $value, null, 100000);
            }
        }


    }
    

      

  • 相关阅读:
    好还是坏:人工智能二分类问题
    神经网络手写数字识别
    TensorFlow or PyTorch
    什么是深度学习
    五个常见 AI 开发库
    AI——第四次工业革命
    NodeJs获取不到POST参数
    Android权限
    【nodejs学习】3.进程管理及异步编程
    每日一题
  • 原文地址:https://www.cnblogs.com/68xi/p/12266291.html
Copyright © 2011-2022 走看看