分享一个纯php分词封装的类
<?php
/*
* 本插件非成品插件,只是封装的一个底层类,可用于各种需要分词的,同义词替换的场合
*/
/**
 * Character trie used for dictionary based word segmentation, sensitive-word
 * filtering and synonym replacement.
 *
 * Every trie node is an array keyed by a single character. The reserved key
 * 'end' marks that the path leading to the node spells a complete entry:
 *  - in the word trie ($dict) it holds how many times the word was added;
 *  - in the synonym trie ($ty_dict) it holds the replacement word.
 * 'end' can never collide with a child key because children are always exactly
 * one character long.
 *
 * Dictionary files live under DT_ROOT . '/api/dtapicom/trie/' and parsed tries
 * are cached through the global $dc object (its get()/set() methods are the
 * only part of its API used here).
 */
class trie
{
    /** @var array Root node of the word trie. */
    protected $dict;

    /** @var string Path of the word dictionary file (set by split_kw()). */
    protected $dictFile;

    /** @var string Common specification symbols, '|' separated (not used inside this class). */
    protected $specSymbol;

    /** @var array Root node of the synonym trie. */
    protected $ty_dict;

    /**
     * Start with empty tries; dictionaries are loaded lazily by split_kw(),
     * loadData() and load_tongyi().
     */
    public function __construct()
    {
        $this->dict = [];
        $this->ty_dict = [];
        $this->dictFile = '';
        $this->specSymbol = "*|M|m|φ|Φ|st|ST";
    }

    /**
     * Load the word trie for $this->dictFile, replacing whatever is currently
     * in $this->dict.
     *
     * @param bool $cache When true the trie is read from / written to the
     *                    global $dc cache instead of re-parsing the file.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function loadData($cache = true)
    {
        global $dc;
        $cacheKey = __CLASS__ . "_" . md5((string) $this->dictFile);
        if ($cache) {
            // Read into a local first: assigning the raw result to $this->dict
            // would leave a non-array (false/null) there on a cache miss.
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->dict = $cached;
                return;
            }
        }
        // Rebuild from scratch so words from a previously loaded file (or a
        // previous load of the same file) are not mixed in or counted twice.
        $this->dict = [];
        $this->loadDataFromFile();
        if ($cache) {
            $dc->set($cacheKey, $this->dict, null, 100000);
        }
    }

    /**
     * Read $this->dictFile (one entry per line) and add every entry to the
     * current word trie. Existing entries are kept.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function loadDataFromFile()
    {
        $file = $this->dictFile;
        if (!is_string($file) || $file === '' || !file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }
        foreach ($this->readLines($file) as $line) {
            $this->addWords($line);
        }
    }

    /**
     * Read a text file and return its trimmed, non-blank lines.
     *
     * @param string $file
     *
     * @return string[]
     *
     * @throws RuntimeException When the file cannot be opened.
     */
    private function readLines($file)
    {
        // '@' only hides the warning; the failure itself is reported below.
        $handle = @fopen($file, "r");
        if (!is_resource($handle)) {
            throw new RuntimeException("字典文件无法打开");
        }
        $lines = [];
        // fgets() returns false at EOF. Comparing against '' (instead of
        // empty()) keeps a line that consists of the single character "0".
        while (($line = fgets($handle)) !== false) {
            $line = trim($line);
            if ($line !== '') {
                $lines[] = $line;
            }
        }
        fclose($handle);
        return $lines;
    }

    /**
     * Split text into an array of single UTF-8 characters.
     *
     * @param string $str
     *
     * @return string[] Empty array for an empty string. If $str is not valid
     *                  UTF-8 the text is split into single bytes instead, so
     *                  imploding the result still yields the original text.
     */
    protected function splitStr($str)
    {
        $str = (string) $str;
        if ($str === '') {
            return [];
        }
        $chars = preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY);
        return $chars === false ? str_split($str) : $chars;
    }

    /**
     * Add one entry to the word trie.
     *
     * @param string $words The entry to add; an empty string is ignored.
     */
    protected function addWords($words)
    {
        $wordArr = $this->splitStr($words);
        if (empty($wordArr)) {
            // Never mark the root node itself as a complete word.
            return;
        }
        if (!is_array($this->dict)) {
            $this->dict = [];
        }
        $curNode = &$this->dict;
        foreach ($wordArr as $char) {
            if (!isset($curNode[$char])) {
                $curNode[$char] = [];
            }
            $curNode = &$curNode[$char];
        }
        // Mark the path as a complete entry and count how often it was added.
        $curNode['end'] = isset($curNode['end']) ? $curNode['end'] + 1 : 1;
        unset($curNode);
    }

    /**
     * Replace every dictionary word found in the text.
     *
     * Scanning is greedy: from each start position the longest dictionary word
     * is replaced. Characters skipped because of $skipDistance are left as is.
     *
     * @param string $str          Original text.
     * @param string $replace      Replacement put in place of each matched character.
     * @param int    $skipDistance Strictness: how many non-matching characters
     *                             may be skipped in total while following one word.
     *
     * @return string The filtered text.
     */
    public function filter($str, $replace = '*', $skipDistance = 0)
    {
        $maxDistance = max($skipDistance, 0) + 1;
        $strArr = $this->splitStr($str);
        $length = count($strArr);
        $root = is_array($this->dict) ? $this->dict : [];
        for ($i = 0; $i < $length; $i++) {
            $char = $strArr[$i];
            if (!isset($root[$char])) {
                continue;
            }
            $curNode = $root[$char];
            $dist = 0;
            $matchIndex = [$i];
            // Number of leading $matchIndex entries that form the longest
            // complete word seen so far (0 = none yet). Remembering it lets a
            // shorter word match even when the trie path continues past it.
            $matchedCount = isset($curNode['end']) ? 1 : 0;
            for ($j = $i + 1; $j < $length && $dist < $maxDistance; $j++) {
                $next = $strArr[$j];
                if (!isset($curNode[$next])) {
                    $dist++;
                    continue;
                }
                $matchIndex[] = $j;
                $curNode = $curNode[$next];
                if (isset($curNode['end'])) {
                    $matchedCount = count($matchIndex);
                }
            }
            if ($matchedCount > 0) {
                $last = $i;
                for ($m = 0; $m < $matchedCount; $m++) {
                    $last = $matchIndex[$m];
                    $strArr[$last] = $replace;
                }
                // Resume right after the replaced word.
                $i = $last;
            }
        }
        return implode('', $strArr);
    }

    /**
     * Check whether the given text is exactly one entry of the word trie.
     *
     * @param string|string[] $strArr Text, or the text already split into characters.
     *
     * @return bool|mixed The value stored under 'end' for the entry (the add
     *                    count for a trie built by addWords()), or false when
     *                    the text is empty or not an entry.
     */
    public function isMatch($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        if (empty($chars)) {
            return false;
        }
        $curNode = $this->dict;
        foreach ($chars as $char) {
            if (!is_array($curNode) || !isset($curNode[$char])) {
                return false;
            }
            $curNode = $curNode[$char];
        }
        return (is_array($curNode) && isset($curNode['end'])) ? $curNode['end'] : false;
    }

    /**
     * Check whether a word exists in the currently loaded word trie.
     *
     * @param string $word
     * @param string $filename Accepted for interface compatibility; not used.
     *
     * @return bool|mixed Same as isMatch().
     */
    public function isType($word, $filename = 'word')
    {
        return $this->isMatch($word);
    }

    /**
     * Segment a keyword string and return the words recognised in it.
     *
     * Steps:
     *  1. split on whitespace, ASCII commas and fullwidth commas;
     *  2. tokens that are dictionary words are taken as they are;
     *  3. remaining tokens are scanned for embedded dictionary words, which
     *     are collected and removed from the token;
     *  4. whatever is left of a token is run through the synonym dictionary,
     *     but only when $ty_file is given - otherwise it is dropped.
     *
     * @param string $kw       Keyword string from the front end.
     * @param string $filename Word dictionary file name (without ".txt").
     * @param string $ty_file  Synonym dictionary file name (without ".txt"), '' to skip step 4.
     *
     * @return string[] Words from step 2, then step 3, then step 4.
     *
     * @throws InvalidArgumentException When a dictionary file does not exist.
     * @throws RuntimeException         When a dictionary file cannot be opened.
     */
    public function split_kw($kw, $filename = 'word', $ty_file = '')
    {
        $this->dictFile = DT_ROOT . '/api/dtapicom/trie/' . $filename . '.txt';
        $this->loadData();

        // Step 1. NO_EMPTY drops empty pieces only, so a token "0" survives.
        $kw = (string) $kw;
        $tokens = preg_split('/[\s,，]+/u', $kw, -1, PREG_SPLIT_NO_EMPTY);
        if ($tokens === false) {
            // Pattern could not be applied (e.g. invalid UTF-8): keep the
            // input as one token.
            $tokens = ($kw === '') ? [] : [$kw];
        }

        // Step 2.
        $word = [];
        $unmatched = [];
        foreach ($tokens as $token) {
            if ($this->isMatch($token)) {
                $word[] = $token;
            } else {
                $unmatched[] = $token;
            }
        }

        // Step 3.
        $leftover = [];
        foreach ($unmatched as $token) {
            foreach ($this->split_word($token) as $found) {
                $word[] = $found;
                $token = str_replace($found, '', $token);
            }
            if (trim($token) !== '') {
                $leftover[] = $token;
            }
        }

        // Step 4. The synonym dictionary is loaded once for all leftovers.
        if (!empty($leftover) && $ty_file) {
            $this->load_tongyi($ty_file);
            foreach ($leftover as $token) {
                $word[] = $this->lookupSynonym($token);
            }
        }
        return $word;
    }

    /**
     * Find every dictionary word contained in the text.
     *
     * @param string|string[] $strArr Text, or the text already split into characters.
     *
     * @return string[] Words ordered by start position, then by length.
     *                  Overlapping and repeated matches are all reported.
     */
    public function split_word($strArr)
    {
        $chars = is_array($strArr) ? array_values($strArr) : $this->splitStr($strArr);
        return $this->collectWords($chars, is_array($this->dict) ? $this->dict : []);
    }

    /**
     * Collect every entry of the given trie that occurs in the character list.
     *
     * @param string[] $chars Characters, indexed 0..n-1.
     * @param array    $root  Root node of the trie to search.
     *
     * @return string[]
     */
    private function collectWords(array $chars, array $root)
    {
        $words = [];
        $len = count($chars);
        for ($start = 0; $start < $len; $start++) {
            $node = $root;
            $word = '';
            for ($i = $start; $i < $len; $i++) {
                $char = $chars[$i];
                // Stop as soon as the trie has no path for the next character.
                if (!isset($node[$char]) || !is_array($node[$char])) {
                    break;
                }
                $word .= $char;
                $node = $node[$char];
                if (isset($node['end'])) {
                    $words[] = $word;
                }
            }
        }
        return $words;
    }

    /**
     * Load the synonym trie from DT_ROOT/api/dtapicom/trie/<filename>.txt,
     * replacing whatever is currently in $this->ty_dict.
     *
     * Each line of the file has the form "word=replacement".
     *
     * @param string $filename Synonym dictionary file name (without ".txt").
     * @param bool   $cache    When true the trie is read from / written to the
     *                         global $dc cache instead of re-parsing the file.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function load_tongyi($filename = 'tongyi', $cache = true)
    {
        global $dc;
        $file = DT_ROOT . '/api/dtapicom/trie/' . $filename . '.txt';
        $cacheKey = __CLASS__ . "_" . md5($file);
        if ($cache) {
            // Same reasoning as loadData(): never store a cache miss in the property.
            $cached = $dc->get($cacheKey);
            if (is_array($cached)) {
                $this->ty_dict = $cached;
                return;
            }
        }
        if (!file_exists($file)) {
            throw new InvalidArgumentException("字典文件不存在");
        }
        $lines = $this->readLines($file);
        $this->ty_dict = [];
        foreach ($lines as $line) {
            $this->addTongyi($line);
        }
        if ($cache) {
            $dc->set($cacheKey, $this->ty_dict, null, 100000);
        }
    }

    /**
     * Add one "word=replacement" line to the synonym trie.
     *
     * Lines without '=', or with an empty word or replacement, are ignored.
     *
     * @param string $str
     */
    protected function addTongyi($str)
    {
        $arr = explode('=', $str);
        if (count($arr) < 2) {
            return;
        }
        $words = trim($arr[0]);
        $oldword = trim($arr[1]);
        if ($words === '' || $oldword === '') {
            return;
        }
        if (!is_array($this->ty_dict)) {
            $this->ty_dict = [];
        }
        $curNode = &$this->ty_dict;
        foreach ($this->splitStr($words) as $char) {
            if (!isset($curNode[$char])) {
                $curNode[$char] = [];
            }
            $curNode = &$curNode[$char];
        }
        // The end marker of a synonym entry carries the replacement word.
        $curNode['end'] = $oldword;
        unset($curNode);
    }

    /**
     * Replace a word by its synonym.
     *
     * @param string|string[] $strArr  Word, or the word already split into characters.
     * @param string          $ty_file Synonym dictionary file name (without ".txt").
     *
     * @return mixed The replacement word, or $strArr unchanged when the
     *               dictionary has no entry for it.
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function tyReplace($strArr, $ty_file = 'tongyi')
    {
        $this->load_tongyi($ty_file);
        return $this->lookupSynonym($strArr);
    }

    /**
     * Look a word up in the already loaded synonym trie.
     *
     * @param string|string[] $strArr
     *
     * @return mixed The replacement word, or $strArr unchanged when there is none.
     */
    private function lookupSynonym($strArr)
    {
        $chars = is_array($strArr) ? $strArr : $this->splitStr($strArr);
        if (empty($chars)) {
            return $strArr;
        }
        $node = $this->ty_dict;
        foreach ($chars as $char) {
            if (!is_array($node) || !isset($node[$char])) {
                return $strArr;
            }
            $node = $node[$char];
        }
        if (is_array($node) && isset($node['end']) && $node['end'] !== '') {
            return $node['end'];
        }
        return $strArr;
    }

    /**
     * Replace every synonym-dictionary word occurring in a text.
     *
     * Candidate words are taken from the tag-stripped text, but replacement is
     * done on the original $text, so an occurrence of a word inside markup
     * (e.g. an attribute value) is replaced as well. All replacements are
     * applied in one pass with the longest word winning, and replaced text is
     * not scanned again. The word trie ($this->dict) is left untouched.
     *
     * @param string $text     Text (may contain HTML) to process.
     * @param string $filename Synonym dictionary file name (without ".txt").
     *
     * @return string
     *
     * @throws InvalidArgumentException When the dictionary file does not exist.
     * @throws RuntimeException         When the dictionary file cannot be opened.
     */
    public function contentReplace($text, $filename = 'tongyi')
    {
        $plain = strip_tags((string) $text);
        // Runs of word characters / CJK ideographs; false or 0 means nothing to do.
        if (!preg_match_all('/[\w\x{4e00}-\x{9fa5}]+/u', $plain, $matches)) {
            return $text;
        }
        $this->load_tongyi($filename);
        $root = is_array($this->ty_dict) ? $this->ty_dict : [];

        $map = [];
        foreach ($matches[0] as $token) {
            foreach ($this->collectWords($this->splitStr($token), $root) as $found) {
                if (isset($map[$found])) {
                    continue;
                }
                $synonym = $this->lookupSynonym($found);
                if ($synonym !== $found) {
                    $map[$found] = $synonym;
                }
            }
        }
        return empty($map) ? $text : strtr($text, $map);
    }
}