zoukankan      html  css  js  c++  java
  • 一个用PHP写的中文分词函数

    <?php

    class Segmentation {
    // Behaviour switches; changed through setLowercase() and setSegmentEnglish().
    // NOTE(review): the code that reads these flags is not visible in this part
    // of the file -- presumably 'lowercase' folds case and 'segment_english'
    // controls splitting of Latin text; confirm against the segmentation code.
    var $options = array('lowercase' => TRUE,
    'segment_english' => FALSE);
    // Name of the loaded dictionary; load() overwrites it from the file header.
    var $dict_name = 'Unknown';
    // Dictionary words stored as array keys (word => 1); filled by load().
    var $dict_words = array();
    /**
     * Switch the 'lowercase' option on or off.
     *
     * @param mixed $value Any value; only its truthiness is used.
     * @return bool Always TRUE.
     */
    function setLowercase($value)
    {
        // Collapse whatever was passed into a strict boolean flag.
        $this->options['lowercase'] = (bool) $value;

        return TRUE;
    }
    /**
     * Switch the 'segment_english' option on or off.
     *
     * @param mixed $value Any value; only its truthiness is used.
     * @return bool Always TRUE.
     */
    function setSegmentEnglish($value)
    {
        // Collapse whatever was passed into a strict boolean flag.
        $this->options['segment_english'] = (bool) $value;

        return TRUE;
    }
    /**
     * Load a word dictionary file into $this->dict_words.
     *
     * The first line of the file is a header: the dictionary type, optionally
     * followed by a TAB and a human-readable dictionary name. Only the type
     * 'DICT_WORD_W' is accepted. Every following non-empty line is stored as
     * one word. Words are added to whatever is already loaded, so calling
     * load() again merges another dictionary into the current one.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool TRUE on success; FALSE when the file is missing, cannot be
     *              opened, has no header line, or has an unsupported type.
     */
    function load($dict_file)
    {
        if (!file_exists($dict_file)) {
            return FALSE;
        }

        $fp = fopen($dict_file, 'r');
        if ($fp === FALSE) {
            return FALSE;
        }

        $header = fgets($fp, 1024);
        if ($header === FALSE) {
            // Empty file: release the handle before reporting failure.
            fclose($fp);
            return FALSE;
        }

        // Trim first, then split on TAB, so a header that ends in a bare TAB
        // simply has no name instead of leaving the name undefined.
        $parts = explode("\t", trim($header));
        $dict_type = $parts[0];
        $dict_name = isset($parts[1]) ? $parts[1] : 'Unknown';

        // The name is recorded even when the type turns out to be unsupported.
        $this->dict_name = $dict_name;

        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);
            return FALSE;
        }

        // Read whole lines: a fixed byte limit would cut long entries into
        // fragments and could split a multibyte character in half.
        while (($line = fgets($fp)) !== FALSE) {
            $word = rtrim($line);
            // Skip blank lines so the empty string never becomes a "word".
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);

        return TRUE;
    }
    /**
     * Report the name of the dictionary currently held by this object.
     *
     * @return string The stored dictionary name ('Unknown' by default).
     */
    function getDictName()
    {
        $name = $this->dict_name;

        return $name;
    }
    /**
     * Segment a string that may contain several lines of text.
     *
     * The input is split on newline characters and every line is handed to
     * _segmentLines().
     *
     * @param string $str Text to segment.
     * @return mixed FALSE when no dictionary words are loaded; otherwise
     *               whatever _segmentLines() returns for the split lines.
     */
    function segmentString($str)
    {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }

        // Split on the newline character, not on the letter "n".
        $lines = explode("\n", $str);

        return $this->_segmentLines($lines);
    }
    /**
     * Segment the contents of a text file, line by line.
     *
     * @param string $filename Path of the file to read.
     * @return mixed FALSE when no dictionary words are loaded or the file
     *               cannot be read; otherwise whatever _segmentLines()
     *               returns for the file's lines.
     */
    function segmentFile($filename)
    {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }

        $lines = file($filename);
        if ($lines === FALSE) {
            // file() failed; report it the same way as the other failure path
            // instead of passing FALSE on as if it were a list of lines.
            return FALSE;
        }

        return $this->_segmentLines($lines);
    }
    function _segmentLines($lines) {
    $contents_segmented = '';
    foreach ($lines as $line) {
    $contents_segmented .= $this->_segmentLine(rtrim($line)) . " n";
    }
    do {
    $contents_segmented = str_replace(' ', ' ', $contents_segmented);
    } while (strpos($contents_segmented, ' ') !== FALSE);
    return $contents_segmented;?>

  • 相关阅读:
    【转】P2P通信原理与实现(C++)
    【转】P2P通信标准协议(二)之TURN
    【转】P2P之UDP穿透NAT的原理与实现
    【转】P2P的原理和常见的实现方式
    【转】linux中man使用技巧
    【转】go编译时,加入svn版本信息
    各种移动GPU压缩纹理的使用方法
    Unity贴图压缩格式设置
    关于U3D贴图格式压缩
    可能会导致.NET内存泄露的8种行为
  • 原文地址:https://www.cnblogs.com/ymj0906/p/3003497.html
Copyright © 2011-2022 走看看