zoukankan html css js c++ java

中文分词（机械传统方法）正向最大匹配

//dictionary.h
#include <cstdlib>
#include <fstream>
#include <hash_map>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
using namespace std;

// In-memory word list used for dictionary-based segmentation.
//
// The constructor fills the table from "wordlist.txt"; FindWord() then answers
// membership queries against it.
class Cditionary
{
public:
    // Loads the word list; terminates the process if the file cannot be opened.
    Cditionary();
    ~Cditionary();

    // Returns 1 when w is a stored word, 0 otherwise.
    int FindWord(std::string w);

private:
    std::string strtmp;   // scratch buffer: the dictionary line currently being read
    std::string word;     // scratch buffer: the first token of that line

    // Set of known words. Only key presence matters; every stored value is 1.
    // std::unordered_map is the standard container; the old hash_map extension
    // is not available on standard-conforming toolchains.
    std::unordered_map<std::string, int> wordhash;
};

// Loads the dictionary from "wordlist.txt" in the current working directory.
//
// Only the first whitespace-delimited token of each line is stored; anything
// after it on the line is ignored. If the file cannot be opened, an error is
// printed to stderr and the process exits with status -1.
Cditionary::Cditionary()
{
    const char *const dictPath = "wordlist.txt";

    ifstream infile(dictPath);
    if (!infile.is_open())
    {
        // Report the file that was actually requested.
        cerr << "Unable to open input file: " << dictPath << " -- bailing out!" << endl;
        exit(-1);
    }

    while (getline(infile, strtmp))
    {
        istringstream istr(strtmp);
        // A blank line yields no token; skip it instead of inserting whatever
        // `word` still holds from the previous line (or an empty string).
        if (istr >> word)
            wordhash[word] = 1;
    }
}

// Nothing to release explicitly: the string and hash-table members clean up
// after themselves.
Cditionary::~Cditionary()
{
}
// Dictionary membership test.
// Yields 1 when s is one of the loaded words and 0 when it is not.
int Cditionary::FindWord(string s)
{
    const bool known = (wordhash.find(s) != wordhash.end());
    return known ? 1 : 0;
}

//main.cpp
#include "dictionary.h"
// Upper bound, in bytes of the std::string, on the length of a candidate word
// tried by SegmentSetence().
#define MaxWordLength 10
// Separator appended after every segment in the output.
#define Sep "/"

// Global dictionary queried by SegmentSetence(). As a namespace-scope object it
// is constructed before main() runs; its constructor reads "wordlist.txt" and
// exits the process if that file cannot be opened.
Cditionary WordDic;

// Width in bytes of the character that starts at text[pos].
// NOTE(review): assumes a GBK/GB2312-style encoding in which a byte >= 0x80
// leads a two-byte character -- confirm against the corpus encoding. A lead
// byte with nothing after it is treated as a single byte.
static string::size_type CharWidthAt(const string &text, string::size_type pos)
{
    const unsigned char lead = static_cast<unsigned char>(text[pos]);
    return (lead >= 0x80 && pos + 1 < text.length()) ? 2 : 1;
}

// Segments a string with forward maximum matching.
//
// Starting at the current position, the longest prefix of at most
// MaxWordLength bytes that ends on a character boundary is looked up in
// WordDic. On a miss the candidate is shortened by one whole character and
// tried again. When only one character is left it is emitted as-is, whether or
// not the dictionary knows it.
//
// s1      text to segment
// returns the segments in order, each followed by Sep
string SegmentSetence(string s1)
{
    const string::size_type maxLen = MaxWordLength;
    const string::size_type total = s1.length();

    string s2;
    string::size_type pos = 0;   // scan by index rather than re-copying the tail each round

    while (pos < total)
    {
        string::size_type window = total - pos;
        if (window > maxLen)
            window = maxLen;

        // Byte offsets (relative to pos) at which a character ends inside the
        // window. Shortening along these offsets never cuts a two-byte
        // character in half, even when single-byte text is mixed in.
        string::size_type ends[MaxWordLength];
        int count = 0;

        string::size_type off = CharWidthAt(s1, pos);
        ends[count++] = off;   // the first character is always a candidate
        while (off < window)
        {
            const string::size_type next = off + CharWidthAt(s1, pos + off);
            if (next > window)
                break;         // this character would straddle the window edge
            off = next;
            ends[count++] = off;
        }

        // Longest candidate first; index 0 (a single character) is the fallback.
        int pick = count - 1;
        while (pick > 0 && WordDic.FindWord(s1.substr(pos, ends[pick])) == 0)
            --pick;

        s2.append(s1, pos, ends[pick]);
        s2 += Sep;
        pos += ends[pick];
    }
    return s2;
}

// Entry point: segments every line of the input file and writes the result.
//
// argv[1] is the path of the text to segment. Each input line is passed
// through SegmentSetence() and written as one line of "result.txt" in the
// current working directory.
//
// Returns 0 on success and -1 when the argument is missing or a file cannot be
// opened.
int main(int argc , char * argv[])
{
    // argv[1] must not be touched unless it was actually supplied.
    if (argc < 2)
    {
        cerr << "Usage: " << (argc > 0 ? argv[0] : "segment") << " <input file>" << endl;
        return -1;
    }

    ifstream infile(argv[1]);
    if (! infile.is_open())
    {
        cerr << "Unable to open input file: " << argv[1] << " -- bailing out!" << endl;
        return -1;
    }

    const char *const outPath = "result.txt";
    ofstream outfile1(outPath);
    if (! outfile1.is_open())
    {
        // Name the file that was actually requested.
        cerr << "Unable to open file: " << outPath << " -- bailing out!" << endl;
        return -1;
    }

    string strtmp;   // one line read from the input
    // Split on the newline character; the former delimiter was the letter 'n',
    // which cut the text at every 'n' instead of at line ends.
    while (getline(infile, strtmp, '\n'))
    {
        outfile1 << SegmentSetence(strtmp) << '\n';
    }

    return 0;
}

原理参见：52NLP

每次取最大匹配到的长度，截取后重新继续匹配

查看全文

相关阅读:
认识 linux 文件属性
 laravel 配置mongodb 出现 Database [text] not configured 问题【已解决】
php 判断字符串的长度的两种方法
 php 循环里面套sql怎么解决
 php 给入门新手们填的第一个坑
 Vue 调用其他Vue或自己Vue中的方法时，this指向不生效
 绑定点击事件触发多次
 computed 计算属性的获取和设置
 el-Cascader 最后一项删不掉
 前端 --- 格式化的配置

原文地址：https://www.cnblogs.com/lzhenf/p/2433891.html

中文分词 （机械传统方法 ）正向最大匹配

中文分词（机械传统方法）正向最大匹配