zoukankan      html  css  js  c++  java
  • 中文分词(机械传统方法):正向最大匹配

    // dictionary.h
    #include <cstdlib>
    #include <fstream>
    #include <hash_map> // legacy, non-standard header
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <unordered_map>
    using namespace std;

    // Word dictionary used by the segmenter.
    //
    // The constructor loads the word list from "wordlist.txt" (terminating the
    // process if the file cannot be opened); FindWord then answers membership
    // queries against the loaded words.
    class Cditionary
    {
    public:
        // Loads the dictionary from "wordlist.txt".
        Cditionary();
        ~Cditionary();

        // Returns 1 if w is a dictionary word, 0 otherwise.
        int FindWord(std::string w);

    private:
        // Scratch buffers kept for the dictionary loader; not consulted by FindWord.
        std::string strtmp;
        std::string word;

        // Set of known words, stored as word -> 1. std::unordered_map is the
        // standard replacement for the non-standard hash_map extension.
        std::unordered_map<std::string, int> wordhash;
    };

    // Builds the dictionary from "wordlist.txt" in the current working directory.
    //
    // Only the first whitespace-delimited token of each line is stored; anything
    // after it on the same line is ignored, and lines without any token are
    // skipped. If the file cannot be opened, an error is printed to stderr and
    // the process terminates with exit status -1.
    Cditionary::Cditionary()
    {
        const char* const path = "wordlist.txt";

        std::ifstream infile(path);
        if (!infile.is_open())
        {
            // Name the file that was actually requested so the user can fix it.
            std::cerr << "Unable to open input file: " << path << " -- bailing out!" << std::endl;
            std::exit(-1);
        }

        std::string line;
        while (std::getline(infile, line))
        {
            std::istringstream fields(line);
            std::string entry;
            // A blank line yields no token; inserting anyway would add an empty
            // (or stale) key to the dictionary.
            if (fields >> entry)
            {
                wordhash[entry] = 1;
            }
        }
    }

    // Nothing to release by hand: the body is intentionally empty and the
    // members are destroyed automatically.
    Cditionary::~Cditionary() {}
    // Dictionary membership test.
    // Returns 1 when s is a key of wordhash and 0 when it is not.
    int Cditionary::FindWord(string s)
    {
        const bool known = wordhash.find(s) != wordhash.end();
        return known ? 1 : 0;
    }
    //main.cpp
    #include "dictionary.h"
    // Upper bound, in bytes, on the length of a candidate word that
    // SegmentSetence looks up in the dictionary.
    #define MaxWordLength 10
    // Separator that SegmentSetence appends after every emitted token.
    #define Sep "/"

    // Dictionary shared by the whole program. It is a namespace-scope object, so
    // its constructor (which reads "wordlist.txt" and exits the process on
    // failure) runs before main() is entered.
    Cditionary WordDic;

    // 字符串用最大匹配法处理
    string SegmentSetence(string s1)
    {
    string s2 = "";

    while (! s1.empty())
    {
    int len = s1.length();
    if (len > MaxWordLength)
    len = MaxWordLength;
    string temp = s1.substr(0, len);

    int n = WordDic.FindWord(temp);
    while (len > 2 && n == 0)
    {
    len -= 2;
    temp = temp.substr(0 , len);
    n = WordDic.FindWord(temp);
    }
    s2 += temp + Sep;
    s1 = s1.substr(temp.length(), s1.length());
    }
    return s2;
    }

    // Program entry point.
    //
    // Usage: <program> <input-file>
    //
    // Reads the input file line by line, segments each line with
    // SegmentSetence and writes one segmented line per input line to
    // "result.txt" in the current working directory.
    //
    // Returns 0 on success and -1 when the input-file argument is missing or
    // when either file cannot be opened.
    int main(int argc , char * argv[])
    {
        // argv[1] must not be touched unless the caller actually supplied it.
        if (argc < 2)
        {
            std::cerr << "Usage: " << (argc > 0 ? argv[0] : "segment") << " <input-file>" << std::endl;
            return -1;
        }

        std::ifstream infile(argv[1]);
        if (!infile.is_open())
        {
            std::cerr << "Unable to open input file: " << argv[1] << " -- bailing out!" << std::endl;
            return -1;
        }

        const char* const outputPath = "result.txt";
        std::ofstream outfile1(outputPath);
        if (!outfile1.is_open())
        {
            // Report the file that was actually requested.
            std::cerr << "Unable to open file:" << outputPath
                      << "--bailing out!" << std::endl;
            return -1;
        }

        // Lines are newline-delimited. Splitting on the letter 'n' would cut the
        // text at arbitrary bytes, including in the middle of a two-byte character.
        std::string line;
        while (std::getline(infile, line))
        {
            // '\n' instead of endl: no flush per line; the stream is flushed
            // when outfile1 is destroyed.
            outfile1 << SegmentSetence(line) << '\n';
        }

        return 0;
    }

    原理参见:52NLP

    每次从待切分文本的开头取词典中能匹配到的最长词,将其切出后,对剩余文本重复同样的匹配过程。

  • 相关阅读:
    文档对象模型(DOM)
    Gridview,DataList,Repeater 鼠标经过时行颜色变换
    一组经典测试思想观点
    如何编写测试计划
    测试用例 之我见
    软件测试流程 之我见
    经典博文各系列文章
    JS实现在Repeater控件中创建可隐藏区域
    测试感想
    海量数据处理 算法总结
  • 原文地址:https://www.cnblogs.com/lzhenf/p/2433891.html
Copyright © 2011-2022 走看看