zoukankan      html  css  js  c++  java
  • C# 中文分词[基于统计的朴素贝叶斯算法]

      
    主要思想:
    1. 要有一个语料库
    2. 统计每个词出现的频率, 一会来做朴素贝叶斯候选
    3. 举例: 中国人民共和国的
        其中语料库中有中国, 人民, 中国人, 共和国等等的词组. 
    现在输入: 中国人都爱中华人民共和国;
    分词的时候取max( 各种分法得到的score ); 
    例如: solution1:中国人_都爱中华人民_共和国
    solution2:中国_人_都爱中华人民_共和国
    solution3:中国_人_都爱_中华_人民_共和国 

                  bestSegSolution = max( solutions(segSolution[i] ));

          4.对于一句汉字的分词可以看做

                   seg( StringIn ) =  firPart + seg(StringIn – firPart); //   我用score来衡量当前分词结果的好坏

          5. 朴素贝叶斯的意思就是: 分词后的两个词之间是相互独立的, 也就是后者的出现与前者无关

    6. 这个只是初级版, 很简单, 需要再加点东西, 结果会更加的完美.. 当然, 按照做事情的原则, 都是从简单开始做的, 再努力
      
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.Collections;
    using System.Windows.Forms;
    using System.IO;
    using System.Diagnostics;
    
    namespace ChineseWordSeg
    {
        /// <summary>
        /// Frequency-based Chinese word segmenter. Word counts are learned from a
        /// whitespace-delimited corpus (or loaded from a cache file written next to it);
        /// each run of Chinese characters in the input is then split so that the sum of
        /// the corpus counts of the chosen words is as large as possible.
        /// </summary>
        class NaiveBayes
        {
            // Default corpus (the PKU training corpus); used when trainDate is given no path.
            private string wordLibPath = "../WordLib/pku_training.txt";
            // True once a model has actually been loaded or trained.
            bool trained = false;
            // Corpus counts of multi-character words; the only table used for segmentation.
            private Dictionary<string, long> wordLib = new Dictionary<string, long>();
            // Corpus counts of single-character words; collected but not used for segmentation.
            private Dictionary<string, long> singleWordLib = new Dictionary<string, long>();
            // Length of the longest key in wordLib; bounds the candidate word length.
            int maxLen = 0;
            // Best score found so far for the run currently being segmented.
            long maxScore = 0;
            // Boundaries of the best segmentation of the current run, encoded "start s end s ...".
            private string segPos = " ";
            // Boundaries accumulated over all runs of the text currently being segmented.
            private string segSentence = " ";

            /// <summary>
            /// Returns true when <paramref name="chr"/> lies in U+4E00..U+9FFF.
            /// </summary>
            bool isChineseWord(char chr)
            {
                return chr >= 0x4E00 && chr <= 0x9FFF;
            }

            /// <summary>Empties the learned tables before a (re)load.</summary>
            private void resetModel()
            {
                wordLib.Clear();
                singleWordLib.Clear();
                maxLen = 0;
            }

            /// <summary>
            /// Counts the word buffered in <paramref name="word"/> (if any) and empties the buffer.
            /// Multi-character words go to wordLib, single characters to singleWordLib.
            /// </summary>
            private void flushWord(StringBuilder word)
            {
                if (word.Length == 0)
                {
                    return;
                }

                string text = word.ToString();
                word.Length = 0;

                Dictionary<string, long> table = text.Length > 1 ? wordLib : singleWordLib;
                long seen;
                table.TryGetValue(text, out seen); // seen stays 0 for a new word
                table[text] = seen + 1;

                if (text.Length > 1 && text.Length > maxLen)
                {
                    maxLen = text.Length;
                }
            }

            /// <summary>
            /// Builds the word-frequency model from the corpus at <paramref name="path"/>.
            /// If a non-empty cache file "&lt;corpus file name&gt;_trained" exists in the corpus
            /// directory it is loaded instead; otherwise the corpus is read as gb2312, every
            /// maximal run of Chinese characters is counted as one word, and the multi-character
            /// counts are written to the cache file as "word count" lines.
            /// </summary>
            /// <param name="path">Corpus file; null or empty selects the default corpus path.</param>
            public void trainDate(string path)
            {
                if (string.IsNullOrEmpty(path))
                {
                    path = wordLibPath;
                }

                // The cache is named after the corpus actually requested, so two corpora in
                // the same directory never share (and silently reuse) one cache file.
                string fullPath = Path.GetFullPath(path);
                string directory = Path.GetDirectoryName(fullPath);
                if (directory == null)
                {
                    directory = fullPath;
                }
                string savePath = Path.Combine(directory, Path.GetFileName(fullPath) + "_trained");

                FileInfo cacheInfo = new FileInfo(savePath);
                if (cacheInfo.Exists && cacheInfo.Length > 0)
                {
                    resetModel();
                    using (StreamReader cache = new StreamReader(savePath))
                    {
                        string line;
                        while ((line = cache.ReadLine()) != null)
                        {
                            string[] keyValue = line.Split(' ');
                            long count;
                            // Skip blank or malformed lines instead of failing the whole load.
                            if (keyValue.Length < 2 || keyValue[0].Length == 0
                                || !long.TryParse(keyValue[1], out count))
                            {
                                continue;
                            }

                            wordLib[keyValue[0]] = count;
                            if (keyValue[0].Length > maxLen)
                            {
                                maxLen = keyValue[0].Length;
                            }
                        }
                    }

                    trained = true;
                    return;
                }

                if (!File.Exists(path))
                {
                    // "The corpus path is wrong, please check."
                    MessageBox.Show("语料库路径有错，请检查");
                    return;
                }

                Stopwatch tm = Stopwatch.StartNew();
                resetModel();

                using (StreamReader sr = new StreamReader(path, Encoding.GetEncoding("gb2312")))
                {
                    StringBuilder word = new StringBuilder();
                    int next;
                    while ((next = sr.Read()) != -1)
                    {
                        char current = (char)next;
                        if (isChineseWord(current))
                        {
                            word.Append(current);
                        }
                        else
                        {
                            // Any non-Chinese character ends the current word.
                            flushWord(word);
                        }
                    }

                    // The corpus may end without a trailing delimiter.
                    flushWord(word);
                }

                using (StreamWriter sw = new StreamWriter(savePath))
                {
                    foreach (KeyValuePair<string, long> entry in wordLib)
                    {
                        sw.WriteLine(entry.Key + " " + entry.Value);
                    }
                }

                tm.Stop();
                trained = true;

                // Total elapsed milliseconds (Elapsed.Milliseconds is only the 0-999 component).
                MessageBox.Show(tm.ElapsedMilliseconds.ToString(), "training done");
            }

            /// <summary>
            /// Renders <paramref name="strIn"/> with the boundaries stored by the last
            /// segmentation: each boundary offset toggles between "(" and ")" in front of the
            /// character at that offset, and any boundary that does not fall on a character
            /// (normally the end of the text) closes the last word with a final ")".
            /// The result starts with a single space, as callers have always received.
            /// </summary>
            /// <param name="strIn">The text that was segmented.</param>
            /// <returns>The text with every recognised word wrapped in parentheses.</returns>
            public string getSegedString(string strIn)
            {
                int length = strIn.Length;
                int[] marksBefore = new int[length]; // number of boundaries in front of each character
                int unplaced = 0;                    // boundaries that are not in front of any character

                foreach (string token in segSentence.Split('s'))
                {
                    int offset;
                    // Padding between runs yields blank tokens; they carry no boundary.
                    if (token.Trim().Length == 0 || !int.TryParse(token, out offset))
                    {
                        continue;
                    }

                    if (offset >= 0 && offset < length)
                    {
                        marksBefore[offset]++;
                    }
                    else
                    {
                        unplaced++;
                    }
                }

                StringBuilder output = new StringBuilder(" ");
                bool closed = true;
                for (int i = 0; i < length; i++)
                {
                    for (int k = 0; k < marksBefore[i]; k++)
                    {
                        closed = !closed;
                        output.Append(closed ? ')' : '(');
                    }
                    output.Append(strIn[i]);
                }

                if (unplaced > 0)
                {
                    output.Append(')');
                }
                return output.ToString();
            }

            /// <summary>
            /// Segments <paramref name="strIn"/>: loads the model on first use, segments every
            /// maximal run of Chinese characters independently, and returns the text with the
            /// recognised words wrapped in parentheses.
            /// </summary>
            /// <param name="strIn">Text to segment.</param>
            /// <param name="trainDataPath">Corpus path handed to trainDate while no model is loaded.</param>
            /// <returns>The bracketed text produced by getSegedString.</returns>
            public string doNaiveBayesSegmentation(string strIn, string trainDataPath)
            {
                if (!trained)
                {
                    // trainDate sets the flag itself, so a failed attempt is retried next time.
                    trainDate(trainDataPath);
                }

                // Start from a clean slate; boundaries of an earlier text must not leak in.
                segSentence = "";

                int i = 0, len = strIn.Length;
                while (i < len)
                {
                    int runStart = i;
                    while (i < len && isChineseWord(strIn[i]))
                    {
                        i++;
                    }

                    if (i > runStart)
                    {
                        maxScore = 0;
                        segPos = "";
                        naviveBayesSeg(strIn.Substring(runStart, i - runStart), 0, "", runStart);
                        segSentence += segPos;
                    }

                    while (i < len && !isChineseWord(strIn[i]))
                    {
                        i++;
                    }
                }

                return getSegedString(strIn);
            }

            /// <summary>
            /// Finds the best segmentation of <paramref name="strIn"/>, where
            /// seg(text) = firstPart + seg(text - firstPart) and a segmentation is scored by the
            /// sum of the corpus counts of its dictionary words (characters that belong to no
            /// chosen word are skipped and score nothing). If the best total, added to
            /// <paramref name="score"/>, beats maxScore, then maxScore is raised to it and segPos
            /// is set to <paramref name="seg"/> followed by "start s end s" for every chosen word.
            /// Ties are resolved as an exhaustive depth-first search would: at the earliest
            /// differing position the shortest word wins, and taking a word wins over skipping.
            /// </summary>
            /// <param name="strIn">Run of characters to segment.</param>
            /// <param name="score">Score already accumulated before this run.</param>
            /// <param name="seg">Boundary string already accumulated before this run.</param>
            /// <param name="tPos">Offset of <paramref name="strIn"/> within the full text.</param>
            public void naviveBayesSeg(string strIn, long score, string seg, int tPos)
            {
                int n = strIn.Length;

                // best[p] = highest score obtainable from strIn[p..]; filled right to left so
                // every suffix is solved once instead of once per path that reaches it.
                long[] best = new long[n + 1];
                for (int p = n - 1; p >= 0; p--)
                {
                    long top = best[p + 1]; // skip the character at p
                    int limit = Math.Min(maxLen, n - p);
                    for (int w = 1; w <= limit; w++)
                    {
                        long count;
                        if (wordLib.TryGetValue(strIn.Substring(p, w), out count)
                            && count + best[p + w] > top)
                        {
                            top = count + best[p + w];
                        }
                    }
                    best[p] = top;
                }

                if (score + best[0] <= maxScore)
                {
                    return; // nothing here beats what has already been recorded
                }

                // Walk forward along the first optimal path, stopping once no score remains.
                StringBuilder path = new StringBuilder(seg);
                int pos = 0;
                while (pos < n && best[pos] > 0)
                {
                    int step = 1; // skip one character unless a word keeps the path optimal
                    int limit = Math.Min(maxLen, n - pos);
                    for (int w = 1; w <= limit; w++)
                    {
                        long count;
                        if (wordLib.TryGetValue(strIn.Substring(pos, w), out count)
                            && count + best[pos + w] == best[pos])
                        {
                            path.Append(tPos + pos).Append('s').Append(tPos + pos + w).Append('s');
                            step = w;
                            break;
                        }
                    }
                    pos += step;
                }

                segPos = path.ToString();
                maxScore = score + best[0];
            }
        }
    }

    _ _ _ ******************************************** ****** Never ever let you down. ***** ********************************************
  • 相关阅读:
    redis参数AOF参数的bug
    tidb损坏tikv节点怎么恢复集群
    mysql主从延时临时解决办法
    python脚本批量杀死redis链接
    pt-online-schema-change 脚本化
    mysql查看锁等信息SQL
    mongo复制集脑裂问题如何处理
    日志收集及网络包收集方案
    各浏览器下载文件名不乱码的解决办法
    java 中 byte[]、File、InputStream 互相转换
  • 原文地址:https://www.cnblogs.com/westfruit/p/1813538.html
Copyright © 2011-2022 走看看