zoukankan      html  css  js  c++  java
  • 也谈贝叶斯分类(C#)版本

       

    代码下载

     最近在做一个大作业:搭建一个信息检索平台。其中用到了贝叶斯分类,参考了洞庭散人大哥的技术博客:

    http://www.cnblogs.com/phinecos/archive/2008/10/21/1316044.html

    但是,他的算法运行起来很慢,原因是IO操作过于频繁,而且有些IO操作是可以避免的。下面开始介绍我的贝叶斯分类算法实现。

    分词器采用河北理工大学吕震宇老师的 SharpICTCLAS。该分词器没有 Lucene 接口,因此我自己实现了 Analyzer 和 Tokenizer 类,如下:

    ICTCLASAnalyzer
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Standard;

    namespace Bayes
    {
        
    /// <summary>
    /// Lucene analyzer for Chinese/English text. Tokens come from
    /// <see cref="ICTCLASTokenizer"/> and are then passed through a StandardFilter,
    /// a LowerCaseFilter and a StopFilter built from the stop-word file.
    /// </summary>
    class ICTCLASAnalyzer:Analyzer
    {
        /// <summary>
        /// Fixed-size stop-word table (at most 400 entries). Slots the stop-word file
        /// did not fill are left as they were (initially <c>null</c>).
        /// </summary>
        public static readonly System.String[] CHINESE_ENGLISH_STOP_WORDS = new string[400];

        /// <summary>Path of the stop-word file; it is read one word per line.</summary>
        public string NoisePath = Environment.CurrentDirectory + "\\data\\stopwords.txt";

        // The stop words this instance actually read, without the unused table slots.
        // This is what gets handed to the StopFilter.
        private readonly string[] loadedStopWords;

        /// <summary>
        /// Loads the stop words from <see cref="NoisePath"/> into
        /// <see cref="CHINESE_ENGLISH_STOP_WORDS"/>. Reading stops at end of file, at the
        /// first empty line, or once the table is full, whichever comes first.
        /// </summary>
        public ICTCLASAnalyzer()
        {
            int count = 0;

            // 'using' releases the file handle even when reading throws.
            using (StreamReader reader = new StreamReader(NoisePath, System.Text.Encoding.Default))
            {
                string noise = reader.ReadLine();
                while (!string.IsNullOrEmpty(noise) && count < CHINESE_ENGLISH_STOP_WORDS.Length)
                {
                    CHINESE_ENGLISH_STOP_WORDS[count] = noise;
                    count++;
                    noise = reader.ReadLine();
                }
            }

            // Keep a trimmed copy so that null slots never reach the stop filter.
            // NOTE(review): depending on the Lucene.Net version, a null entry in the
            // stop-word array may be rejected when the filter builds its lookup set.
            loadedStopWords = new string[count];
            Array.Copy(CHINESE_ENGLISH_STOP_WORDS, loadedStopWords, count);
        }

        /// <summary>
        /// Builds the token stream for a field: ICTCLASTokenizer, then StandardFilter,
        /// LowerCaseFilter and StopFilter, in that order.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The filtered token stream.</returns>
        public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
        {
            TokenStream result = new ICTCLASTokenizer(reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, loadedStopWords);
            return result;
        }
    }
    }
    ICTCLASTokenizer
    using System;
    using System.Collections.Generic;
    using System.Text;
    using Lucene.Net.Analysis;
    using Lucene.Net.Documents;
    using Lucene.Net.Analysis.Standard;
    using System.IO;
    using SharpICTCLAS;


    namespace Bayes
    {
        
    /// <summary>
    /// Lucene tokenizer backed by the SharpICTCLAS word segmenter. The whole input is
    /// read and segmented in the constructor; <see cref="Next"/> then hands out the
    /// segmented words one at a time.
    /// </summary>
    class ICTCLASTokenizer:Tokenizer
    {
        // Passed to WordSegment.Segment as its second argument.
        int nKind = 1;
        // Segmentation output; only the first entry (result[0]) is consumed.
        List<WordResult[]> result;
        // Start offset of the next token within 'sentence'.
        int startIndex = 0;
        // End offset (exclusive) of the most recently returned token.
        int endIndex = 0;
        // Index of the next word in result[0]. Iteration runs from 1 to Length - 2, so
        // the first and last entries are never emitted.
        // NOTE(review): presumably those are sentence begin/end markers added by the
        // segmenter - confirm against SharpICTCLAS.
        int i = 1;

        /// <summary>The text to be segmented.</summary>
        private string sentence;

        /// <summary>
        /// Reads all text from <paramref name="reader"/>, strips "\r\n" line breaks and
        /// segments it using the dictionary found in the "Data" folder under the current
        /// directory.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public ICTCLASTokenizer(System.IO.TextReader reader)
        {
            this.input = reader;
            sentence = input.ReadToEnd();
            sentence = sentence.Replace("\r\n", "");

            // The dictionary path is passed with a trailing directory separator.
            string DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;

            // NOTE(review): the dictionary is initialized for every tokenizer instance;
            // sharing one WordSegment would save work if it is safe to reuse - confirm.
            WordSegment wordSegment = new WordSegment();
            wordSegment.InitWordSegment(DictPath);
            result = wordSegment.Segment(sentence, nKind);
        }

        /// <summary>
        /// Returns the next token, or <c>null</c> when no words are left (including the
        /// case where segmentation produced no result at all).
        /// </summary>
        /// <remarks>
        /// Offsets are computed by accumulating word lengths, so they refer to the text
        /// after "\r\n" removal and assume the words are contiguous. The end offset is
        /// one past the last character of the word.
        /// </remarks>
        public override Token Next()
        {
            // Guard against an empty segmentation result before indexing result[0].
            if (result == null || result.Count == 0 || result[0] == null)
            {
                return null;
            }

            if (i >= result[0].Length - 1)
            {
                return null;
            }

            string word = result[0][i].sWord;
            endIndex = startIndex + word.Length;
            Token token = new Token(word, startIndex, endIndex);

            // The next word starts where this one ended.
            startIndex = endIndex;
            i++;
            return token;
        }
    }
    }

     下面开始介绍我的实现,共分为五个类:ChineseSpliter 用于分词;ClassifyResult 用于储存分类结果;MemoryTrainingDataManager 用于管理IO操作;FastNaiveBayesClassification 用于实现贝叶斯算法;另外还有一个 StoreClass,用于在内存中保存某一类别的训练文本(本文未贴出其定义)。和洞庭散人不同之处在于,我把计算先验概率、条件概率、联合概率的函数都写在了同一个类里,而不是分散在多个类中,这样做的目的在于避免不必要的IO操作。

    ClassifyResult
    using System;
    using System.Collections.Generic;
    using System.Text;

    namespace Bayes
    {
        
    /// <summary>
    /// A class name paired with the score computed for it.
    /// </summary>
    class ClassifyResult
    {
        /// <summary>Name of the class; empty until assigned.</summary>
        public string className = "";

        /// <summary>Score of the class; zero until assigned.</summary>
        public float score = 0;

        /// <summary>Creates a result with an empty class name and a zero score.</summary>
        public ClassifyResult()
        {
        }
    }
    }
    ChineseSpliter
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.IO;
    using Lucene.Net.Analysis;


    namespace Bayes
    {
        
    /// <summary>
    /// Splits text into terms by running it through <see cref="ICTCLASAnalyzer"/>.
    /// </summary>
    class ChineseSpliter
    {
        // Created on first use and then reused: constructing the analyzer reads the
        // stop-word file, which does not need to happen on every Split call.
        private Analyzer analyzer;

        /// <summary>
        /// Tokenizes <paramref name="text"/> and joins the resulting terms with
        /// <paramref name="splitToken"/>.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <param name="splitToken">Separator placed between consecutive terms.</param>
        /// <returns>
        /// The terms joined by the separator, with no leading or trailing separator;
        /// an empty string when the text yields no terms.
        /// </returns>
        public string Split(string text,string splitToken)
        {
            if (analyzer == null)
            {
                analyzer = new ICTCLASAnalyzer();
            }

            //TokenStream ts = analyzer.ReusableTokenStream("", new StringReader(text));
            TokenStream ts = analyzer.TokenStream("", new StringReader(text));

            StringBuilder sb = new StringBuilder();
            bool first = true;
            Lucene.Net.Analysis.Token token;
            while ((token = ts.Next()) != null)
            {
                // Insert the separator only between terms. Prepending it and then
                // trimming a single character left part of a multi-character separator
                // glued to the first term, and threw when there were no terms.
                if (!first)
                {
                    sb.Append(splitToken);
                }
                sb.Append(token.TermText());
                first = false;
            }

            return sb.ToString();
        }

        /// <summary>
        /// Splits a string produced by <see cref="Split"/> back into individual terms.
        /// </summary>
        /// <param name="result">The joined term string.</param>
        /// <param name="spliter">The separator that was used to join the terms.</param>
        /// <returns>The terms, with empty entries removed.</returns>
        public string[] GetTerms(string result, string spliter)
        {
            return result.Split(new string[] { spliter }, StringSplitOptions.RemoveEmptyEntries);
        }
    }
    }

      

    MemoryTrainingDataManager
    using System;
    using System.Collections.Generic;
    using System.Text;
    using System.IO;



    namespace Bayes
    {
        
    /// <summary>
    /// Handles all disk access for the training corpus: lists the category
    /// directories, counts their files and loads file contents into a StoreClass.
    /// </summary>
    /// <remarks>
    /// Call <see cref="GetClassifications"/> to fill <see cref="txtClassifications"/>,
    /// then <see cref="GetTotalFileCount"/> to fill <see cref="totalFileCount"/>.
    /// </remarks>
    class MemoryTrainingDataManager
    {
        /// <summary>Paths of the category sub-directories of the training corpus.</summary>
        public String[] txtClassifications;

        // Root directory of the training corpus; each sub-directory is one category.
        private static String defaultPath = "F:\\TrainingSet";

        /// <summary>Total number of files over all category directories.</summary>
        public int totalFileCount;

        /// <summary>
        /// Stores the sub-directories of the corpus root in <see cref="txtClassifications"/>.
        /// </summary>
        public void   GetClassifications()
        {
            this.txtClassifications = Directory.GetDirectories(defaultPath);
        }

        /// <summary>Counts the files directly inside one category directory.</summary>
        /// <param name="subclass">Path of the category directory.</param>
        /// <returns>Number of files in that directory.</returns>
        public int GetSubClassFileCount(string subclass)
        {
            return Directory.GetFiles(subclass).Length;
        }

        /// <summary>
        /// Sums the file counts of all category directories into
        /// <see cref="totalFileCount"/>.
        /// </summary>
        public void  GetTotalFileCount()
        {
            // Load the category list on demand so that calling this method first does
            // not fail on a null array.
            if (txtClassifications == null)
            {
                GetClassifications();
            }

            int count = 0;
            foreach (string classification in txtClassifications)
            {
                count += GetSubClassFileCount(classification);
            }
            totalFileCount = count;
        }

        /// <summary>Reads a whole file as text using the system default encoding.</summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <returns>The file content.</returns>
        public string GetText(string filePath)
        {
            // 'using' closes the reader even if ReadToEnd throws.
            using (StreamReader sr = new StreamReader(filePath, Encoding.Default))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Fills <paramref name="sc"/> with the name, file count and file contents of
        /// one category directory.
        /// </summary>
        /// <param name="sc">Structure that receives the category data.</param>
        /// <param name="subclass">Path of the category directory.</param>
        public void  SetMainMemoryStructure(ref StoreClass sc ,string subclass)
        {
            string[] paths = Directory.GetFiles(subclass);
            sc.classificationName = subclass;
            sc.classificationCount = paths.Length;
            sc.strFileContentList = new string[sc.classificationCount];
            for (int k = 0; k < paths.Length; k++)
            {
                sc.strFileContentList[k] = GetText(paths[k]);
            }
        }

        /// <summary>
        /// Counts how many of the category's files contain <paramref name="key"/> as a
        /// substring.
        /// </summary>
        /// <param name="key">Term to look for.</param>
        /// <param name="sc">Category data previously loaded into memory.</param>
        /// <returns>Number of files whose content contains the term.</returns>
        public int GetKeyCountOfSubClass(string key, ref StoreClass sc)
        {
            int count = 0;
            for (int i = 0; i < sc.classificationCount; i++)
            {
                if (sc.strFileContentList[i].Contains(key))
                {
                    count++;
                }
            }
            return count;
        }
    }
    }
    FastNaiveBayesClassification
    using System;
    using System.Collections.Generic;
    using System.Text;

    namespace Bayes
    {
        
    /// <summary>
    /// Naive Bayes text classifier. Prior, conditional and joint probabilities are all
    /// computed in this one class, on top of category data loaded by
    /// <see cref="MemoryTrainingDataManager"/>.
    /// </summary>
    class FastNaiveBayesClassification
    {
        /// <summary>Provides the category list, file counts and file contents.</summary>
        public MemoryTrainingDataManager mtdm=new MemoryTrainingDataManager();

        private ChineseSpliter spliter = new ChineseSpliter();

        // Each conditional probability is multiplied by this factor before entering
        // the joint product.
        // NOTE(review): presumably meant to delay float underflow; with many terms the
        // product can still reach 0 or Infinity, which makes classes indistinguishable.
        private static float ZoomFactor = 10;

        /// <summary>
        /// Loads the category list and the total file count of the training corpus.
        /// </summary>
        public FastNaiveBayesClassification()
        {
            mtdm.GetClassifications();
            mtdm.GetTotalFileCount();
        }

        /// <summary>
        /// Prior probability of a class: Nc / N.
        /// </summary>
        /// <param name="Nc">Number of training files that belong to the class.</param>
        /// <param name="N">Total number of training files.</param>
        /// <returns>Nc divided by N.</returns>
        public float CalculatePriorProbability(float Nc,float N)
        {
            return Nc / N;
        }

        /// <summary>
        /// Smoothed conditional probability of a term given a class:
        /// (NxC + 1) / (Nc + M + number of classes), with M fixed at 0.
        /// </summary>
        /// <param name="NxC">Number of files in the class that contain the term.</param>
        /// <param name="Nc">Total number of files in the class.</param>
        /// <returns>The smoothed conditional probability.</returns>
        public float CalculateConditionalProbability(float NxC, float Nc)
        {
            float M = 0F;
            return (NxC + 1) / (Nc + M + mtdm.txtClassifications.Length);
        }

        /// <summary>
        /// Joint score of a class: the product of the scaled conditional probabilities
        /// of all terms, multiplied by the class prior.
        /// </summary>
        /// <param name="NxC">Per-term count of files in the class containing that term.</param>
        /// <param name="Nc">Total number of files in the class.</param>
        /// <param name="N">Total number of training files.</param>
        /// <returns>The joint score.</returns>
        public float CalculateJointProbability(float []NxC, float Nc, float  N)
        {
            float ret = 1;
            for (int i = 0; i < NxC.Length; i++)
            {
                ret *= CalculateConditionalProbability(NxC[i], Nc) * ZoomFactor;
            }
            return ret * CalculatePriorProbability(Nc, N);
        }

        /// <summary>Splits a text into its terms.</summary>
        /// <param name="text">Text to split.</param>
        /// <returns>The terms of the text.</returns>
        public string[] SplitTerms(string text)
        {
            string result = spliter.Split(text, "@@@");
            return spliter.GetTerms(result, "@@@");
        }

        /// <summary>
        /// Scores <paramref name="text"/> against every category and returns the
        /// category with the highest score. Each category's score is also written to
        /// the console.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>
        /// The best-scoring category (the first one wins a tie), or an empty
        /// <see cref="ClassifyResult"/> when there are no categories.
        /// </returns>
        public ClassifyResult Classify(string text)
        {
            int end = mtdm.txtClassifications.Length;
            if (end == 0)
            {
                // No categories to rank; return an empty result rather than indexing
                // into an empty array.
                return new ClassifyResult();
            }

            string[] terms = SplitTerms(text);
            float N = mtdm.totalFileCount;
            ClassifyResult best = null;

            for (int i = 0; i < end; i++)
            {
                // NOTE(review): the category's files are re-read from disk on every
                // call; caching the loaded data would trade memory for much less I/O.
                StoreClass sc = new StoreClass();
                mtdm.SetMainMemoryStructure(ref sc, mtdm.txtClassifications[i]);

                float Nc = sc.classificationCount;
                float[] Nxc = new float[terms.Length];
                for (int k = 0; k < terms.Length; k++)
                {
                    Nxc[k] = mtdm.GetKeyCountOfSubClass(terms[k], ref sc);
                }

                ClassifyResult current = new ClassifyResult();
                current.score = CalculateJointProbability(Nxc, Nc, N);
                current.className = sc.classificationName;
                Console.WriteLine("类别{0},分数{1}", current.className, current.score);

                // Only the maximum is needed, so track it in one pass. The strict '>'
                // keeps the earliest category on ties.
                if (best == null || current.score > best.score)
                {
                    best = current;
                }
            }

            return best;
        }
    }
    }
  • 相关阅读:
    接口的上溯造型——《Thinking in Java》随笔015
    数据库
    小结
    异常及String
    多态&接口
    继承&封装
    Java 类 对象 包
    Java 方法的应用
    Java数组的运用
    Java代码运用及算法思路养成——用*号输出形状
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/1631159.html
Copyright © 2011-2022 走看看