zoukankan      html  css  js  c++  java
  • 简单的中文分词加上 k-means 聚类 (C++)

    程序代码参考了csdn某博客,具体名字忘记了 

    公共类型别名(typedef)的头文件

    //common.h
    #ifndef COMM_H
    #define COMM_H
    
    #include <iostream>
    #include <vector>
    #include <string>
    #include <algorithm>
    #include  <iterator>
    using namespace std;
    
    // Container aliases shared by every module of the program.
    typedef vector<string> StrVec;          // list of strings
    typedef vector<int> IntVec;             // list of integers
    typedef vector<double> DoubleVec;       // list of floating-point values
    typedef vector<IntVec> Int2DVec;        // two-dimensional integer table
    typedef vector<DoubleVec> Double2DVec;  // two-dimensional floating-point table
    #endif

    去除停用词语

    #pragma once
    #include "common.h"
    
    // Recognises stop words so that callers can drop them from token lists.
    class StopWordsHandler
    {
    public:
        StopWordsHandler();
        ~StopWordsHandler();

        // Returns true when `str` is found in the stop-word list.
        // `str` is taken by mutable reference: the implementation lower-cases
        // it in place before the lookup.
        bool IsStopWord(string& str);

    private:
        StrVec stopwords; // the words searched by IsStopWord
    };
    #include "StopWordHandler.h"
    
    // Built-in stop-word table; the StopWordsHandler constructor copies every entry.
    // The empty string is one of the entries, so an empty token counts as a stop word.
    // NOTE(review): most entries are empty strings - possibly characters that were
    // lost when the code was transcribed; confirm against the original list.
    string StopWordList[] = {"", "我们","","自己","","","","","","","","","","","","","","","","","","","","","","","","",""};// stop words
    // Number of entries in StopWordList.
    int strwordlen = sizeof(StopWordList) / sizeof(StopWordList[0]);
    // Copies all `strwordlen` entries of the global StopWordList table into
    // this handler's own list.
    StopWordsHandler::StopWordsHandler()
    {
        stopwords.insert(stopwords.end(), StopWordList, StopWordList + strwordlen);
    }

    // Nothing to release by hand; the member vector cleans up after itself.
    StopWordsHandler::~StopWordsHandler()
    {
    }
    
    // Reports whether `str` is a stop word.
    //
    // `str` is lower-cased in place first (the caller observes the change) and
    // is then compared, as an exact match, against every stored stop word.
    bool StopWordsHandler::IsStopWord(string& str)
    {
        // Route every byte through unsigned char: handing tolower() a negative
        // char (any byte >= 0x80, e.g. in multi-byte Chinese text) is undefined
        // behaviour. The explicit loop also avoids passing the overloaded name
        // `tolower` as a function argument.
        for (string::size_type i = 0; i < str.size(); ++i)
            str[i] = static_cast<char>(tolower(static_cast<unsigned char>(str[i])));

        return find(stopwords.begin(), stopwords.end(), str) != stopwords.end();
    }

    分词选用了最简单的方法:输入文本已预先用空格分好词,程序只需按空格切分即可

    #pragma once
    #include "Common.h"
    
    // Abstract interface for word-segmentation strategies.
    class ITokeniser
    {
    public:
        // Virtual so that deleting a tokeniser through an ITokeniser* (as
        // TFIDFMeasure's destructor does) runs the derived class's destructor;
        // without it such a delete is undefined behaviour.
        virtual ~ITokeniser() {}

        // Splits `input` into words and stores them in `retWords`.
        virtual void Partition(string input,StrVec& retWords)=0;
    };
    #pragma once
    #include "Itokenisher.h"
    
    // Concrete ITokeniser; its Partition implementation splits the input on
    // space characters.
    class Tokeniser : public ITokeniser
    {
    public:
        Tokeniser();
        ~Tokeniser();

        // Implements ITokeniser::Partition.
        virtual void Partition(string input, StrVec& retWords);
    };
    #include "Tokeniser.h"
    #include "StopWordHandler.h"
    #include <iterator>
    // The tokeniser keeps no state, so there is nothing to set up.
    Tokeniser::Tokeniser()
    {
    }

    // Likewise nothing to tear down.
    Tokeniser::~Tokeniser()
    {
    }
    // Splits `input` on single space characters and appends every token that
    // is not a stop word to `retWord`.
    //
    // input   - text to split; taken by value and lower-cased locally.
    // retWord - receives the surviving tokens, in order; existing content is kept.
    //
    // Empty tokens (produced by consecutive or trailing spaces, or by an empty
    // input) go through the stop-word check like any other token.
    void Tokeniser::Partition(string input ,StrVec& retWord)
    {
        // Cast through unsigned char: tolower() on a negative char (bytes
        // >= 0x80 in multi-byte text) is undefined behaviour.
        for (string::size_type i = 0; i < input.size(); ++i)
            input[i] = static_cast<char>(tolower(static_cast<unsigned char>(input[i])));

        StopWordsHandler stopHandler;

        // Walk the string by index instead of erasing the consumed prefix:
        // erasing invalidated the iterator that the old loop condition then
        // compared, and made the scan quadratic.
        string::size_type start = 0;
        while (true)
        {
            const string::size_type space = input.find(' ', start);
            const string::size_type length =
                (space == string::npos) ? string::npos : space - start;
            string token = input.substr(start, length);

            if ( !stopHandler.IsStopWord(token))
                retWord.push_back(token);

            if (space == string::npos)
                break;
            start = space + 1;
        }
    }

    TFIDF的计算

    #pragma once
    
    #include "Itokenisher.h"
    #include <map>
    
    // Computes a TF-IDF weight for every (term, document) pair of a document set.
    //
    // The object takes ownership of the tokeniser passed to its constructor and
    // deletes it in the destructor.
    class TFIDFMeasure
    {
    private:
        StrVec _docs; // document collection, one string per document
        int _numDocs; // number of documents
        int _numTerms;// number of distinct terms
        StrVec _terms;// the distinct terms
        Int2DVec _termFreq ;// [term][doc] number of occurrences
        Double2DVec _termWeight;// [term][doc] TF-IDF weight
        IntVec _maxTermFreq ;// [doc] highest occurrence count of any term in the document
        IntVec _docFreq;// [term] number of documents containing the term
        ITokeniser* _tokeniser;// tokeniser; owned, deleted by the destructor
        map<string , int > _wordIndex;// term -> index into _terms

        // Not copyable: a copy would share _tokeniser and delete it twice.
        // Declared private and left undefined on purpose.
        TFIDFMeasure(const TFIDFMeasure&);
        TFIDFMeasure& operator=(const TFIDFMeasure&);
    public :
        // Stores the documents, takes ownership of `tokeniser` and builds all tables.
        TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser);
        ~TFIDFMeasure();
        // Number of distinct terms found in the document set.
        inline int NumTerm( ) const
        {
            return this->_numTerms;
        }
        // Fills `vec` with the weight of every term for document `doc`.
        void GetTermVector(int doc , DoubleVec& vec);

    protected:
        void init();// builds the term list and the frequency/weight tables
        void GenerateTerms(const StrVec& ,StrVec& terms);// tokenises the documents and collects distinct terms
        void GenerateTermFrequency();// fills the occurrence and document-frequency tables
        void GenerateTermWeight();// fills the weight table
        void GetWordFrequency( string & input ,map<string,int> &freq);// counts the words of one document
        int CountWords(string& word ,const StrVec& words);// occurrences of `word` in `words`
        int GetTermIndex(const string& term);// index of `term`, or -1 when unknown
        double ComputeTermWeight(int term ,int doc);// TF-IDF weight of a term in a document
        double GetTermFrequency(int term , int doc);// normalised frequency of a term in a document
        double GetInverseDoucumentFrequency(int term); // inverse document frequency of a term



    };
    #include "TF_IDF.h"
    
    // Deletes the tokeniser received by the constructor and empties the
    // document, term and index containers.
    TFIDFMeasure::~TFIDFMeasure()
    {
        if (_tokeniser)
        {
            delete _tokeniser;
            _tokeniser = NULL;
        }
        _wordIndex.clear();
        _terms.clear();
        _docs.clear();
    }
    // Copies the documents, remembers the tokeniser and builds every table.
    //
    // document  - one string per document.
    // tokeniser - used to split documents into words; deleted by the destructor.
    TFIDFMeasure::TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser )
        : _docs(document)
        , _numDocs(static_cast<int>(document.size()))
        , _tokeniser(tokeniser)
    {
        init();
    }
    void TFIDFMeasure::init()
    {
        //初始化
        this->GenerateTerms(_docs,_terms); //分词
        this->_numTerms = _terms.size(); //所有文档中的词项数目
    
        //申请空间
        _maxTermFreq.resize(_numDocs);
        _docFreq.resize(_numTerms);
        _termFreq.resize(_numTerms);
        _termWeight.resize(_numTerms);
    
        for (int i = 0 ; i < _terms.size() ; i++)
        {
            _termWeight[i].resize(_numDocs);
            _termFreq[i].resize(_numDocs);
            _wordIndex[_terms[i]] = i; //将单词放入单词映射表中
    
        }
        this->GenerateTermFrequency();
        this->GenerateTermWeight();
    
    }
    void TFIDFMeasure::GenerateTerms(const StrVec& docs ,StrVec &terms)
    {
        for (int i = 0 ; i < docs.size() ;  i++)
        {
            StrVec words;
            _tokeniser->Partition(docs[i] , words); //分词部分
    
            for ( int j = 0 ; j < words.size() ; j++)
            {
                if ( find(terms.begin() , terms.end(),words[j] ) == terms.end())
                    terms.push_back(words[j]);
            }
    
        }
    }
    void TFIDFMeasure::GenerateTermFrequency()
    {
        //计算每个单词在每份文档中出现的概率
        for ( int i = 0 ; i < _numDocs ; i++)
        {
            string curDoc = _docs[i]; //当前待处理的文档
            map<string,int> freq;
            this->GetWordFrequency(curDoc ,freq);
            map<string,int>::iterator iter;
            _maxTermFreq[i] = numeric_limits<int>::min();
            for ( iter = freq.begin() ; iter != freq.end() ; iter++)
            {
                string word = iter->first;
                int wordFreq = iter->second;
                int termIndex = GetTermIndex(word); //单词下标
                if ( termIndex == -1)
                    continue;
                _termFreq[termIndex][i] = wordFreq;
                _docFreq[termIndex]++;
    
                if ( wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq;
            }
        }
    }
    // Looks `term` up in the term -> index map.
    // Returns the stored index, or -1 when the term is not in the map.
    int TFIDFMeasure::GetTermIndex(const string & term)
    {
        map<string , int>::iterator hit = _wordIndex.find(term);
        return (hit == _wordIndex.end()) ? -1 : hit->second;
    }
    // Unary predicate that is true for strings equal to the word captured at
    // construction time.
    class WordComp
    {
    public:
        // Takes a const reference so that const strings and temporaries are
        // accepted; the word is copied into the predicate.
        WordComp(const string& sWord) : word(sWord)
        {
        }

        // const so that the predicate can also be invoked through a const object.
        bool operator() (const string& lhs) const
        {
            return lhs.compare(word)==0;
        }
    private:
        string word; // private copy of the word to match
    };
    // Counts how often every word occurs in one document.
    //
    // input - document text; lower-cased in place before it is tokenised.
    // freq  - receives word -> occurrence count; an existing entry for a word
    //         found in this document is overwritten.
    void TFIDFMeasure::GetWordFrequency( string & input , map<string,int>& freq)
    {
        // Cast through unsigned char: tolower() on a negative char (bytes
        // >= 0x80 in multi-byte text) is undefined behaviour.
        for (string::size_type i = 0; i < input.size(); ++i)
            input[i] = static_cast<char>(tolower(static_cast<unsigned char>(input[i])));

        StrVec words;
        this->_tokeniser->Partition(input , words);

        // Tally every occurrence in one pass. The previous code called
        // std::unique and ignored its result, which left unspecified elements
        // at the tail of the list and undercounted adjacent repeats.
        map<string,int> counts;
        for (StrVec::const_iterator word = words.begin(); word != words.end(); ++word)
            ++counts[*word];

        for (map<string,int>::const_iterator entry = counts.begin(); entry != counts.end(); ++entry)
            freq[entry->first] = entry->second;
    }
    int TFIDFMeasure::CountWords(string & word ,const StrVec& temp)
    {
        //计算每个单词在该文档的词频数目
        int ncount = 0 ;
        ncount = count_if(temp.begin() , temp.end() , WordComp(word));
        return ncount ;
    }
    // Fills the weight table: one ComputeTermWeight result per (term, document).
    void TFIDFMeasure::GenerateTermWeight()
    {
        for (int term = 0; term < _numTerms; ++term)
        {
            DoubleVec& row = _termWeight[term];
            for (int doc = 0; doc < _numDocs; ++doc)
                row[doc] = ComputeTermWeight(term, doc);
        }
    }
    // TF-IDF weight of `term` in `doc`: normalised term frequency multiplied
    // by the term's inverse document frequency.
    double TFIDFMeasure::ComputeTermWeight(int term , int doc)
    {
        // Keep both factors in double; storing them in float discarded
        // precision that the double return type suggests is available.
        const double tf = GetTermFrequency(term , doc);
        const double idf = GetInverseDoucumentFrequency(term);
        return tf * idf ;
    }
    // Frequency of `term` in `doc`, normalised by the highest occurrence count
    // of any term in that document.
    // Returns 0 when the document recorded no positive maximum (for example a
    // document in which no term was counted).
    double TFIDFMeasure::GetTermFrequency(int term , int doc)
    {
        const int freq = _termFreq[term][doc];
        const int maxfreq = _maxTermFreq[doc];

        // Guard the division: a document without counted terms leaves a
        // non-positive sentinel in _maxTermFreq.
        if (maxfreq <= 0)
            return 0.0;

        return static_cast<double>(freq) / static_cast<double>(maxfreq);
    }
    // Inverse document frequency of `term`: log(number of documents / number
    // of documents that contain the term).
    // Returns 0 for a term recorded in no document instead of dividing by zero
    // (which produced infinity here and NaN once multiplied by a zero frequency).
    double TFIDFMeasure::GetInverseDoucumentFrequency(int term)
    {
        const int df = _docFreq[term];
        if (df <= 0)
            return 0.0;

        return log(static_cast<double>(_numDocs) / static_cast<double>(df));
    }
    void TFIDFMeasure::GetTermVector(int doc ,DoubleVec& vec)
    {
        vec.resize(this->_numTerms);
        for ( int i = 0 ; i < this->_numTerms ; i++)
            vec[i] = _termWeight[i][doc];
    }

    计算余弦相似性距离

    #pragma once
    #include "common.h"
    
    // Stateless helpers for vector arithmetic on DoubleVec values.
    class TermVector
    {
    public:
        // Euclidean length of `v`.
        static double VectorLength(const DoubleVec & v);
        // Dot product of two vectors of equal size.
        static double innerProduct(const DoubleVec& v1 ,const DoubleVec& v2);
        // Cosine of the angle between two vectors of equal size.
        static double ComputerCosineSimilarity(const DoubleVec& vector1 , const DoubleVec& vector2 );
    };
    #include "TermVector.h"
    #include <cmath>
    
    // Cosine similarity of `v1` and `v2`: dot product divided by the product
    // of the two lengths. Returns 0 when that product is zero.
    // Throws a std::string when the vectors differ in size.
    double TermVector::ComputerCosineSimilarity(const DoubleVec & v1 , const DoubleVec& v2)
    {
        if ( v1.size() != v2.size())
            throw string("different length");

        const double lengthProduct = VectorLength(v1) * VectorLength(v2);
        if (lengthProduct != 0)
            return innerProduct(v1 , v2) / lengthProduct;

        return 0;
    }
    
    // Dot product of `v1` and `v2`.
    // Throws a std::string when the vectors differ in size.
    double TermVector::innerProduct(const DoubleVec & v1 , const DoubleVec& v2)
    {
        if ( v1.size() != v2.size())
            throw string ("different length");

        double total = 0.0f;
        DoubleVec::const_iterator rhs = v2.begin();
        for (DoubleVec::const_iterator lhs = v1.begin(); lhs != v1.end(); ++lhs, ++rhs)
            total += (*lhs) * (*rhs);
        return total;
    }
    // Euclidean length of `v`: square root of the sum of squared components.
    double TermVector::VectorLength(const DoubleVec & v)
    {
        double squares = 0.0f;
        for (DoubleVec::const_iterator component = v.begin(); component != v.end(); ++component)
            squares = squares + ((*component) * (*component));
        return (double)sqrt(squares);
    }

    定义cluster的类

    #pragma once
    #include "common.h"
    // One cluster: the indices of its member points and its centre.
    class Cluster
    {
    public:
        IntVec CurrentMembership; // indices of the data points in this cluster
        DoubleVec Mean ;          // centre of the cluster

        // Creates a cluster with no members and an empty centre.
        Cluster();
        // Creates a cluster holding point `dataindex`, with `data` as its centre.
        Cluster(int dataindex , DoubleVec & data);
        ~Cluster();

        // Updates Mean from the member points found in `coordinates`.
        void UpdateMean(Double2DVec & coordinates);
    };
    #include "cluster.h"
    
    // Default cluster: no members, empty centre.
    Cluster::Cluster()
    {
    }

    // Seeds a cluster with one point: `dataindex` becomes its only member and
    // a copy of `data` becomes its centre.
    Cluster::Cluster(int dataindex , DoubleVec& data)
        : Mean(data.begin(), data.end())
    {
        CurrentMembership.push_back(dataindex);
    }
    
    // Recomputes Mean as the component-wise average of the member points.
    //
    // coordinates - all data points, one row per point; CurrentMembership
    //               holds the row indices that belong to this cluster.
    //
    // A cluster without members keeps its previous centre.
    void Cluster::UpdateMean(Double2DVec & coordinates)
    {
        if (CurrentMembership.empty())
            return; // nothing to average; also avoids dividing by zero

        // Accumulate into a fresh, zeroed vector. The previous code added onto
        // the old centre, divided inside the member loop, and divided by the
        // number of dimensions instead of the number of members.
        const DoubleVec::size_type dims = coordinates[CurrentMembership[0]].size();
        DoubleVec sum(dims, 0.0);

        for (IntVec::size_type m = 0 ; m < CurrentMembership.size() ; m++)
        {
            const DoubleVec& coord = coordinates[CurrentMembership[m]];
            for (DoubleVec::size_type d = 0 ; d < dims && d < coord.size() ; d++)
                sum[d] += coord[d];
        }

        const double members = static_cast<double>(CurrentMembership.size());
        for (DoubleVec::size_type d = 0 ; d < dims ; d++)
            sum[d] /= members;

        Mean.swap(sum);
    }
    // Nothing to release by hand; the member vectors clean up after themselves.
    Cluster::~Cluster()
    {
    }
    #pragma once
    #include "common.h"
    
    class Cluster;

    // K-means clustering over a set of data points.
    //
    // The Cluster objects referenced by _clusters are owned by this object and
    // deleted in its destructor.
    class KMeans
    {
    public:
        vector<Cluster*> _clusters; // the clusters; owned by this object
        // Copies `data` and picks the initial clusters; K is the number of clusters.
        KMeans(Double2DVec& data, int K);
        // Runs the clustering iterations.
        void Start();
        ~KMeans();
    private:
        // Not copyable: a copy would share the Cluster pointers and delete
        // them twice. Declared private and left undefined on purpose.
        KMeans(const KMeans&);
        KMeans& operator=(const KMeans&);

        int _coordCount; // number of data points
        Double2DVec _coordinates;// the data points
        int _k;  // number of clusters
        IntVec _clusterAssignments; // [point] cluster the point is currently assigned to

        IntVec _nearestCluster; // [point] nearest cluster found in the current iteration

        Double2DVec _distanceCache; // [point][cluster] distance to the cluster centre
        void InitRandom(); // picks the initial cluster seeds at random
        static double getDistance(const DoubleVec & coord ,const DoubleVec& center);
        int NearestCluster(int ndx); // index of the cluster closest to point ndx


    };
    #include "kmean.h"
    #include <time.h>
    #include "cluster.h"
    #include "TermVector.h"
    #include <limits>
    // Copies the data points, sizes the bookkeeping tables and seeds the clusters.
    //
    // data - one row per data point.
    // k    - requested number of clusters; negative values are treated as 0.
    KMeans::KMeans(Double2DVec &data , int k )
    {
        _coordinates = data;
        _coordCount = static_cast<int>(data.size());
        _k = (k < 0) ? 0 : k;

        _clusters.resize(_k);

        // -1 marks "not assigned yet", so an unassigned point can never be
        // mistaken for a member of cluster 0.
        _clusterAssignments.assign(_coordCount, -1);
        _nearestCluster.resize(_coordCount);

        // One row per point and one column per cluster. The cache used to be
        // points x points, which was overrun whenever k exceeded the number
        // of points.
        _distanceCache.resize(_coordCount);
        for ( int i = 0 ; i <_coordCount ; i++)
            _distanceCache[i].resize(_k);

        InitRandom();
    }
    // Creates the _k initial clusters, each seeded with a randomly chosen data point.
    //
    // Seeds are distinct while unused points remain. With more clusters than
    // points the points are reused; with no points at all the clusters are
    // created empty (previously this case evaluated rand() % 0).
    void KMeans::InitRandom()
    {
        srand(unsigned(time(NULL)));

        // Indices that have not been used as a seed yet occupy [0, remaining).
        IntVec candidates(_coordCount);
        for (int i = 0 ; i < _coordCount ; i++)
            candidates[i] = i;
        int remaining = _coordCount;

        for (int c = 0 ; c < _k ; c++)
        {
            if (_coordCount <= 0)
            {
                _clusters[c] = new Cluster();
                continue;
            }
            if (remaining == 0)
                remaining = _coordCount; // every point used once: start over

            // Draw one unused index and move it behind the unused range.
            const int pick = rand() % remaining;
            const int seed = candidates[pick];
            candidates[pick] = candidates[remaining - 1];
            candidates[remaining - 1] = seed;
            --remaining;

            _clusterAssignments[seed] = c;
            _clusters[c] = new Cluster(seed ,_coordinates[seed]);
        }
    }
    
    // Runs the clustering loop until no point changes cluster.
    //
    // Each pass updates the cluster centres, measures the distance from every
    // point to every centre, finds the nearest cluster per point and rebuilds
    // the membership lists. Progress is printed to standard output.
    //
    // NOTE(review): there is no cap on the number of iterations; the loop ends
    // only when the assignments stop changing.
    void KMeans::Start()
    {
        int iter = 0;
        while ( true)
        {
            cout <<"Iteration " << iter++ << " ...." <<endl;

            // Update every cluster centre from its current members.
            for ( int c = 0 ; c <_k ; c++)
                _clusters[c]->UpdateMean(_coordinates);

            // Distance from every point to every centre.
            for ( int i = 0 ; i <_coordCount ; i++)
                for ( int c = 0 ; c <_k ; c++)
                    _distanceCache[i][c] = getDistance(_coordinates[i],_clusters[c]->Mean);

            // Nearest cluster per point, noting whether anything moved.
            bool changed = false;
            for ( int i = 0 ; i <_coordCount ; i++)
            {
                _nearestCluster[i] = this->NearestCluster(i);
                if (_nearestCluster[i] != _clusterAssignments[i])
                    changed = true;
            }

            // Rebuild the membership lists BEFORE testing for convergence.
            // Otherwise a first pass that matches the initial assignment table
            // would stop with clusters holding only their seed points.
            for ( int c = 0 ; c < _k ; c++)
                _clusters[c]->CurrentMembership.clear();
            for ( int i = 0 ; i <_coordCount ; i++)
            {
                const int nearest = _nearestCluster[i];
                // NearestCluster returns -1 when it finds no candidate; never
                // use that as an index.
                if (nearest >= 0)
                    _clusters[nearest]->CurrentMembership.push_back(i);
                _clusterAssignments[i] = nearest;
            }

            if ( !changed)
                break;
        }
    }
    
    // Distance between a point and a centre: one minus their cosine similarity.
    double KMeans::getDistance(const DoubleVec& coord , const DoubleVec& center)
    {
        const double similarity = TermVector::ComputerCosineSimilarity(coord,center);
        return 1 - similarity;
    }
    // Returns the index of the cluster with the smallest cached distance to
    // point `ndx`; the first cluster wins a tie. Returns -1 when no cached
    // distance is below the largest finite double (for instance when there
    // are no clusters).
    int KMeans::NearestCluster(int ndx)
    {
        const DoubleVec& distances = _distanceCache[ndx];
        double smallest = numeric_limits<double>::max();
        int closest = -1 ;
        for ( int cluster = 0 ; cluster < _k ; ++cluster)
        {
            if (distances[cluster] < smallest)
            {
                smallest = distances[cluster];
                closest = cluster;
            }
        }
        return closest;
    }
    // Deletes every Cluster referenced by _clusters, then empties the list.
    KMeans::~KMeans()
    {
        for (vector<Cluster*>::size_type i = 0; i < _clusters.size(); ++i)
            delete _clusters[i];
        _clusters.clear();
    }
    #include "TF_IDF.h"
    #include "Tokeniser.h"
    #include <fstream>
    #include "kmean.h"
    #include "cluster.h"
    int main()
    {
        // 读入文档数据 
        StrVec strVec;
        ifstream inFile("c:\\input.txt");
        string tempstr;
        while ( getline(inFile , tempstr))
        {
            strVec.push_back(tempstr);
        }
        TFIDFMeasure tf(strVec , new Tokeniser());
    
        int K =3 ; //聚类的个数
        int docCount = strVec.size();
        //生成k-mean的输入数据
        Double2DVec data;
        data.resize(docCount);
        int dimension = tf.NumTerm();
        for ( int i = 0 ; i < docCount ; i++)
        {
            
                tf.GetTermVector( i , data[i]); //获取第i个文档的TFIDF权重向量
        }
        KMeans kmeans(data , K );
        kmeans.Start();
    
        vector<Cluster*> clusters = kmeans._clusters;
        vector<Cluster*>::iterator iter;
        IntVec::iterator it2 ;
        for ( iter = clusters.begin() ; iter != clusters.end() ; iter++)
        {
            cout <<"------------------------------------" <<endl;
            IntVec & vec = (*iter)->CurrentMembership;
            for ( it2 = vec.begin() ; it2 != vec.end() ; it2++)
                cout <<strVec[*it2] <<endl;
    
        }
        system("pause");
        return 0 ;
    }
  • 相关阅读:
    Sql优化(二) 快速计算Distinct Count
    Minor GC和Full GC区别(转)
    对于JVM中方法区,永久代,元空间以及字符串常量池的迁移和string.intern方法
    2017/6/29杂记
    java中代理,静态代理,动态代理以及spring aop代理方式,实现原理统一汇总
    ThreadLocal的设计理念与作用
    2017/6/28杂记
    关于hashmap 与concurrentHashMap
    2017/6/27杂记
    深入理解java异常处理机制
  • 原文地址:https://www.cnblogs.com/lzhenf/p/2442526.html
Copyright © 2011-2022 走看看