zoukankan      html  css  js  c++  java
  • 聚类算法效果评估entropy purity nmi

    1.数据管理脚本:原始文件格式id\tclusterId\tgoldstandardId

    DataManagement.py

    #!/usr/bin/python
    import cPickle as p;
    import sys;
    import re;
    if __name__ == "__main__":
        # Usage: DataManagement.py <source> <clusters_out> <goldstandards_out>
        #
        # Reads a tab-separated source file whose well-formed lines look like
        # "id\tclusterId\tgoldstandardId", groups the document ids by cluster id
        # and by gold-standard id, and pickles the two resulting dicts
        # ({group id: [document ids]}) to the two output paths.
        if len(sys.argv) < 4:
            # Fail up front with a readable message instead of an IndexError
            # raised after the input has already been parsed.
            sys.exit('usage: %s source clusters_out goldstandards_out' % sys.argv[0])
        filename = str(sys.argv[1])
        # Strips leading and trailing whitespace (including the line terminator).
        preturn = re.compile(r'(^\s+|\s+$)')
        clusters = {}  # cluster id -> list of document ids
        goldstandards = {}  # gold-standard id -> list of document ids
        # "with" guarantees the handle is released even if a field fails int().
        with open(filename, 'r') as fidsrc:
            for line in fidsrc:
                m = preturn.sub('', line).split('\t')
                if len(m) != 3:
                    # Lines without exactly three fields are ignored.
                    continue
                clusters.setdefault(int(m[1]), []).append(int(m[0]))
                goldstandards.setdefault(int(m[2]), []).append(int(m[0]))
        with open(sys.argv[2], 'w') as fidclusters:
            p.dump(clusters, fidclusters)
        with open(sys.argv[3], 'w') as fidgoldstandards:
            p.dump(goldstandards, fidgoldstandards)
        print('%s has finished!' % sys.argv[0])

    EvaluationClusterAlgorithm.py

    #!/usr/bin/python
    # -*- coding:cp936 -*-
    import re;
    import cPickle as mypickle;
    import sys;
    import math;
    class Evaluation:
        """Evaluate a clustering result against a gold-standard partition.

        Both inputs are pickled dicts that map a group ID to the list of
        document IDs belonging to that group.  Purity, entropy, mutual
        information and normalized mutual information (NMI) are computed from
        the number of documents shared by each (cluster, gold class) pair.

        Attributes:
            clusters: dict loaded from the cluster pickle.
            goldstandards: dict loaded from the gold-standard pickle.
            k: number of clusters.
            q: number of gold-standard classes.
            minclusterId, maxclusterId: smallest / largest cluster ID.
            mingoldstandardId, maxgoldstandardId: smallest / largest gold ID.
            N: total number of documents.
            coocurrence: dict mapping (clusterId, goldstandardId) to the number
                of documents the two groups share.
        """

        def __init__(self, clusterfid, goldstandardfid):
            """Load both partitions and derive the summary attributes.

            Args:
                clusterfid: path of the pickle file holding the clusters.
                goldstandardfid: path of the pickle file holding the gold
                    standard.

            Raises:
                ValueError: if the two partitions do not contain the same
                    number of documents.
            """
            self.clusters = self._LoadPickle(clusterfid)  # clustering output
            self.goldstandards = self._LoadPickle(goldstandardfid)  # gold answers
            clusterIds = sorted(self.clusters)
            goldIds = sorted(self.goldstandards)
            self.k = len(clusterIds)
            self.q = len(goldIds)
            self.minclusterId = clusterIds[0]  # smallest cluster ID
            self.maxclusterId = clusterIds[-1]  # largest cluster ID
            self.mingoldstandardId = goldIds[0]
            self.maxgoldstandardId = goldIds[-1]
            # (clusterId, goldstandardId) -> number of shared documents.
            self.coocurrence = {}
            N1 = sum(len(members) for members in self.clusters.values())
            N2 = sum(len(members) for members in self.goldstandards.values())
            if N1 != N2:
                # Every metric divides by N, so a mismatch cannot be evaluated.
                raise ValueError(
                    'there is a error N1=%d,N2=%d,please reexamine the data source'
                    % (N1, N2))
            self.N = N1  # number of documents

        def _LoadPickle(self, path):
            """Return the object pickled in ``path`` and close the file."""
            # NOTE(review): unpickling can execute arbitrary code; only load
            # files that come from a trusted source.
            # The default (text) open mode of the original code is kept.
            with open(path) as fid:
                return mypickle.load(fid)

        def _EnsureCoocurrence(self):
            """Build the co-occurrence table on first use if it is still empty."""
            if not self.coocurrence:
                self.GenerateCoocurrence()

        def GenerateCoocurrence(self):
            """Fill ``self.coocurrence`` with the overlap size of every pair.

            Pairs that are already present are left untouched.
            """
            # Build each gold set once instead of once per cluster.
            goldSets = dict((goldId, set(members))
                            for goldId, members in self.goldstandards.items())
            for clusterId, members in self.clusters.items():
                clusterSet = set(members)
                for goldId, goldSet in goldSets.items():
                    pair = (clusterId, goldId)
                    if pair not in self.coocurrence:
                        self.coocurrence[pair] = len(clusterSet & goldSet)

        def CalPurityForPerCluster(self, clusterId):
            """Return the purity of one cluster.

            This is the share of the cluster's documents that fall into the
            gold class it overlaps most.
            """
            self._EnsureCoocurrence()
            # Iterate the IDs that exist; they need not be contiguous.
            best = max(self.coocurrence[(clusterId, goldId)]
                       for goldId in self.goldstandards)
            return float(best) / float(len(self.clusters[clusterId]))

        def CalPurity(self):
            """Return the overall purity: per-cluster purity weighted by size."""
            result = 0.0
            for clusterId in sorted(self.clusters):
                purityPer = self.CalPurityForPerCluster(clusterId)
                weight = float(len(self.clusters[clusterId]))
                result = result + weight * purityPer / float(self.N)
            return result

        def CalEntropyFormula(self, seq):
            """Return the base-2 entropy of the probabilities in ``seq``.

            Non-positive entries are skipped (0 * log 0 is treated as 0).
            """
            result = 0.0
            for elemP in seq:
                if elemP > 0:
                    result = result + elemP * math.log(elemP, 2)
            return -result

        def CalEntropyForPerCluster(self, clusterId):
            """Return the entropy of the gold-class distribution in a cluster."""
            self._EnsureCoocurrence()
            size = float(len(self.clusters[clusterId]))
            seq = [float(self.coocurrence[(clusterId, goldId)]) / size
                   for goldId in sorted(self.goldstandards)]
            return self.CalEntropyFormula(seq)

        def CalEntropy(self):
            """Return the overall entropy: per-cluster entropy weighted by size."""
            result = 0
            for clusterId in sorted(self.clusters):
                entropyPer = self.CalEntropyForPerCluster(clusterId)
                weight = float(len(self.clusters[clusterId]))
                result = result + weight * entropyPer / float(self.N)
            return result

        def CalMutualInformation(self):
            """Return the base-2 mutual information between the two partitions."""
            self._EnsureCoocurrence()
            goldIds = sorted(self.goldstandards)
            result = 0.0
            for clusterId in sorted(self.clusters):
                N_c = len(self.clusters[clusterId])
                for goldId in goldIds:
                    N_g = len(self.goldstandards[goldId])
                    N_cg = self.coocurrence[(clusterId, goldId)]
                    part = float(self.N) * float(N_cg) / (N_c * N_g)
                    if part > 0:  # empty intersections contribute nothing
                        result = result + (float(N_cg) / float(self.N)) * math.log(part, 2)
            return result

        def CalNMI(self):
            """Return the normalized mutual information 2*I / (H1 + H2).

            H1 and H2 are the entropies of the cluster sizes and of the gold
            class sizes.  When both are zero (each partition is one single
            group) the partitions are identical and 1.0 is returned instead of
            dividing by zero.
            """
            total = float(self.N)
            # Size distribution of the automated clusters.
            seq1 = [float(len(self.clusters[clusterId])) / total
                    for clusterId in sorted(self.clusters)]
            # Size distribution of the gold-standard classes.
            seq2 = [float(len(self.goldstandards[goldId])) / total
                    for goldId in sorted(self.goldstandards)]
            H1 = self.CalEntropyFormula(seq1)
            H2 = self.CalEntropyFormula(seq2)
            if H1 + H2 == 0:
                return 1.0
            IG = self.CalMutualInformation()
            return 2 * IG / (H1 + H2)
            
            
            
    if __name__ == "__main__":
        # Usage: EvaluationClusterAlgorithm.py <clusters_pickle> <goldstandards_pickle>
        #
        # Prints summary statistics of both partitions, then the purity,
        # entropy and normalized mutual information of the clustering.
        cluster_path, gold_path = str(sys.argv[1]), str(sys.argv[2])
        evaluator = Evaluation(cluster_path, gold_path)

        # (message template, attribute name) pairs, printed in this order:
        # cluster count, gold-class count, document count, min/max cluster ID,
        # min/max gold-standard ID.  Attributes are read one at a time, right
        # before the corresponding line is printed.
        summary = (
            ('聚类算法产生簇个数%d', 'k'),
            ('人工标注的标准答案中簇个数%d', 'q'),
            ('文档总数%d', 'N'),
            ('最小聚类ID标号%d', 'minclusterId'),
            ('最大聚类ID标号%d', 'maxclusterId'),
            ('标准答案中最小聚类ID标号%d', 'mingoldstandardId'),
            ('标准答案中最大聚类ID标号%d', 'maxgoldstandardId'),
        )
        for template, attribute in summary:
            print(template % getattr(evaluator, attribute))

        # The co-occurrence table is built before any metric is computed.
        evaluator.GenerateCoocurrence()
        print('纯度为%f' % evaluator.CalPurity())  # purity
        print('熵为%f' % evaluator.CalEntropy())  # entropy
        print('归一化互信息为%f' % evaluator.CalNMI())  # normalized mutual information

       
            

    代码调用示意图

     

     

  • 相关阅读:
    VC++6.0编译环境介绍
    (六)flask搭建博客系列之HTTPTokenAuth
    (五)flask搭建博客系列之LoginManager
    (四)flask搭建博客系列之FlaskForm
    (三)flask搭建博客系列之BootStrap
    (二)flask搭建博客系列之SQLAlchemy
    (一)flask搭建博客系列之环境项目搭建
    (十)python语法之图像处理
    (九)python语法之机器学习
    (八)python语法之Tkinter
  • 原文地址:https://www.cnblogs.com/finallyliuyu/p/2361142.html
Copyright © 2011-2022 走看看