  • A Python Implementation of a KNN-Based Text Classifier for newsgroup 18828

    As with the previous post, this is meant as an introductory learning exercise.

    1. Description of the KNN algorithm:

    step1: Represent each text as a vector by computing the TF-IDF value of every feature word.

    step2: When a new text arrives, build its vector from the feature words.

    step3: From the training set, select the k text vectors most similar to the new text's vector, measuring similarity with cosine similarity (see the formulas just after this list). Tune k against experimental results; here k = 20.

    step4: Among the new text's k neighbors, compute a weight for each category in turn.

    step5: Compare the category weights and assign the new text to the category with the largest weight.
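
    For reference, the formulas the steps above rely on can be written compactly as follows (a sketch consistent with the code below: N is the total number of documents, which the code approximates as 20000, df(w) is the number of documents containing word w, and |d| is the total word count of document d):

    tf(w, d) = count(w, d) / |d|
    idf(w) = log10( N / df(w) )
    tfidf(w, d) = tf(w, d) * idf(w)
    sim(x, y) = (x · y) / (||x|| * ||y||)
    weight(c) = sum of sim(x, d_i) over the k neighbors d_i whose category is c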

    2. Document TF-IDF computation and vector representation

    # -*- coding: utf-8 -*-
    import time
    from os import listdir
    from math import log
    from numpy import *
    from numpy import linalg
    from operator import itemgetter

    ###################################################
    ## Compute the IDF value of every word
    ###################################################
    def computeIDF():
        fileDir = 'processedSampleOnlySpecial_2'
        wordDocMap = {}  # <word, set(docM, ..., docN)>
        IDFPerWordMap = {}  # <word, IDF value>
        countDoc = 0.0
        cateList = listdir(fileDir)
        for i in range(len(cateList)):
            sampleDir = fileDir + '/' + cateList[i]
            sampleList = listdir(sampleDir)
            for j in range(len(sampleList)):
                sample = sampleDir + '/' + sampleList[j]
                for line in open(sample).readlines():
                    word = line.strip('\n')
                    if word in wordDocMap:
                        wordDocMap[word].add(sampleList[j])  # the set records every document in which word occurs
                    else:
                        wordDocMap.setdefault(word, set())
                        wordDocMap[word].add(sampleList[j])
            print 'just finished round %d' % i

        for word in wordDocMap.keys():
            countDoc = len(wordDocMap[word])  # number of documents containing word
            IDF = log(20000.0/countDoc)/log(10)  # log10(N/df); the float literal avoids integer division
            IDFPerWordMap[word] = IDF

        return IDFPerWordMap

    ###################################################
    ## Write the IDF values to a file
    ###################################################
    def main():
        start = time.clock()
        IDFPerWordMap = computeIDF()
        end = time.clock()
        print 'runtime: ' + str(end-start)
        fw = open('IDFPerWord', 'w')
        for word, IDF in IDFPerWordMap.items():
            fw.write('%s %.6f\n' % (word, IDF))
        fw.close()

    ########################################################
    ## Build the document vectors of the training and test sets, in the form
    ## <cate, doc, (word1, tfidf1), (word2, tfidf2), ...>, and write them to files
    ## @param indexOfSample index of the current split
    ## @param trainSamplePercent fraction of each category used for training
    ########################################################
    def computeTFMultiIDF(indexOfSample, trainSamplePercent):
        IDFPerWord = {}  # <word, IDF value>, read back from the file written by main()
        for line in open('IDFPerWord').readlines():
            (word, IDF) = line.strip('\n').split(' ')
            IDFPerWord[word] = IDF

        fileDir = 'processedSampleOnlySpecial_2'
        trainFileDir = "docVector/" + 'wordTFIDFMapTrainSample' + str(indexOfSample)
        testFileDir = "docVector/" + 'wordTFIDFMapTestSample' + str(indexOfSample)

        tsTrainWriter = open(trainFileDir, 'w')
        tsTestWriter = open(testFileDir, 'w')

        cateList = listdir(fileDir)
        for i in range(len(cateList)):
            sampleDir = fileDir + '/' + cateList[i]
            sampleList = listdir(sampleDir)

            testBeginIndex = indexOfSample * ( len(sampleList) * (1-trainSamplePercent) )
            testEndIndex = (indexOfSample+1) * ( len(sampleList) * (1-trainSamplePercent) )

            for j in range(len(sampleList)):
                TFPerDocMap = {}  # <word, occurrence count of word in this document>
                sumPerDoc = 0  # total number of words in this document
                sample = sampleDir + '/' + sampleList[j]
                for line in open(sample).readlines():
                    sumPerDoc += 1
                    word = line.strip('\n')
                    TFPerDocMap[word] = TFPerDocMap.get(word, 0) + 1

                if (j >= testBeginIndex) and (j <= testEndIndex):
                    tsWriter = tsTestWriter
                else:
                    tsWriter = tsTrainWriter

                tsWriter.write('%s %s ' % (cateList[i], sampleList[j]))  # write the category cate and the document doc

                for word, count in TFPerDocMap.items():
                    TF = float(count)/float(sumPerDoc)
                    tsWriter.write('%s %f ' % (word, TF * float(IDFPerWord[word])))  # write every word of this document with its TF-IDF value

                tsWriter.write('\n')

            print 'just finished round %d' % i

            #if i==0: break

        tsTrainWriter.close()
        tsTestWriter.close()
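
    A minimal driver for this listing might look like the following sketch; the split index 0 and the 0.9 training fraction are illustrative assumptions rather than values fixed by the post, and the docVector directory must exist before the vectors are written:

    if __name__ == '__main__':
        main()                     # compute the IDF values and save them to IDFPerWord
        computeTFMultiIDF(0, 0.9)  # split 0, 90% of each category for training (assumed values)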

    3. Implementation of the KNN algorithm

    def doProcess():
        trainFiles = 'docVector/wordTFIDFMapTrainSample0'
        testFiles = 'docVector/wordTFIDFMapTestSample0'
        kNNResultFile = 'docVector/KNNClassifyResult'

        trainDocWordMap = {}  # dict <key, value>: key = cate_doc, value = {word1: tfidf1, word2: tfidf2, ...}

        for line in open(trainFiles).readlines():
            lineSplitBlock = line.strip('\n').split(' ')
            trainWordMap = {}
            m = len(lineSplitBlock)-1  # skip the trailing empty token produced by the final space
            for i in range(2, m, 2):  # pull each (word, tfidf) pair of the document vector into a dict
                trainWordMap[lineSplitBlock[i]] = lineSplitBlock[i+1]

            temp_key = lineSplitBlock[0] + '_' + lineSplitBlock[1]  # the first two fields are the category cate and the document doc
            trainDocWordMap[temp_key] = trainWordMap

        testDocWordMap = {}

        for line in open(testFiles).readlines():
            lineSplitBlock = line.strip('\n').split(' ')
            testWordMap = {}
            m = len(lineSplitBlock)-1
            for i in range(2, m, 2):
                testWordMap[lineSplitBlock[i]] = lineSplitBlock[i+1]

            temp_key = lineSplitBlock[0] + '_' + lineSplitBlock[1]
            testDocWordMap[temp_key] = testWordMap  # <category_filename, <word, TFIDF>>

        # for every test sample, compute its similarity to all training samples and classify it
        count = 0
        rightCount = 0
        KNNResultWriter = open(kNNResultFile, 'w')
        for item in testDocWordMap.items():
            classifyResult = KNNComputeCate(item[0], item[1], trainDocWordMap)  # classify by calling KNNComputeCate

            count += 1
            print 'this is round %d' % count

            classifyRight = item[0].split('_')[0]
            KNNResultWriter.write('%s %s\n' % (classifyRight, classifyResult))
            if classifyRight == classifyResult:
                rightCount += 1
            print '%s %s rightCount:%d' % (classifyRight, classifyResult, rightCount)

        accuracy = float(rightCount)/float(count)
        print 'rightCount : %d , count : %d , accuracy : %.6f' % (rightCount, count, accuracy)
        return accuracy


    #########################################################
    ## @param cate_Doc key of the test sample, <category_document>
    ## @param testDic test document vector {word: TFIDF}
    ## @param trainMap training set <category_filename, <word, TFIDF>>
    ## @return sortedCateSimMap[0][0], the category whose k nearest neighbors
    ##         have the largest similarity sum to the test document
    #########################################################
    def KNNComputeCate(cate_Doc, testDic, trainMap):
        simMap = {}  # <category_filename, similarity>; this dict is sorted by value below
        for item in trainMap.items():
            similarity = computeSim(testDic, item[1])  # call computeSim()
            simMap[item[0]] = similarity

        sortedSimMap = sorted(simMap.iteritems(), key=itemgetter(1), reverse=True)  # sort by value (similarity), descending

        k = 20
        cateSimMap = {}  # <category, similarity sum>
        for i in range(k):
            cate = sortedSimMap[i][0].split('_')[0]
            cateSimMap[cate] = cateSimMap.get(cate, 0) + sortedSimMap[i][1]

        sortedCateSimMap = sorted(cateSimMap.iteritems(), key=itemgetter(1), reverse=True)

        return sortedCateSimMap[0][0]


    #################################################
    ## @param testDic test document vector <word, tfidf>
    ## @param trainDic training document vector <word, tfidf>
    ## @return the cosine similarity of the two vectors
    #################################################
    def computeSim(testDic, trainDic):
        testList = []  # tfidf values, in the test vector, of the words shared by both vectors
        trainList = []  # tfidf values, in the training vector, of the words shared by both vectors

        for word, weight in testDic.items():
            if word in trainDic:
                testList.append(float(weight))  # float() turns the string values read from file into numbers
                trainList.append(float(trainDic[word]))

        testVect = mat(testList)  # turn the lists into matrices for the dot product and NumPy's norm function
        trainVect = mat(trainList)
        num = float(testVect * trainVect.T)
        denom = linalg.norm(testVect) * linalg.norm(trainVect)
        #print 'denom:%f' % denom
        return float(num)/(1.0+float(denom))  # the extra 1.0 keeps the denominator nonzero when the vectors share no words
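
    This listing reuses the imports from the listing in section 2 (numpy's mat and linalg, operator's itemgetter). Once the section 2 vectors exist, a run could be started with a sketch like this:

    if __name__ == '__main__':
        doProcess()  # reads split 0's vectors, classifies every test document, prints the accuracy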


    Output:

    A few errors came up while getting this to run:

    Error1:

    After split(' '), the last element is an empty string, because each written line ends with a trailing space. Without inspecting the last element of the resulting list this is hard to spot, and it caused an out-of-range index.
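
    A quick check with made-up field values shows the trailing empty string; this is why the parsing loops above stop at m = len(lineSplitBlock)-1:

    >>> 'rec.autos 101551 car 0.030000 '.split(' ')
    ['rec.autos', '101551', 'car', '0.030000', '']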

    Error2:

    Data read from a file and cut apart with strip()/split() comes back as strings, and strings cannot take part in arithmetic; the values have to be converted with int(string) or float(string) before they can be used in calculations.
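
    For example (made-up values; in this code the error surfaced when multiplying TF by an IDF value still stored as a string):

    >>> tf = 0.05
    >>> idf = '2.301030'   # read back from the IDFPerWord file, still a string
    >>> tf * idf           # raises TypeError: can't multiply sequence by non-int of type 'float'
    >>> tf * float(idf)
    0.1150515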

    Error3:

    sorted() over a dict's <key, value> pairs returns a list of tuple(key, value); not knowing this beforehand led to indexing errors.

    The return form of sorted(), illustrated with made-up keys and similarity values (the elements are then accessed as sortedSimMap[i][0] and sortedSimMap[i][1] in the code above):
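
    >>> from operator import itemgetter
    >>> simMap = {'sci.med_101': 0.12, 'rec.autos_77': 0.40, 'sci.med_55': 0.31}
    >>> sorted(simMap.iteritems(), key=itemgetter(1), reverse=True)
    [('rec.autos_77', 0.4), ('sci.med_55', 0.31), ('sci.med_101', 0.12)]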

  • Original post: https://www.cnblogs.com/ffan/p/4043562.html