zoukankan      html  css  js  c++  java
  • python使用KNN文本分类

     对上次爬取的爸爸、妈妈、老师和自己四类作文,利用sklearn.neighbors.KNeighborsClassifier进行分类。

    import jieba
    import pandas as pd
    import numpy as np
    import os
    import itertools 
    import matplotlib.pyplot as plt
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.decomposition import PCA
    
    # Read every essay under `path`. The name of the directory that holds a
    # file becomes that essay's class label (`kind`).
    # NOTE(review): 'E:作文' is a drive-relative path on Windows; a backslash may
    # have been lost when the script was published -- confirm the location.
    path = 'E:作文'
    rows = []
    for root, _dirs, files in os.walk(path):
        # os.path.basename/join replace the hand-written '\\' handling, whose
        # string literals had lost their escaping and no longer parsed.
        kind = os.path.basename(root)
        for name in files:
            filepath = os.path.join(root, name)
            # `with` guarantees the handle is closed, even if decoding fails.
            with open(filepath, 'r', encoding='utf-8') as f:
                text = f.read()
            rows.append([filepath, text.strip(), kind])
    # Build the frame once instead of growing it row by row with `.loc`.
    corpos = pd.DataFrame(rows, columns=['filepath', 'text', 'kind'])
    
    # Load the stop-word list used when building the term-frequency matrix.
    # The separator is the newline escape sequence; in the published script it
    # had turned into a literal line break inside the string, which is a
    # syntax error.
    # NOTE(review): newer pandas versions may reject a newline as `sep` --
    # verify against the installed version.
    stopwords = pd.read_csv(r'Stopwords.txt',
                            encoding='utf-8', sep='\n')
    def tokenizer(s):
        """Segment the string `s` with jieba and return the tokens as a list."""
        return list(jieba.cut(s))
    # Build the term-frequency matrix: `tokenizer` splits each document and the
    # loaded stop words are passed to the vectoriser.
    stop_list = list(stopwords['stopword'])
    count = CountVectorizer(tokenizer=tokenizer, stop_words=stop_list)
    # Column 1 of `corpos` holds the essay text; densify the sparse result.
    doc_term = count.fit_transform(corpos.iloc[:, 1])
    countvector = doc_term.toarray()
    
    # Convert the class labels to numbers: the sorted unique labels in `kind`
    # are numbered 1..k in that order.
    # `return_inverse` yields, for every document, the position of its label in
    # `kind`, so the result is sized from the data instead of a hard-coded 700.
    kind, label_pos = np.unique(corpos['kind'].values, return_inverse=True)
    # Keep the float dtype the previous np.zeros-based array had.
    nkind = (label_pos + 1).astype(float)
             
    # Reduce the term-frequency matrix to two dimensions and plot it, one
    # scatter series (colour + marker) per class.
    pca = PCA(n_components=2)
    newvector = pca.fit_transform(countvector)
    plt.figure()
    colors = ['r', 'b', 'g', 'y']
    markers = ['o', '^', '>', '<']
    for label, color, marker in zip(kind, colors, markers):
        # Rows of `corpos` that carry this label.
        index = corpos[corpos['kind'] == label].index
        plt.scatter(newvector[index, 0], newvector[index, 1],
                    c=color, marker=marker, label=label)
    plt.legend()
    plt.xlim(-5, 10)
    plt.ylim(-20, 50)
    plt.xlabel('X Label')
    plt.ylabel('Y Label')
    
    # Randomly hold out a test set and classify it with kNN.
    # The split is sized from the corpus instead of assuming 700 documents, and
    # is capped at half the corpus so the training set is never empty.
    n_docs = len(corpos)
    n_test = min(200, n_docs // 2)
    # A permutation samples without replacement and gives disjoint train/test
    # rows; before, the test rows were also part of the training data, which
    # inflates the score.
    shuffled = np.random.permutation(n_docs)
    index = shuffled[:n_test]
    train_index = shuffled[n_test:]
    x_test = countvector[index]
    y_test = corpos.iloc[index, 2]

    # Fit on the remaining documents only, then predict the held-out ones.
    knn = KNeighborsClassifier()
    knn.fit(countvector[train_index], corpos.iloc[train_index, 2])
    y_pred = knn.predict(x_test)
    # Keep and report the accuracy; as a bare expression it was discarded when
    # the file ran as a script.
    knn_accuracy = knn.score(x_test, y_test)
    print(knn_accuracy)
    
    # Draw the confusion matrix of the kNN predictions.
    # `labels=kind` pins the row/column order to the tick labels used below,
    # even when a class is missing from the sampled test set.
    knn_confusion = confusion_matrix(y_test, y_pred, labels=kind)
    # Sample output from one run:
    # array([[61,  1,  0,  3],
    #        [ 8, 35,  0,  1],
    #        [ 1,  0, 53,  1],
    #        [ 9,  1,  2, 24]])
    # Start a new figure so the heat map is not drawn on the scatter-plot axes.
    plt.figure()
    plt.imshow(knn_confusion, interpolation='nearest', cmap=plt.cm.Oranges)
    plt.xlabel('y_pred')
    plt.ylabel('y_True')
    tick_marks = np.arange(len(kind))
    plt.xticks(tick_marks, kind, rotation=90)
    plt.yticks(tick_marks, kind)
    plt.colorbar()
    plt.title('confustion_matrix')
    n_classes = len(knn_confusion)
    for i, j in itertools.product(range(n_classes), repeat=2):
        # x = predicted-class column, y = true-class row.
        plt.text(i, j, knn_confusion[j, i],
                 horizontalalignment="center")
    # Render the figures when the file is run as a plain script.
    plt.show()

    数据散点图如下所示:

    

    knn分类结果的混淆矩阵图如下所示:

  • 相关阅读:
    CUDA 纹理内存
    CUDA三维数组
    cutil.h问题
    GPU和CPU耗时统计方法
    NVIDIA CUDA Library Documentation
    device not ready cuda
    送给女朋友的礼物
    手机屏幕录制软件分享
    统计函数运行时间-CPU端
    二十四孝,图文并茂,古今必读!
  • 原文地址:https://www.cnblogs.com/chenyaling/p/7461386.html
Copyright © 2011-2022 走看看