  • K-means clustering of Chinese text


    K-means clustering of Chinese text
    Principle:
    K means the original data are partitioned into K classes, and "Means" refers to the mean point. The core of K-Means is to group a set of data into K clusters; each cluster has a center called its mean point, and every point in a cluster is closer to its own cluster's mean point than to the mean point of any other cluster.
    Implementation steps (a minimal sketch of this loop follows the list):

    1. Choose k initial cluster centers.

    2. Repeat:

          assign each data object to the nearest of the k cluster centers, forming k clusters;

          recompute the center of each cluster.

    3. Stop once the cluster centers no longer change; the clustering is then finished.
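
    A minimal sketch of that loop in plain numpy (the toy 2-D points and k=2 below are made up purely for illustration; the real scripts in this post work on TF-IDF vectors):

    import numpy as np

    def toy_kmeans(points, k, iters=100):
        # 1. pick k initial centers (here: k distinct random data points)
        centers = points[np.random.choice(len(points), k, replace=False)]
        for _ in range(iters):
            # 2a. assign every point to its nearest center
            dists = ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
            labels = dists.argmin(axis=1)
            # 2b. recompute each center as the mean of its cluster
            new_centers = np.array([points[labels == j].mean(axis=0) for j in range(k)])
            # 3. stop when the centers no longer change
            if np.allclose(new_centers, centers):
                break
            centers = new_centers
        return centers, labels

    points = np.array([[1.0, 1.0], [1.2, 0.9], [8.0, 8.0], [7.9, 8.2]])
    print(toy_kmeans(points, 2))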

    Two approaches:

    ① Use scikit-learn (CountVectorizer + TfidfTransformer + KMeans) to vectorize and cluster the keywords:

    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.cluster import KMeans
    from sklearn import metrics
    import numpy as np
    import jieba
    from DBUtils import update_keyword


    def easy_get_parameter_k_means():
        data = []
        with open("keyword.txt", encoding='utf-8') as file:
            for post in file:
                data.append(post.replace('\n', ''))

        datas = data

        # Segment each line with jieba, join the tokens with spaces, then build count vectors
        vec = CountVectorizer()
        X = vec.fit_transform([" ".join(jieba.cut(a)) for a in data])
        tf = TfidfTransformer()
        X = tf.fit_transform(X.toarray())

        data = X.toarray()

        test_score = []
        n_clusters_end = 20    # number of clusters (upper bound)
        n_clusters_start = 20  # number of clusters (lower bound)
        while n_clusters_start <= n_clusters_end:
            km = KMeans(n_clusters=n_clusters_start)
            km.fit(data)
            clusters = km.labels_.tolist()
            score = metrics.silhouette_score(X=X, labels=clusters)
            # size and label of the largest cluster
            num = sorted([(np.sum([1 for a in clusters if a == i]), i) for i in set(clusters)])[-1]
            test_score.append([n_clusters_start, score, num[0], num[1]])
            # print([n_clusters_start, score, num[0], num[1]])  # print the scores
            n_clusters_start += 1

            # write the cluster label of every keyword back to the database
            for i in range(0, 20):
                result = []
                for index in range(len(clusters)):
                    if clusters[index] == i:
                        res = datas[index]
                        update_keyword(res, str(i))
                        print("Updated keyword", res, "to cluster", i)
                        result.append(res)
                # print("Cluster", i, "contains", len(result), "items")

        return clusters


    # easy_get_parameter_k_means()  # find the best parameters
    arrs = easy_get_parameter_k_means()
    print("arrs", arrs)
    print("arrs[length]", len(arrs))
    

     

    ② This method builds the word-vector space by reading multiple files, but it runs very slowly when there are many data files.

    Compute the feature vectors and their TF-IDF weights, then use the k-means algorithm to find the cluster centers, the cluster each point belongs to, and the distance from each point to its cluster center.
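
    The weighting the code below computes is essentially tf-idf(t, d) = (count of t in d / length of d) * log(N / df(t)), where N is the number of documents and df(t) is the number of documents containing term t. A tiny self-contained illustration (the 2x3 count matrix is made-up toy data):

    import numpy as np

    # toy count matrix: 2 documents x 3 terms
    counts = np.array([[2.0, 1.0, 0.0],
                       [0.0, 1.0, 3.0]])
    df = (counts > 0).sum(axis=0)                      # document frequency of each term
    idf = np.log(counts.shape[0] / df)                 # idf = log(N / df)
    tf = counts / counts.sum(axis=1, keepdims=True)    # tf = count / document length
    print(tf * idf)                                    # tf-idf weights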

    import math
    import os
    import jieba
    import numpy as np
    from numpy import *
    import matplotlib.pyplot as plt
    
    
    
    
    
    def read_from_file(file_name):  # read the raw text of a file
    
        with open(file_name, "r", encoding='UTF8') as fp:
            words = fp.read()
        return words
    
    
    def stop_words(stop_word_file):
        words = read_from_file(stop_word_file)
        result = jieba.cut(words)
        new_words = []
        for r in result:
            new_words.append(r)
        return set(new_words)
    
    
    def del_stop_words(words, stop_words_set):
        #   words: the raw document text; it is segmented here with jieba.
        #   Returns the list of tokens with stop words removed.
        result = jieba.cut(words)
        new_words = []
        for r in result:
            if r not in stop_words_set:
                new_words.append(r)
        return new_words
    
    
    def get_all_vector(stop_words_set):
        # build the vocabulary and the document-term count matrix from keyword.txt
        docs = []
        word_set = set()
        with open("keyword.txt", encoding='utf-8') as file:
            for post in file:
                doc = del_stop_words(post, stop_words_set)
                docs.append(doc)
                word_set |= set(doc)
        # print("word_set:", word_set)
        # print("docs:", docs)
        word_set = list(word_set)
        docs_vsm = []
        for doc in docs:
            temp_vector = []
            for word in word_set:
                temp_vector.append(doc.count(word) * 1.0)
            docs_vsm.append(temp_vector)

        docs_matrix = np.array(docs_vsm)
        print("docs_matrix:", docs_matrix)

        # idf = log(N / df), where df is the number of documents containing each term
        column_sum = [float(len(np.nonzero(docs_matrix[:, i])[0])) for i in range(docs_matrix.shape[1])]
        column_sum = np.array(column_sum)
        column_sum = docs_matrix.shape[0] / column_sum
        idf = np.log(column_sum)
        idf = np.diag(idf)
        # note: everything below is a matrix operation, not scalar arithmetic
        # tf: normalize each row by the document length (all-zero rows are left unchanged)
        row_sums = docs_matrix.sum(axis=1)
        row_sums[row_sums == 0] = 1
        tfidf = np.dot(docs_matrix / row_sums[:, None], idf)
        print("tfidf:", tfidf)
        # write the tf-idf matrix to tezheng.txt; "w" overwrites the file on every run
        f = "tezheng.txt"
        with open(f, "w", encoding='utf8') as file:
            for i in tfidf:
                for j in i:
                    datafl = str(format(float(j), '.2f'))
                    file.write(datafl + "\t")
                file.write("\n")
    
    
    def loadDataSet(fileName):
        dataSet = []  # initialize an empty list
        with open(fileName) as fr:
            for line in fr.readlines():
                # split each line on tabs
                curLine = line.strip().split('\t')
                # map every element to float and append to dataSet
                fltLine = list(map(float, curLine))
                dataSet.append(fltLine)
        return mat(dataSet)
    
    
    '''
    def randCent(dataSet, k):
        n = shape(dataSet)[1]
        centroids = mat(zeros((k,n)))  # converting with mat() allows the linear-algebra operations below
        for j in range(n):  # create a cluster center within the bounds of each dimension
            minJ = min(dataSet[:,j])
            rangeJ = float(max(dataSet[:,j]) - minJ)
            centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
        return centroids

    def randCent(dataSet, k):
        m, n = dataSet.shape
        centroids = np.zeros((k, n))
        for i in range(k):
            index = int(np.random.uniform(0, m))  # pick a random sample as the centroid
            centroids[i, :] = dataSet[index, :]
        return centroids
    '''
    
    
    def randCent(dataSet, k):
        n = shape(dataSet)[1]
        centroids = mat(zeros((k, n)))  # create centroid mat
        for j in range(n):  # create random cluster centers, within bounds of each dimension
            minJ = min(dataSet[:, j])
            rangeJ = float(max(dataSet[:, j]) - minJ)
            centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
        return centroids
    
    
    def distEclud(vecA, vecB):
        return math.sqrt(sum(power(vecA - vecB, 2)))
    
    
    # dataSet: the sample points; k: the number of clusters
    # distMeas: the distance measure, Euclidean distance by default
    # createCent: how the initial centers are chosen
    
    
    def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
        m = shape(dataSet)[0]  # number of samples
        clusterAssment = mat(zeros((m, 2)))  # m x 2 matrix
        centroids = createCent(dataSet, k)  # initialize k centers
        clusterChanged = True
        while clusterChanged:  # stop when the assignment no longer changes
            clusterChanged = False
            for i in range(m):
                minDist = inf
                minIndex = -1
                for j in range(k):  # find the nearest centroid
                    distJI = distMeas(centroids[j, :], dataSet[i, :])
                    if distJI < minDist:
                        minDist = distJI
                        minIndex = j
                if clusterAssment[i, 0] != minIndex:
                    clusterChanged = True
                # column 1: assigned centroid; column 2: squared distance
                clusterAssment[i, :] = minIndex, minDist ** 2
            print(centroids)

            # move each centroid to the mean of its assigned points
            for cent in range(k):
                ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
                centroids[cent, :] = mean(ptsInClust, axis=0)
        return centroids, clusterAssment
    
    
    if __name__ == '__main__':
        wenzhang = read_from_file('keyword.txt')
        # print(wenzhang)
        wenzhang1 = stop_words('stopword.txt')
        # print(wenzhang1)
        wenzhang2 = del_stop_words(wenzhang, wenzhang1)
        # print(wenzhang2)
        wenzhang3 = get_all_vector( wenzhang1)
        # kMeans(dataSet, k, distMeas=gen_sim, createCent=randCent)
        dataSet = loadDataSet('tezheng.txt')
        centroids, clusterAssment = kMeans(dataSet, 10, distMeas=distEclud, createCent=randCent)
        print("centroids:", centroids)
        print("clusterAssment :", clusterAssment)
        print("clusterAssmentlengh :", len(clusterAssment))
    
    
    
    
    
    
     
  • Original post: https://www.cnblogs.com/zjl-0217/p/12601528.html