    Machine Learning - Text Clustering Example - KMeans

    import os
    
    import jieba
    
    from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    from sklearn.cluster import KMeans
    
    
    # Two helper functions for reading and writing files.
    # Write content to a file.
    def savefile(savepath, content, encode):
        with open(savepath, "w", encoding=encode) as fp:
            fp.write(content)
    
    
    # Read a file; returns None if decoding fails.
    def readfile(path, encode):
        try:
            with open(path, "r", encoding=encode) as fp:
                return fp.read()
        except UnicodeDecodeError:
            print("Error: failed to read " + path)
            return None
    
    
    stop_words_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/中文停用词表.txt'
    origin_text_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/测试文本集/'
    cut_combine_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/cut_combine.txt'
    corpus_text_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/corpus.txt'
    result_text_path = '/Users/FengZhen/Desktop/accumulate/机器学习/kmeans聚类/own_claasify.txt'
    
    # Tokenize each document, drop stop words, and save one document per line.
    def segment():
        stop_words = readfile(stop_words_path, 'UTF-8')
        fileList = os.listdir(origin_text_path)
        with open(cut_combine_path, 'w') as save:
            for file in fileList:
                if not file.startswith("."):
                    content_result = ''
                    content = readfile(origin_text_path + file, 'GBK')
                    content_words = jieba.cut(content)
                    for content_word in content_words:
                        if content_word not in stop_words:
                            content_result = content_result + " " + content_word
                    # Strip embedded newlines so each document stays on one line.
                    save.write(content_result.replace('\n', '').replace('\r', ''))
                    save.write('\n')
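    # For illustration (hypothetical input): jieba.cut("机器学习很有趣") yields a
    # generator of tokens such as "机器", "学习", "很", "有趣"; the exact
    # segmentation depends on jieba's built-in dictionary.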
    
    
    def get_datasest():
        with open(cut_combine_path, 'r') as cf:
            docs = cf.readlines()
            print(len(docs))
    
        x_train = []
        for i, text in enumerate(docs):
            word_list = text.split(' ')
            word_list[-1] = word_list[-1].strip()
            # Before training, wrap each document as a TaggedDocument:
            # a word list plus a tag list (here, the document index).
            document = TaggedDocument(word_list, tags=[i])
            x_train.append(document)
    
        return x_train
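    # Each element of x_train looks like
    # TaggedDocument(words=['word1', 'word2', ...], tags=[0]),
    # which is the input format Doc2Vec expects.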
    
    
    def train(x_train, size=200):
        # Build and train the model, then save the result so it can be reloaded later.
        # min_count: ignore words with total frequency below this.
        # window: maximum distance between the predicted word and context words.
        # vector_size: dimensionality of the feature vectors.
        # negative: number of negative samples. workers: number of worker threads.
        model_dm = Doc2Vec(x_train, min_count=1, window=3, vector_size=size, sample=1e-3, negative=5, workers=4)
        # corpus_count is the number of documents; epochs is the number of training passes.
        model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=100)
        # Save the trained model to free memory; reload later with Doc2Vec.load().
        model_dm.save(corpus_text_path)
        return model_dm
    
    
    def cluster(x_train):
        infered_vectors_list = []
        print("load doc2vec model...")
        # Load the trained model; model_dm prints like Doc2Vec(dm/m,d200,n5,w3,s0.001,t4)
        model_dm = Doc2Vec.load(corpus_text_path)
        print("load train vectors...")
        for text, label in x_train:
            vector = model_dm.infer_vector(text)
            infered_vectors_list.append(vector)
    
        print("train kmeans model...")
        kmean_model = KMeans(n_clusters=15)
        kmean_model.fit(infered_vectors_list)
        # Predict a cluster for every document (the original sliced [0:100], which
        # raises an IndexError below whenever there are more than 100 documents).
        labels = kmean_model.predict(infered_vectors_list)
        print(labels)
        cluster_centers = kmean_model.cluster_centers_
    
        # Write each document and its cluster label, tab-separated, one per line.
        with open(result_text_path, 'w') as wf:
            for i, (text, tag) in enumerate(x_train):
                wf.write(''.join(text) + '\t' + str(labels[i]) + '\n')
    
        return cluster_centers
    
    
    if __name__ == '__main__':
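        # Pipeline: tokenize and clean the corpus, build TaggedDocuments,
        # train Doc2Vec, then cluster the inferred vectors with KMeans.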
        segment()
        x_train = get_datasest()
        model_dm = train(x_train)
        cluster_centers = cluster(x_train)
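    
    The script fixes n_clusters=15 up front. One way to sanity-check that choice
    (not part of the original post) is the silhouette score from sklearn. Below is
    a minimal sketch, assuming the inferred document vectors from cluster() are
    available as a list; choose_k is a hypothetical helper name.
    
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    
    def choose_k(vectors, k_range=range(2, 21)):
        # Fit one KMeans model per candidate k; a higher silhouette score
        # (closer to 1.0) means tighter, better-separated clusters.
        best_k, best_score = None, -1.0
        for k in k_range:
            model = KMeans(n_clusters=k, n_init=10, random_state=42)
            labels = model.fit_predict(vectors)
            score = silhouette_score(vectors, labels)
            if score > best_score:
                best_k, best_score = k, score
        return best_k, best_score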
  • Original article: https://www.cnblogs.com/EnzoDin/p/12404685.html