  • Machine learning: clustering text with KMeans

    Clustering text with k-means: the script below reads keyword strings from a SQL Server table, converts them into TF-IDF vectors, clusters them with KMeans (or MiniBatchKMeans), and plots the per-document clustering error against the number of clusters to help choose k.

    from __future__ import print_function

    import time

    from sklearn.feature_extraction.text import TfidfVectorizer
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans, MiniBatchKMeans
    import pymssql


    # Read the keyword strings from the SQL Server database
    def get_dbdata():
        conn_read = pymssql.connect("127.0.0.1", "sa", "###", "test", charset="GBK")
        dataset = []
        sql = "select guanjianci from julei_test"
        cursor = conn_read.cursor()
        cursor.execute(sql)
        for line in cursor:
            dataset.append(line[0])
        cursor.close()
        conn_read.close()
        print(dataset)
        return dataset


    # Turn the raw documents into a sparse TF-IDF matrix
    def transform(dataset, n_features=1000):
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features,
                                     min_df=2, use_idf=True)
        X = vectorizer.fit_transform(dataset)
        return X, vectorizer


    def train(X, vectorizer, true_k=10, minibatch=False, showLable=False):
        # Train k-means on mini-batches or on the full data
        if minibatch:
            km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                                 init_size=1000, batch_size=1000, verbose=False)
        else:
            km = KMeans(n_clusters=true_k, init='k-means++', max_iter=300,
                        n_init=1, verbose=False)
        km.fit(X)
        if showLable:
            print("Top terms per cluster:")
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
            # get_feature_names_out() needs scikit-learn >= 1.0; older versions use get_feature_names()
            terms = vectorizer.get_feature_names_out()
            print(vectorizer.get_stop_words())
            for i in range(true_k):
                print("Cluster %d:" % i, end='')
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind], end='')
                print()
            result = list(km.predict(X))
            print('Cluster distribution:')
            print(dict([(i, result.count(i)) for i in result]))
        # km.score returns the negative inertia, so negate it to get the clustering error
        return -km.score(X)


    # Sweep the number of clusters k to find a good value
    def k_determin():
        '''Search for the best number of clusters'''
        dataset = get_dbdata()
        print("%d documents" % len(dataset))
        X, vectorizer = transform(dataset, n_features=500)
        true_ks = []
        scores = []
        # Try k from 3 to 200 (adjust the range to your data size)
        for i in range(3, 200, 1):
            score = train(X, vectorizer, true_k=i) / len(dataset)
            print(i, score)
            true_ks.append(i)
            scores.append(score)
        plt.figure(figsize=(8, 4))
        plt.plot(true_ks, scores, label="error", color="red", linewidth=1)
        plt.xlabel("n_clusters")
        plt.ylabel("error")
        plt.legend()
        plt.show()


    def main():
        '''Print the clustering result with the chosen parameters'''
        dataset = get_dbdata()
        X, vectorizer = transform(dataset, n_features=500)
        score = train(X, vectorizer, true_k=25, showLable=True) / len(dataset)
        print(score)


    if __name__ == '__main__':
        start = time.time()
        # k_determin()  # run this first to determine k
        main()
        end = time.time()
        print('elapsed time:', end - start)
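
    If you want to exercise the functions above without the SQL Server table, a minimal sketch like the following should work. The toy documents and true_k=2 are invented for illustration only, and it assumes transform() and train() from the script are already defined in the same module:

    # Toy stand-in for get_dbdata(): a small in-memory corpus
    docs = [
        "machine learning text clustering",
        "kmeans text clustering of documents",
        "sql server database connection",
        "database query and cursor",
        "tfidf feature extraction for text",
        "text feature extraction pipeline",
    ]

    # Vectorize and cluster the toy corpus with the script's own helpers
    X, vectorizer = transform(docs, n_features=500)
    error = train(X, vectorizer, true_k=2, showLable=True) / len(docs)
    print(error)

    The printed value is the k-means inertia per document (the negated km.score), the same quantity k_determin() plots against the number of clusters.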
    

      

  • Original post: https://www.cnblogs.com/xuange1/p/12550279.html