zoukankan      html  css  js  c++  java
  • K-means 不知k值 自动无监督分类

    代码:

      1 # -*- coding:UTF-8 -*-
      2 from numpy import *
      3 import jieba as jb
      4 import time
      5 # 计算权值,并存储为txt
      6 # 计算所有文本包含的总词数
      def wordsCount(dataSet):
          """Return the total number of tokens over every document in ``dataSet``.

          Args:
              dataSet: iterable of documents; each document must support ``len()``.

          Returns:
              The sum of the document lengths (0 for an empty ``dataSet``).
          """
          total = 0
          # NOTE: the file star-imports numpy, which shadows the builtin ``sum``,
          # so the lengths are accumulated by hand.
          for size in map(len, dataSet):
              total = total + size
          return total
     12 
     13 # 创建不重复的词条列表
     def createVocabList(dataSet):
         """Return the distinct tokens of ``dataSet`` in first-occurrence order.

         The vocabulary defines the column order of every vector built from it.
         Building it from a plain ``set`` made that order depend on the string
         hash seed, so vectors were not reproducible between runs.  A dict keeps
         insertion order and makes the result deterministic.

         Args:
             dataSet: iterable of documents, each an iterable of hashable tokens.

         Returns:
             A list with each distinct token exactly once.
         """
         seen = {}
         for document in dataSet:
             for token in document:
                 # setdefault keeps the position of the first occurrence.
                 seen.setdefault(token, None)
         return list(seen)
     19 
     20 # 将文本转化为词袋模型
     def bagOfWords2Vec(vocabList, inputSet):
         """Convert a document into a bag-of-words count vector.

         Args:
             vocabList: list of vocabulary tokens; position ``i`` of the result
                 counts occurrences of ``vocabList[i]``.
             inputSet: iterable of hashable tokens making up one document.

         Returns:
             A list of ints with the same length as ``vocabList``.  Tokens that
             are not in the vocabulary are reported on stdout and skipped.
         """
         # Build the token -> column lookup once instead of scanning the list
         # (``in`` + ``.index``) for every token.  setdefault keeps the first
         # position when the vocabulary contains duplicates, like ``list.index``.
         position = {}
         for idx, token in enumerate(vocabList):
             position.setdefault(token, idx)

         returnVec = [0] * len(vocabList)
         for word in inputSet:
             idx = position.get(word)
             if idx is None:
                 print("the word: %s is not in my Vocabulary!" % word)
             else:
                 returnVec[idx] += 1
         return returnVec
     29 
     30 # 计算包含某个词的文本数
     def wordInFileCount(word, cutWordList):
         """Return the number of documents that contain ``word``.

         The previous loop added one for every occurrence, so a word repeated
         inside a single document inflated the count.  Each document is now
         counted at most once.

         Args:
             word: the token to look for.
             cutWordList: iterable of documents, each an iterable of tokens.

         Returns:
             How many documents have at least one token equal to ``word``.
         """
         fileCnt = 0
         for document in cutWordList:
             for token in document:
                 if word == token:
                     fileCnt += 1
                     break  # one hit is enough for this document
         return fileCnt
     40 
     def calTFIDF(dataSet):
         """Compute one TF-IDF vector per document.

         For every token of a document:
             tf    = occurrences in the document / tokens in the document
             idf   = log(number of documents / (wordInFileCount(token) + 1))
             tfidf = tf * idf
         The ``+ 1`` in the denominator means idf can be zero or negative for
         tokens that are found in (almost) every document.

         Each vector is printed twice as a side effect: once as numbers and
         once as a list of strings.

         Args:
             dataSet: sequence of documents, each an iterable of hashable tokens.

         Returns:
             A list with one vector (list of numbers) per document; columns
             follow the order returned by ``createVocabList(dataSet)``.
         """
         # Imported locally: ``math`` used to be reachable only through numpy's
         # star export, which no longer exists in NumPy 2.x.
         import math

         fileCnt = len(dataSet)  # number of documents
         vocabList = createVocabList(dataSet)
         column = {token: idx for idx, token in enumerate(vocabList)}
         docFreq = {}  # token -> wordInFileCount result, computed once per token
         tfidfSet = []

         for line in dataSet:
             wordsBag = bagOfWords2Vec(vocabList, line)  # counts for this document
             lineWordsCnt = 0
             for cnt in wordsBag:
                 lineWordsCnt += cnt  # total tokens in this document
             tfidfList = [0] * len(vocabList)
             for word in line:
                 if word not in docFreq:
                     docFreq[word] = wordInFileCount(word, dataSet)
                 idx = column[word]
                 tf = float(wordsBag[idx]) / lineWordsCnt
                 idf = math.log(float(fileCnt) / (docFreq[word] + 1))
                 tfidfList[idx] = tf * idf
             print(tfidfList)
             # list(...) so Python 3 prints the values, not a map object repr.
             print(list(map(str, tfidfList)))
             tfidfSet.append(tfidfList)

         return tfidfSet
     64 
     65 # 计算余弦距离
     def gen_sim(A, B):
         """Return the cosine distance between two vectors, scaled to [0, 1].

         The cosine of the angle lies in [-1, 1]; it is mapped to a similarity
         ``0.5 + 0.5 * cos`` in [0, 1] and then flipped, so 0 means "same
         direction" and 1 means "opposite direction".  When either vector has
         zero norm the cosine is treated as 0 and the result is 0.5.

         Inputs are flattened to 1-D arrays, which avoids the removed ``mat``
         alias and the deprecated ``float()`` conversion of a 1x1 matrix.

         Args:
             A: vector-like (list, 1-D array, or single-row matrix).
             B: vector-like with the same number of elements as ``A``.

         Returns:
             A float distance; smaller means more similar.
         """
         vecA = asarray(A).ravel()
         vecB = asarray(B).ravel()
         num = float(dot(vecA, vecB))
         denum = linalg.norm(vecA) * linalg.norm(vecB)
         if denum == 0:
             denum = 1  # avoid dividing by zero for an all-zero vector
         cosn = num / denum
         sim = 0.5 + 0.5 * cosn  # similarity in [0, 1], larger is closer
         return 1 - sim  # distance, smaller is closer
     75 
     76 
     77 # 计算两个簇的平均距离
     def distAvg(dataSet1, dataSet2):
         """Return the mean ``gen_sim`` distance over all cross-cluster row pairs.

         Args:
             dataSet1: rows of the first cluster (indexable by row, with a shape).
             dataSet2: rows of the second cluster.

         Returns:
             The sum of ``gen_sim(row1, row2)`` for every pair, divided by the
             number of pairs.
         """
         rows1 = shape(dataSet1)[0]
         rows2 = shape(dataSet2)[0]
         total = 0
         for a in range(rows1):
             rowA = dataSet1[a]
             for b in range(rows2):
                 total = total + gen_sim(rowA, dataSet2[b])
         return total / (rows1 * rows2)
     89 
     90 # 找到距离最近的两个簇
     def findMin(M):
         """Find the pair of distinct clusters with the smallest distance.

         Scans every off-diagonal entry of ``M`` in row-major order and keeps the
         first entry that is strictly smaller than everything seen before it.

         Args:
             M: square distance matrix supporting ``M[i, j]`` indexing.

         Returns:
             ``(i, j, distance)`` for the closest pair.

         Raises:
             ValueError: if no off-diagonal entry is below infinity, e.g. when
                 ``M`` holds fewer than two clusters.  Previously this case
                 failed with an UnboundLocalError on the return statement.
         """
         minDist = inf
         minI = minJ = None
         m = shape(M)[0]
         for i in range(m):
             for j in range(m):
                 if i != j and M[i, j] < minDist:
                     minDist = M[i, j]
                     minI, minJ = i, j
         if minI is None:
             raise ValueError(
                 "findMin needs at least two clusters with a finite distance")
         return minI, minJ, minDist
    101 
    102 
    103 # 层次聚类算法
    def hCluster(dataSet, k, dist, distMeas=distAvg):
        """Agglomerative clustering that merges until clusters are ``dist`` apart.

        Every sample starts as its own cluster.  The two closest clusters are
        merged repeatedly while their distance is strictly below ``dist`` and
        more than one cluster remains.

        Fixes compared with the earlier loop:
          * the threshold is checked BEFORE merging, so a pair whose distance is
            at or above ``dist`` is no longer merged;
          * the loop stops at one cluster instead of calling ``findMin`` on a
            1x1 matrix.

        Args:
            dataSet: 2-D numpy matrix/array, one sample per row.
            k: unused; kept so existing callers keep working.
            dist: merge threshold on the cluster distance.
            distMeas: callable taking the rows of two clusters and returning
                their distance.

        Returns:
            ``(clusterAssment, performMeasure)``: an m x 1 matrix holding the
            cluster label of each sample (labels run from 0 to clusters - 1),
            and a matrix with one row per merge:
            ``[clusters left, merge distance, DBI, DI]``.  DBI and DI are
            placeholders and always 0.
        """
        m = shape(dataSet)[0]
        # Initially sample i carries label i.
        clusterAssment = asmatrix(arange(m, dtype=float).reshape(m, 1))
        performMeasure = []
        M = asmatrix(zeros((m, m)))  # symmetric cluster-distance matrix

        def members(label):
            # Rows of dataSet currently assigned to cluster ``label``.
            return dataSet[nonzero(clusterAssment[:, 0].A == label)[0], :]

        for i in range(m):
            clusterI = members(i)
            for j in range(i + 1, m):
                M[i, j] = distMeas(clusterI, members(j))
                M[j, i] = M[i, j]
            if mod(i, 10) == 0: print(i)  # progress output

        q = m  # current number of clusters
        while q > 1:
            i, j, minDist = findMin(M)  # closest pair of clusters
            if not minDist < dist:
                break  # nothing left that is close enough to merge
            if i > j:
                # Keep the smaller label so it survives the renumbering below.
                i, j = j, i
            # Merge cluster j into cluster i.
            clusterAssment[nonzero(clusterAssment[:, 0].A == j)[0], 0] = i
            for l in range(j + 1, q):  # shift labels after j down by one
                clusterAssment[nonzero(clusterAssment[:, 0].A == l)[0], 0] = l - 1
            M = delete(M, j, axis=0)
            M = delete(M, j, axis=1)
            q = q - 1

            # Refresh the distances between the merged cluster and the others.
            merged = members(i)
            for l in range(q):
                if l == i:
                    continue  # the diagonal is never read by findMin
                M[i, l] = distMeas(merged, members(l))
                M[l, i] = M[i, l]

            # Placeholders: no cluster-quality index is computed here.
            DBI = 0
            DI = 0
            performMeasure.append([q, minDist, DBI, DI])

            print(u'当前簇的个数是:', q)
            print(u'距离最小的两个簇是第%d个和第%d个,距离是%f,DBI值是%f,DI值是%f' % (
                i, j, minDist, DBI, DI))

        return clusterAssment, asmatrix(performMeasure)
    151 
    def saveResult(clusterAssment):
        """Print every row of the cluster assignment as a list of strings.

        Despite its name this only writes to stdout; nothing is saved to disk.

        Args:
            clusterAssment: object with a ``tolist()`` method that returns a
                list of rows (for example a numpy matrix or 2-D array).
        """
        for row in clusterAssment.tolist():
            # list(...) so Python 3 prints the values, not a map object repr.
            print(list(map(str, row)))
    156 
    157 
    if __name__ == '__main__':
        # Demo run.  Each string below is passed as one document, so it is
        # iterated character by character by calTFIDF.
        a = ["实施", "效益", "节本", "10"]
        # asmatrix instead of the ``mat`` alias, which NumPy 2.x no longer has.
        m = asmatrix(calTFIDF(a))
        clustAssing, performMeasure = hCluster(m, 0, 0.3)
        print(clustAssing)
        saveResult(clustAssing)
  • 相关阅读:
    POJ 2236 Wireless Network(并查集)
    POJ 2010 Moo University
    POJ 3614 Sunscreen(贪心,区间单点匹配)
    POJ 2184 Cow Exhibition(背包)
    POJ 1631 Bridging signals(LIS的等价表述)
    POJ 3181 Dollar Dayz(递推,两个long long)
    POJ 3046 Ant Counting(递推,和号优化)
    POJ 3280 Cheapest Palindrome(区间dp)
    POJ 3616 Milking Time(dp)
    POJ 2385 Apple Catching(01背包)
  • 原文地址:https://www.cnblogs.com/smartisn/p/12522983.html
Copyright © 2011-2022 走看看