Sklearn库的学习之TF-IDF算法:
# coding:utf-8
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
# Load the six input files. Each file is read into a list of its lines
# (trailing newlines included), which later serve as the corpus documents.
# `with` guarantees that every handle is closed once its lines are read,
# even if reading fails part-way. The file-object names (one, two, ...)
# remain bound afterwards, referring to the now-closed handles.
with open(r'one.txt', encoding="utf-8") as one:
    onee = list(one)
with open(r'two.txt', encoding="utf-8") as two:
    twoo = list(two)
with open(r'three.txt', encoding="utf-8") as three:
    threee = list(three)
with open(r'four.txt', encoding="utf-8") as four:
    fourr = list(four)
with open(r'five.txt', encoding="utf-8") as five:
    fivee = list(five)
with open(r'six.txt', encoding="utf-8") as six:
    sixx = list(six)
if __name__ == "__main__":
    # Every line read from the six input files is one document of the corpus.
    corpus = onee + twoo + threee + fourr + fivee + sixx

    # CountVectorizer turns the documents into a term-count matrix:
    # element a[i][j] is the count of term j in document i.
    vectorizer = CountVectorizer()
    # TfidfTransformer converts the raw term counts into tf-idf weights.
    transformer = TfidfTransformer()
    # Inner fit_transform: text -> term counts; outer: counts -> tf-idf.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # All terms of the bag-of-words vocabulary, in column order.
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        word = vectorizer.get_feature_names_out()
    else:
        word = vectorizer.get_feature_names()

    # Dense tf-idf matrix: weight[i][j] is the weight of term j in document i.
    weight = tfidf.toarray()

    # Print, per document, every term whose tf-idf weight is at least 0.01.
    for i, row in enumerate(weight):
        print(u"-------这里输出第",i,u"类文本的词语tf-idf权重------")
        for term, score in zip(word, row):
            if score >= 0.01:
                print(term, score)
运行结果:
-------这里输出第 0 类文本的词语tf-idf权重------
app 0.011288689335804462
上架 0.025273080237494572
上级 0.017429710508616948
业务 0.02926738889571929
业绩 0.030429369596293752
亚马逊 0.015323620488825733
交易数据 0.020367968635621465
产品 0.2279660886939525
京东 0.042484919364753806
人员 0.01437951116960898
优化 0.1417616454700845
促销 0.022876495042559744
促销活动 0.027161298875928074
保证 0.016848720158329715
信息 0.046987594579479854
做出 0.012927035293890903
做好 0.029049517514361578
#省略后面n行输出