上一篇,已经实现了如何将一条语句在一个语料库中比较相似度,
发现运行的时候每次都要编译语料库,通过查找资料,可以一次性编译成语料库,存入文件
编译语料库代码 11_k.py
# --- 11_k.py: build the corpus once and persist it to disk (Python 2 script) ---
import sys
import jieba
# Python 2-only hack: reload(sys) restores setdefaultencoding (removed by site.py)
# so implicit str<->unicode conversions use UTF-8. Not available in Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")
from gensim import corpora,models,similarities
alist = []  # global: one space-joined token string per document, filled by fenci()
import json
def fenci():
    """Tokenize every document in xaa.json into the global ``alist``.

    Each line of xaa.json is a JSON object whose "content" field holds the
    document text.  The text is segmented with jieba (full mode) and the
    tokens are appended to ``alist`` as one space-joined string per document.
    """
    # Fix: original called open() directly in the for statement and never
    # closed the file handle; use a context manager instead.
    with open("xaa.json") as f:
        for i_text in f:
            f_json = json.loads(i_text)
            kk = f_json["content"]
            # encode back to a UTF-8 byte string for jieba (Python 2 quirk)
            item_str = jieba.cut(kk.encode("utf-8"), cut_all=True)
            alist.append(" ".join(item_str))
fenci()
class MyCorpus(object):
    """Re-iterable view over the tokenized documents in the global ``alist``.

    Each iteration yields one document as a list of token strings, which is
    the format gensim's Dictionary and doc2bow expect.
    """
    def __iter__(self):
        for doc in alist:
            yield doc.split(' ')
Corp = MyCorpus()
# Build the token dictionary from the corpus and persist both the dictionary
# and the bag-of-words corpus so 11_main.py can load them without re-tokenizing.
dictionary = corpora.Dictionary(Corp)
#dictionary = corpora.Dictionary(Corp)
dictionary.save("bbb.dict") # save the dictionary to local disk
corpus = [dictionary.doc2bow(text) for text in Corp]
corpora.MmCorpus.serialize('deerwester1.mm', corpus) # persist to disk for later reuse
编译好了 bbb.dict 和 deerwester1.mm 文件,在下一个代码 11_main.py 中直接调用
# --- 11_main.py: load the persisted corpus and answer similarity queries ---
import jieba
from gensim import corpora, models, similarities
dictionary = corpora.Dictionary.load('bbb.dict') # load the dictionary built by 11_k.py
corpus = corpora.MmCorpus('deerwester1.mm')  # load the serialized bag-of-words corpus
tfidf = models.TfidfModel(corpus)  # fit TF-IDF weights on the corpus
corpus_tfidf = tfidf[corpus]  # TF-IDF representation of every document
def read_file(i):
with open("xaa.json","rt") as f:
y = f.readlines()[i]
print y
def test_kk(test):
test_data_1 = '请假洛阳牡丹'
test_cut_raw_1 = jieba.cut(test)
doc_new = " ".join(test_cut_raw_1)
test_corpus_1 = dictionary.doc2bow(doc_new.split())
vec_tfidf = tfidf[test_corpus_1]
index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[vec_tfidf]
similarit = list(sims)
#print(list(enumerate(sims)))
sims = sorted(enumerate(sims), key=lambda item: -item[1])
for i in range(20): #只读取前20 个数据,
print sims[i] #相似度是与元组的形式存在
k = sims[i]
read_file(k[0]) #将相似文件中相似的语句打印出来
def buss_mian():
    """Interactive REPL: read a query from stdin and run the similarity search.

    Loops forever; stop with Ctrl-C.  (Name looks like a typo for
    ``buss_main`` but is kept so the entry point below keeps working.)
    """
    while True:
        test_kk(raw_input("please input test:"))
if __name__ == "__main__":
    buss_mian()