from pprint import pprint
import xlrd  # read the Excel data
import re
import jieba  # use jieba for Chinese word segmentation
path = r"D:1研1大四2020.3.13-国家突发卫生事件20201008lda.xlsx"  # change this to your own file path
data = xlrd.open_workbook(path)
sheet_1_by_index = data.sheet_by_index(0)  # read the first sheet
title = sheet_1_by_index.col_values(1)  # the second column (document titles)
n_of_rows = sheet_1_by_index.nrows
doc_set = []  # empty list to hold the documents
for i in range(1, n_of_rows):  # read row by row, skipping the header row
    doc_set.append(title[i])
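# Note: xlrd 2.x dropped support for .xlsx files, so the code above assumes an older
# xlrd (<= 1.2). A minimal alternative sketch, assuming pandas and openpyxl are
# installed (not part of the original code):
#import pandas as pd
#df = pd.read_excel(path, sheet_name=0)          # first sheet, first row used as header
#doc_set = df.iloc[:, 1].astype(str).tolist()    # second column, one title per document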
# load the stop-word list from a file
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords
stopwords=stopwordslist(r"D:1研1大四2020.3.13-国家突发卫生事件20201008stopwords.txt")
texts = []  # list of keyword lists, one per document
stpwrdlst2 = ['和', '等', '对', '的', '不', '与', '一', '化']  # a second, hand-picked stop-word list of words I also wanted removed
for doc in doc_set:
    # keep Chinese characters only
    cleaned_doc = ''.join(re.findall(r'[\u4e00-\u9fa5]', doc))
    # word segmentation
    doc_cut = jieba.lcut(cleaned_doc)
    # remove stop words and single-character tokens
    text_list0 = [word for word in doc_cut if word not in stopwords and len(word) > 1]
    text_list1 = [word for word in text_list0 if word not in stpwrdlst2]
    #if len(doc_cut)>1 and doc_cut not in stopwords:
    #    texts.append(doc_cut)
    # the fully processed tokens for each document end up in texts
    texts.append(text_list1)
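# Optional sanity check (not in the original): peek at the processed tokens to confirm
# that cleaning, segmentation and stop-word removal behaved as expected.
print(len(texts), 'documents processed')
pprint(texts[0][:20])  # first 20 tokens of the first document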
# use the gensim library to build the term-document structures
import gensim
from gensim import corpora
# build the dictionary, mapping every processed token to an integer id
dictionary = corpora.Dictionary(texts)
# build the document-term frequency representation, i.e. the bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]
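# Optional illustration (not in the original): each entry of corpus is a list of
# (token_id, count) pairs; dictionary[token_id] maps an id back to its word.
print(corpus[0])  # bag-of-words of the first document
print([(dictionary[token_id], count) for token_id, count in corpus[0]])  # readable (word, count) pairs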
#print('Document-term frequency matrix:')
#pprint(corpus)
#pprint(corpus[0:19])
#for c in corpus:
#    print(c)
# convert the bag-of-words corpus to a dense term-document matrix (rows = terms, columns = documents)
from gensim.matutils import corpus2dense
corpus_matrix = corpus2dense(corpus, num_terms=len(dictionary))
print(corpus_matrix)
[[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 1. ... 1. 0. 0.]
...
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]]
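# corpus2dense materialises the full num_terms x num_docs array, which can get large for
# many documents. A sketch of a genuinely sparse alternative, assuming scipy is installed
# (corpus2csc returns a scipy.sparse CSC matrix with the same terms-by-documents shape):
from gensim.matutils import corpus2csc
sparse_matrix = corpus2csc(corpus, num_terms=len(dictionary))
print(sparse_matrix.shape)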