from pprint import pprint
import xlrd  # read the Excel data
import re
import jieba  # use jieba for Chinese word segmentation
path = r"D:1研1大四2020.3.13-国家突发卫生事件20201008lda.xlsx"  # change this to your own file path
data = xlrd.open_workbook(path)
sheet_1_by_index = data.sheet_by_index(0)  # read the first sheet
title = sheet_1_by_index.col_values(1)  # the second column (document titles)
n_of_rows = sheet_1_by_index.nrows
doc_set = []  # empty list to hold the documents
for i in range(1, n_of_rows):  # read row by row, skipping the header row
    doc_set.append(title[i])
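# Note: xlrd 2.x dropped support for .xlsx files, so the code above assumes an older
# xlrd (<= 1.2). A minimal alternative sketch, assuming pandas and openpyxl are
# installed (not part of the original code):
#import pandas as pd
#df = pd.read_excel(path, sheet_name=0)          # first sheet, first row used as header
#doc_set = df.iloc[:, 1].astype(str).tolist()    # second column, one title per document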
# load the stop-word list from a file
def stopwordslist(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        stopwords = [line.strip() for line in f.readlines()]
    return stopwords
stopwords=stopwordslist(r"D:1研1大四2020.3.13-国家突发卫生事件20201008stopwords.txt")
texts = []  # list of keyword lists, one per document
stpwrdlst2 = ['和', '等', '对', '的', '不', '与', '一', '化']  # a second, hand-picked stop-word list of words I also wanted removed
for doc in doc_set:
    # keep Chinese characters only
    cleaned_doc = ''.join(re.findall(r'[\u4e00-\u9fa5]', doc))
    # word segmentation
    doc_cut = jieba.lcut(cleaned_doc)
    # remove stop words and single-character tokens
    text_list0 = [word for word in doc_cut if word not in stopwords and len(word) > 1]
    text_list1 = [word for word in text_list0 if word not in stpwrdlst2]
    #if len(doc_cut)>1 and doc_cut not in stopwords:
    #    texts.append(doc_cut)
    # the fully processed tokens for each document end up in texts
    texts.append(text_list1)
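# Optional sanity check (not in the original): peek at the processed tokens to confirm
# that cleaning, segmentation and stop-word removal behaved as expected.
print(len(texts), 'documents processed')
pprint(texts[0][:20])  # first 20 tokens of the first document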
# use the gensim library to build the term-document structures
import gensim
from gensim import corpora
# build the dictionary, mapping every processed token to an integer id
dictionary = corpora.Dictionary(texts)
# build the document-term frequency representation, i.e. the bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in texts]
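# Optional illustration (not in the original): each entry of corpus is a list of
# (token_id, count) pairs; dictionary[token_id] maps an id back to its word.
print(corpus[0])  # bag-of-words of the first document
print([(dictionary[token_id], count) for token_id, count in corpus[0]])  # readable (word, count) pairs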
#print('Document-term frequency matrix:')
#pprint(corpus)
#pprint(corpus[0:19])
#for c in corpus:
#    print(c)
# convert the bag-of-words corpus to a dense term-document matrix (rows = terms, columns = documents)
from gensim.matutils import corpus2dense
corpus_matrix = corpus2dense(corpus, num_terms=len(dictionary))
print(corpus_matrix)
[[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 1. ... 1. 0. 0.]
...
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]]
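# corpus2dense materialises the full num_terms x num_docs array, which can get large for
# many documents. A sketch of a genuinely sparse alternative, assuming scipy is installed
# (corpus2csc returns a scipy.sparse CSC matrix with the same terms-by-documents shape):
from gensim.matutils import corpus2csc
sparse_matrix = corpus2csc(corpus, num_terms=len(dictionary))
print(sparse_matrix.shape)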