# -*- coding:utf-8 -*-
# @Author : chenfei
# @time : 2020/10/23 15:30
# @File : 读取word统计词频输出excel.PY
# @Software : PyCharm
import docx
import jieba
from collections import Counter
import pandas as pd
# Word-frequency pipeline: read a .docx file, segment its text with jieba,
# count the resulting tokens, and export the sorted counts to Excel.

# Load the Word document and flatten all paragraph texts into a single
# space-separated string.
# NOTE(review): the path literal below contains no directory separators;
# confirm that it points at the intended file.
document = docx.Document(r'D:免安装使用WeChatDownload v20200423曹政的梦呓合集10.docx')
content = ' '.join(para.text for para in document.paragraphs)
print(len(content))

# Segment the text with jieba (cut_all=False is passed explicitly).
seg_list = jieba.cut(content, cut_all=False)
print(type(seg_list))

# Keep only tokens longer than one character; this drops single-character
# tokens such as lone punctuation marks and single characters.
seg_list = list(filter(lambda token: len(token) > 1, seg_list))

# Tally how many times each remaining token occurs.
counter = Counter()
counter.update(seg_list)

# Build a two-column table (word, count) from the tallies and order it so
# that the most frequent words come first.
df = pd.DataFrame(data=list(counter.items()), columns=['word', 'count'])
df = df.sort_values(by='count', ascending=False)
print(df.head(5))

# Write the table to an Excel workbook, omitting the DataFrame index.
df.to_excel('分析结果-词频数据.xlsx', index=False)