  • Chinese Word Frequency Statistics and Word Cloud Generation

    1. Download a full-length Chinese novel

      The novel used here is: The Three-Body Problem (三体)

    2. Read the text to be analyzed from a file

    text = open(r"C:\三体.txt", "r", encoding="UTF-8").read()      # read the novel text
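      The same read can also be done with a context manager so the file handle is closed automatically; a minimal equivalent sketch (the path is the same placeholder as above):

    with open(r"C:\三体.txt", "r", encoding="UTF-8") as f:         # closed automatically when the block exits
        text = f.read()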

    3. Install and use jieba for Chinese word segmentation

      Install jieba from the command line with: pip install jieba

    import jieba


    text = open(r"C:\三体.txt", "r", encoding="UTF-8").read()      # read the novel text
    word_txt = jieba.lcut(text)                                    # segment the Chinese text into words
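      As a quick sanity check (a toy sentence, not from the novel), jieba.lcut returns the segmentation as a plain Python list:

    import jieba

    print(jieba.lcut("我来到北京清华大学"))
    # typically prints: ['我', '来到', '北京', '清华大学'] (the exact split depends on the dictionary version)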

    4. Update the dictionary with vocabulary specific to the analyzed text

    jieba.load_userdict(r'C:\三体词汇.txt')                        # load the novel-specific vocabulary
    jieba.add_word("量子力学")                                     # add individual terms ("quantum mechanics")
    jieba.add_word("万有引力")                                     # ("universal gravitation")

      Dictionary download: https://pinyin.sogou.com/dict/
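      jieba's user dictionary is a plain UTF-8 text file with one entry per line; a frequency and a part-of-speech tag may optionally follow the word. A sketch of what C:\三体词汇.txt could look like (the entries below are illustrative, not the actual downloaded dictionary):

    三体
    智子 10 n
    面壁者 10 n
    黑暗森林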

      Conversion script (Sogou scel format to txt format):

      # -*- coding: utf-8 -*-
      import struct
      import os

      # offset of the pinyin table
      startPy = 0x1540

      # offset of the Chinese word table
      startChinese = 0x2628

      # global pinyin table
      GPy_Table = {}

      # parse result:
      # a list of tuples (frequency, pinyin, Chinese word)


      # convert raw bytes (UTF-16-LE) to a string
      def byte2str(data):
          pos = 0
          ret = ''
          while pos < len(data):
              c = chr(struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0])
              if c != chr(0):
                  ret += c
              pos += 2
          return ret

      # read the pinyin table
      def getPyTable(data):
          data = data[4:]
          pos = 0
          while pos < len(data):
              index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
              pos += 2
              lenPy = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
              pos += 2
              py = byte2str(data[pos:pos + lenPy])

              GPy_Table[index] = py
              pos += lenPy

      # get the pinyin of one word
      def getWordPy(data):
          pos = 0
          ret = ''
          while pos < len(data):
              index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
              ret += GPy_Table[index]
              pos += 2
          return ret

      # read the Chinese word table
      def getChinese(data):
          GTable = []
          pos = 0
          while pos < len(data):
              # number of homophones
              same = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

              # length of the pinyin index table
              pos += 2
              py_table_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

              # pinyin index table
              pos += 2
              py = getWordPy(data[pos: pos + py_table_len])

              # Chinese words
              pos += py_table_len
              for i in range(same):
                  # length of the Chinese word
                  c_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
                  # the Chinese word itself
                  pos += 2
                  word = byte2str(data[pos: pos + c_len])
                  # length of the extension data
                  pos += c_len
                  ext_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
                  # word frequency
                  pos += 2
                  count = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

                  # save the entry
                  GTable.append((count, py, word))

                  # jump to the offset of the next word
                  pos += ext_len
          return GTable


      def scel2txt(file_name):
          print('-' * 60)
          with open(file_name, 'rb') as f:
              data = f.read()

          print("Dictionary name:", byte2str(data[0x130:0x338]))  # .encode('GB18030')
          print("Dictionary type:", byte2str(data[0x338:0x540]))
          print("Description:", byte2str(data[0x540:0xd40]))
          print("Example words:", byte2str(data[0xd40:startPy]))

          getPyTable(data[startPy:startChinese])
          return getChinese(data[startChinese:])

      if __name__ == '__main__':
          # folder containing the .scel files
          in_path = r"C:\Users\Administrator\Downloads"           # change to the folder holding your dictionary files
          # output folder for the converted dictionaries
          out_path = r"C:\Users\Administrator\Downloads\text"     # change to the folder for the converted files
          fin = [fname for fname in os.listdir(in_path) if fname[-5:] == ".scel"]
          for f in fin:
              try:
                  for word in scel2txt(os.path.join(in_path, f)):
                      file_path = os.path.join(out_path, str(f).split('.')[0] + '.txt')
                      # save the result
                      with open(file_path, 'a+', encoding='utf-8') as file:
                          file.write(word[2] + '\n')
                  os.remove(os.path.join(in_path, f))
              except Exception as e:
                  print(e)
                  pass
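      After conversion there is one .txt file per .scel dictionary. A minimal sketch for merging them into the single user dictionary file loaded in step 4 (the folder and file names are the same placeholders used above and are assumptions):

      import os

      out_path = r"C:\Users\Administrator\Downloads\text"         # folder written by the converter above
      with open(r"C:\三体词汇.txt", "w", encoding="utf-8") as merged:
          for fname in os.listdir(out_path):
              if fname.endswith(".txt"):
                  with open(os.path.join(out_path, fname), encoding="utf-8") as part:
                      merged.write(part.read())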

    5. Generate the word frequency statistics

    word_dict = {}
    word_lists = []
    for word in word_list:
        if len(word) == 1:                                                     # skip single-character tokens
            continue
        else:
            word_lists.append(word)
            word_dict[word] = word_dict.get(word, 0)+1                         # get() returns the value for the key, or the default if the key is missing

    6. Sort

    wd = list(word_dict.items())                                               # turn the dict into a list so it can be sorted
    wd.sort(key=lambda x: x[1], reverse=True)                                  # sort by the dict values (the counts)
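      Steps 5 and 6 can also be collapsed into a couple of lines with collections.Counter from the standard library; a minimal equivalent sketch (it assumes the filtered word_list from step 7 is already available):

    from collections import Counter

    word_dict = Counter(w for w in word_list if len(w) > 1)   # count, skipping single-character tokens
    wd = word_dict.most_common()                              # already sorted by descending frequency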

    7. Exclude grammatical words: pronouns, articles, conjunctions and other stop words

    stops_word = open(r"C:\stops_chinese.txt", "r", encoding="UTF-8").read()   # read the stop word file
    exclude = {'两个', '东西', '很快', '一种', '这是', '看着', '真的', '发出', '回答',
               '感觉', '仿佛', '\u3000', '\n', ''}                             # custom stop words
    stop_list = stops_word.split()
    stops_all = set(stop_list).union(exclude)                                  # union of the two stop word sets
    word_list = [element for element in word_txt if element not in stops_all]  # drop the stop words

    8. Output the top 20 most frequent words and save the result to a file

    for i in range(20):                                                        # print the 20 most frequent words
        print(wd[i])
    word_csv = wd                                                              # write the result to a csv file
    pd.DataFrame(data=word_csv[0:20]).to_csv('The_three_body.csv', encoding='UTF-8')
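      An optional refinement (a sketch, not part of the original script): giving the DataFrame column names makes the CSV easier to read, and index=False drops the row-index column:

    import pandas as pd

    pd.DataFrame(wd[:20], columns=['word', 'count']).to_csv(
        'The_three_body.csv', index=False, encoding='UTF-8')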

    9. Generate the word cloud

       Complete source code:

    from wordcloud import WordCloud

    import matplotlib.pyplot as plt
    import jieba
    import pandas as pd

    text = open(r"C:\三体.txt", "r", encoding="UTF-8").read()                  # read the novel text
    text = text.strip()
    jieba.load_userdict(r'C:\三体词汇.txt')                                    # load the novel-specific vocabulary (before segmenting)
    jieba.add_word("量子力学")                                                 # add individual terms
    jieba.add_word("万有引力")
    # jieba.add_word('')                                                       # add further novel-specific words here
    word_txt = jieba.lcut(text)                                                # segment the Chinese text into words
    stops_word = open(r"C:\stops_chinese.txt", "r", encoding="UTF-8").read()   # read the stop word file
    exclude = {'两个', '东西', '很快', '一种', '这是', '看着', '真的', '发出', '回答',
               '感觉', '仿佛', '\u3000', '\n', ''}                             # custom stop words
    stop_list = stops_word.split()
    stops_all = set(stop_list).union(exclude)                                  # union of the two stop word sets
    word_list = [element for element in word_txt if element not in stops_all]  # drop the stop words
    word_dict = {}
    word_lists = []
    for word in word_list:
        if len(word) == 1:                                                     # skip single-character tokens
            continue
        else:
            word_lists.append(word)
            word_dict[word] = word_dict.get(word, 0)+1                         # get() returns the value for the key, or the default if the key is missing
    wd = list(word_dict.items())                                               # turn the dict into a list so it can be sorted
    wd.sort(key=lambda x: x[1], reverse=True)                                  # sort by the dict values (the counts)
    for i in range(20):                                                        # print the 20 most frequent words
        print(wd[i])
    word_csv = wd                                                              # write the result to a csv file
    pd.DataFrame(data=word_csv[0:20]).to_csv('The_three_body.csv', encoding='UTF-8')

    # join the kept words with spaces so WordCloud can tokenize them
    mywc = WordCloud(font_path="C:/Windows/Fonts/msyh.ttc", background_color='black', margin=2, width=1800, height=800, random_state=42).generate(' '.join(word_lists))
    plt.imshow(mywc, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout()
    mywc.to_file('三体词云.png')                                               # save the word cloud image
    plt.show()
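      Because the word counts are already in word_dict, the cloud can also be built directly from those frequencies instead of re-tokenizing the joined text; a minimal sketch using WordCloud.generate_from_frequencies with the same font and size parameters as above:

    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    mywc = WordCloud(font_path="C:/Windows/Fonts/msyh.ttc", background_color='black',
                     width=1800, height=800, random_state=42).generate_from_frequencies(word_dict)
    plt.imshow(mywc, interpolation='bilinear')
    plt.axis("off")
    plt.show()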
  • Original article: https://www.cnblogs.com/trojans/p/10590620.html