  • Word Segmentation and Word Cloud Design

    1. Basic usage of jieba

        import jieba


        s1 = '我喜欢广州小蛮腰'
        s2 = '我喜欢上海东方明珠'
        # jieba.cut() uses precise mode by default
        print(10*'-', 'full mode', 10*'-')
        r1 = jieba.cut(s1, cut_all=True)  # full mode
        print(s1)
        for ele in r1:
            print(ele)

        print(10*'-', 'precise mode', 10*'-')
        r2 = jieba.cut(s1, cut_all=False)
        print(s1)
        for ele in r2:
            print(ele)

        print(10 * '-', 'search engine mode', 10 * '-')
        r3 = jieba.cut_for_search(s1)  # search engine mode
        for ele in r3:
            print(ele)

        # Part-of-speech tagging
        import jieba.posseg
        print('--- part-of-speech tagging ---')
        r4 = jieba.posseg.cut(s1)
        # word: the token; flag: its part-of-speech tag
        for ele in r4:
            print(ele.word + ele.flag)

        # Load a custom dictionary
        jieba.load_userdict('/home/chen/anaconda3/lib/python3.6/site-packages/jieba/dict.txt')
        # Adjust word frequency
        print('--- adjust word frequency ---')
        word_chg = '上海东方'
        jieba.add_word(word_chg)
        jieba.suggest_freq(word_chg, True)  # tune so the phrase is kept as one token

        # Extract keywords
        import jieba.analyse
        tag = jieba.analyse.extract_tags(s2, 3)  # top 3 keywords
        print(tag)

        # Return the position of each word
        print('--- word positions ---')
        word_index = jieba.tokenize(s2)
        for ele in word_index:
            print(ele)
        print('--- word positions, search engine mode ---')
        word_index = jieba.tokenize(s2, mode='search')
        for ele in word_index:
            print(ele)
        '''
        POS tag reference:
        a  adjective
        c  conjunction
        d  adverb
        e  interjection
        f  locative
        i  idiom
        m  numeral/quantifier
        n  noun
        nr person name
        ns place name
        nt organization
        nz other proper noun
        p  preposition
        r  pronoun
        t  time word
        u  particle
        v  verb
        vn gerund
        w  punctuation
        un unknown word
        '''
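    The cut functions above return generators. jieba also provides lcut counterparts that return plain lists, which is convenient for quick inspection; a minimal sketch:

        import jieba

        s = '我喜欢广州小蛮腰'
        print(jieba.lcut(s))                 # precise mode, as a list
        print(jieba.lcut(s, cut_all=True))   # full mode
        print(jieba.lcut_for_search(s))      # search engine mode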

    2. Text mining example

        import jieba.analyse
        import os

        print(os.path.dirname(os.getcwd()))
        file_name = '/home/chen/projects/词云/resource/斗罗大陆.txt'

        with open(file_name) as fp:
            data = fp.read()
        # extract_tags returns 20 keywords by default
        tag = jieba.analyse.extract_tags(data, 50)  # extract 50 keywords
        print(tag)
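    To see why each keyword was picked, extract_tags can also return the TF-IDF weight of every term; a minimal sketch (same novel file as above; the allowPOS values are example choices):

        import jieba.analyse

        file_name = '/home/chen/projects/词云/resource/斗罗大陆.txt'
        with open(file_name) as fp:
            data = fp.read()

        # withWeight=True yields (keyword, TF-IDF weight) pairs;
        # allowPOS restricts results to the listed part-of-speech tags
        for word, weight in jieba.analyse.extract_tags(data, topK=10, withWeight=True,
                                                       allowPOS=('n', 'nr', 'ns')):
            print(word, round(weight, 4))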

    3. Word cloud design

        # Data collection --> segmentation --> stop-word filtering --> high-frequency word selection --> image generation

        from wordcloud import WordCloud
        import os
        import jieba  # Chinese word segmentation
        import numpy as np
        import PIL.Image as img  # image handling
        import random


        stopwords = {"然后", "这些", "那些", "如果"}  # noise words to filter out
        cur_path = os.path.dirname(os.path.dirname(__file__)) + "/resource"

        def chinese_jieba(text):
            # Segment the text and rejoin with spaces, the format WordCloud expects
            wordlist_jieba = jieba.cut(text)
            text_jieba = " ".join(wordlist_jieba)
            return text_jieba

        with open(os.path.join(cur_path, 'test.txt')) as fp:
            text = fp.read()
            text = chinese_jieba(text)
            r_num = random.randrange(0, 5)
            r_num = 4  # pin the choice to love_love.png; remove this line to pick at random
            if r_num == 0:
                bg_img = 'zh.png'
            elif r_num == 1:
                bg_img = 'heart.png'
            elif r_num == 2:
                bg_img = 'broken_heart.png'
            elif r_num == 3:
                bg_img = 'tree.png'
            else:
                bg_img = 'love_love.png'
            # Load the background shape image as a mask
            mask_pic = np.array(img.open(os.path.join(cur_path, bg_img)))
            # The default generates a rectangular cloud on a black background
            # wcd = WordCloud().generate(text)
            # Set background color, word count cap, max font size, font, stop words, and mask shape
            wcd = WordCloud(background_color='white',
                            # max_words=100,
                            max_font_size=40,
                            font_path='JDFZONGYI.ttf',
                            stopwords=stopwords,
                            mask=mask_pic).generate(text)
            image = wcd.to_image()
            image.show()
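    Beyond opening an interactive viewer, the cloud can be recolored from the mask image and written straight to disk. A minimal sketch continuing from the script above (wcd, mask_pic, and cur_path are already defined; the output filename is a made-up example):

        from wordcloud import ImageColorGenerator

        # Sample colors from the mask image and apply them to the words
        wcd.recolor(color_func=ImageColorGenerator(mask_pic))
        # Save as PNG instead of (or in addition to) showing it
        wcd.to_file(os.path.join(cur_path, 'wordcloud_output.png'))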
  • Original article: https://www.cnblogs.com/ray-mmss/p/9377090.html