"""Scrape news headlines, print the most frequent words, and draw a word cloud.

The script performs these steps in order:

1. Download the news index page and collect the text of every
   ``<figcaption class="title">`` element.
2. Segment the text with jieba and count every token whose length is not 1.
3. Print the ten most frequent tokens (fewer if fewer exist).
4. Save the kept tokens to ``ms.txt`` and render them as a word cloud that is
   shaped and coloured by ``cloud.jpg``.
"""
import re
from collections import Counter

import requests
from bs4 import BeautifulSoup
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

NEWS_URL = 'http://www.kejixun.com/news/index.html'
WORDS_FILE = 'ms.txt'
MASK_IMAGE = 'cloud.jpg'
# A font containing CJK glyphs is required, otherwise the cloud shows boxes.
FONT_PATH = 'C:/Users/Windows/fonts/msyh.ttf'
TOP_N = 10

# --- 1. Fetch the page and gather the headline text -------------------------
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops us from silently parsing an HTTP error page.
html = requests.get(NEWS_URL, timeout=10)
html.raise_for_status()
html.encoding = 'gb2312'  # the script decodes the page as GB2312
soup = BeautifulSoup(html.text, 'html.parser')
news = "".join(
    p.get_text() for p in soup.find_all("figcaption", class_='title')
)

# --- 2. Segment and count ----------------------------------------------------
# Single-character tokens are dropped, exactly as the length check did before.
words = jieba.lcut(news)
ls = [word for word in words if len(word) != 1]
counts = Counter(ls)

if not ls:
    # WordCloud cannot be generated from empty text; fail with a clear message
    # instead of an IndexError / ValueError further down.
    raise SystemExit("No headline words were found at " + NEWS_URL)

# --- 3. Report the most frequent words --------------------------------------
# most_common() copes with fewer than TOP_N distinct words, where indexing
# items[0..9] directly would raise IndexError.
for word, count in counts.most_common(TOP_N):
    print("{:<10}{:>5}".format(word, count))

# --- 4. Persist the tokens and build the word cloud -------------------------
# Tokens are written space-separated (not as a Python list repr) so that no
# brackets, quotes or commas end up in the text handed to WordCloud.
# An explicit encoding avoids depending on the platform's locale default.
with open(WORDS_FILE, 'w', encoding='utf-8') as out_file:
    out_file.write(" ".join(ls))

with open(WORDS_FILE, 'r', encoding='utf-8') as in_file:
    wz = in_file.read()

background_image = plt.imread(MASK_IMAGE)
wc = WordCloud(
    background_color='white',
    mask=background_image,
    max_words=2000,
    stopwords=STOPWORDS,
    font_path=FONT_PATH,
    max_font_size=200,
    random_state=30,
)
wc.generate(wz)

# Recolour the words using the colours of the mask image.
image_colors = ImageColorGenerator(background_image)
wc.recolor(color_func=image_colors)

plt.imshow(wc)
plt.axis('off')
plt.show()
# 捕获关键词 (capture keywords)
# 生成词云 (generate word cloud)