爬取酷狗音乐榜单,并用词云展示结果。全文分为两部分:爬虫部分和可视化部分。爬虫部分用openpyxl保存数据——把数据保存到excel一直是我的薄弱环节,所以这次用面向对象的写法练习一下。
import openpyxl
import requests
import parsel


def create_workbook():
    """Create ``202201122test.xlsx`` holding a single empty sheet named ``summary``."""
    wb = openpyxl.Workbook()  # a new workbook starts with one default sheet, 'Sheet'
    wb.create_sheet(title='summary', index=0)
    wb.remove(wb['Sheet'])  # drop the default sheet so only 'summary' remains
    # Save first, then close: closing is cleanup and belongs after the write.
    wb.save('202201122test.xlsx')
    wb.close()


def open_workbook_and_write_header():
    """Append the header row to the ``summary`` sheet and save as ``anothertest.xlsx``."""
    wb = openpyxl.load_workbook('202201122test.xlsx')
    ws = wb['summary']  # select the worksheet
    ws.append(['歌名', '歌手', '链接', '时长'])
    wb.save('anothertest.xlsx')
    wb.close()


class kugouSpider():
    """Scrape one Kugou ranking page and append its rows to ``anothertest.xlsx``.

    Args:
        url: Address of the ranking page to fetch.
        headers: HTTP request headers; defaults to the class-level ``headers``.
    """

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    }

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def get_html(self):
        """Fetch ``self.url`` and return the response body as text."""
        # A timeout keeps the crawl from hanging forever on a stalled connection.
        response = requests.get(url=self.url, headers=self.headers, timeout=10)
        return response.text

    def parsel_data(self):
        """Parse the ranking list out of the fetched page.

        Returns:
            An iterator of 4-tuples ``(first title part, second title part,
            link, duration)``, one per ``<li>`` entry in the ranking list.
        """
        songNames = []
        singers = []
        playPages = []
        ttimes = []
        selector = parsel.Selector(self.get_html())
        results = selector.xpath('//div[@id="rankWrap"]/div[2]/ul/li')
        for item in results:
            # Fall back to '' so a missing attribute or a title without '-'
            # yields empty fields instead of AttributeError / IndexError.
            title = item.xpath('.//a/@title').get() or ''
            parts = title.split('-')
            # NOTE(review): the part before '-' is stored under the '歌名' header,
            # yet the reader helpers in this article treat that column as singer
            # names -- presumably the title is "singer - song"; confirm.
            # NOTE(review): names that themselves contain '-' are cut short here.
            songNames.append(parts[0])
            singers.append(parts[1] if len(parts) > 1 else '')
            playPages.append(item.xpath('.//a/@href').get())
            ttime = item.xpath('.//span[@class="pc_temp_time"]/text()').get() or ''
            ttimes.append(ttime.strip())
        return zip(songNames, singers, playPages, ttimes)

    def save_to_excel(self):
        """Append every parsed row to the ``summary`` sheet of ``anothertest.xlsx``."""
        wb = openpyxl.load_workbook('anothertest.xlsx')
        ws = wb['summary']
        for songName, singer, playPage, ttime in self.parsel_data():
            print(songName, singer, playPage, ttime, sep=' | ')
            ws.append([songName, singer, playPage, ttime])
        # Saved once per scraped page, so earlier pages survive a later failure.
        wb.save('anothertest.xlsx')
        wb.close()

    def run(self):
        """Scrape the page and persist its rows."""
        self.save_to_excel()


if __name__ == "__main__":
    # Two separate setup functions keep the demo steps easy to follow.
    create_workbook()  # create the workbook
    open_workbook_and_write_header()  # write the header row
    for page in range(1, 20+1):
        url = f'https://www.kugou.com/yy/rank/home/{page}-8888.html?from=rank'
        app = kugouSpider(url=url)
        app.run()
个人感觉,保存到excel不像保存到csv那么简单。excel有打开、保存的过程,一不小心数据就没保存上,或者翻页爬取时只保存了最后一页的数据;而csv只要用a+模式打开文件追加写入,就可以完美解决翻页的问题。
如果执着于xlsx格式,又想简单地把数据写入并保存到本地,可以先将数据保存成csv,然后用pandas将其转换成excel文件。
数据转换部分:从excel读取数据可以用xlrd或pandas,两种方法都贴一下,很简单。(注意:xlrd 2.0及以上版本只支持.xls文件,读取.xlsx需要使用旧版xlrd,或改用pandas/openpyxl。)
def singer_process_by_xlrd():
    """
    Read ``anothertest.xlsx`` with xlrd and return, as a list, the
    first-column value of every row below the header (used as singer names).
    """
    # NOTE(review): xlrd 2.0+ reportedly reads only .xls files -- confirm the
    # installed xlrd version can still open this .xlsx workbook.
    book = xlrd.open_workbook('anothertest.xlsx')
    sheet = book.sheet_by_name('summary')
    # Row 0 holds the header, so iteration starts at row 1; column 0 is the
    # first column of the sheet.
    return [sheet.cell(row_idx, 0).value for row_idx in range(1, sheet.nrows)]
上面的是xlrd的读取excel表格。下面用pandas。
def singer_process_by_pandas():
    """
    Read ``anothertest.xlsx`` with pandas and return the values of the
    '歌名' column as a list (used as singer names).
    """
    # Load the workbook and pull the column out in a single expression.
    return pd.read_excel('anothertest.xlsx')['歌名'].tolist()
数据处理完了,现在来进行绘制词云:
第一种是用stylecloud制作:需要先用jieba进行分词,然后再进行绘制:
def gen_style_words():
    """Render a stylecloud image, ``str.png``, from the scraped names.

    The names come from ``singer_process_by_pandas()``. They are cleaned with
    a regex, segmented with jieba, and passed to stylecloud as one
    space-separated string.
    """
    wordsList = singer_process_by_pandas()  # list of cell values
    # str() guards against non-string cells (e.g. NaN or numeric values),
    # which would make str.join raise TypeError.
    wordsStr = ' '.join(str(word) for word in wordsList)
    # Strip ASCII letters, digits and the listed punctuation before segmenting.
    cleanedStr = re.sub(r"[A-Z0-9-a-zăâ、()\!\%\[\]\,\。]", "", wordsStr)
    print(cleanedStr)
    # jieba.cut returns a generator of tokens. Join them with spaces: joining
    # with '' would just rebuild the unsegmented string and make the
    # segmentation step a no-op. Whitespace-only tokens are dropped.
    tokens = (token for token in jieba.cut(cleanedStr) if token.strip())
    newResultsStr = ' '.join(tokens)
    print(newResultsStr)

    # NOTE(review): stopwords.txt used to be read here but was never passed on
    # (custom_stopwords was commented out), so the read could only fail. To
    # filter stop words, load the file and pass custom_stopwords=<list> below.
    stylecloud.gen_stylecloud(
        text=newResultsStr,
        size=1280,
        font_path=r'C:\Windows\Fonts\simhei.ttf',
        max_font_size=200,
        max_words=150,
        output_name='str.png',
    )
第二种是wordcloud。wordcloud生成词云时只需要一个str,所以将列表转换成str后直接进行绘制。
def word_cloud_style():
    """
    Draw a word cloud of the scraped names with wordcloud, shaped by the mask
    image ``wc.jpg``; save it as ``wordcloud.jpg`` and display it.

    jieba is not used here: wordcloud only needs a single string. For more
    precise segmentation, run the text through jieba first and then hand the
    space-joined result to wordcloud.
    """
    singers = singer_process_by_xlrd()  # list of cell values
    # Join with spaces so each name stays a separate token; with '' adjacent
    # names can fuse into one long word. str() guards against non-string cells.
    joined = ' '.join(str(name) for name in singers)
    singerStr = re.sub(r"[A-Z0-9-a-zăâ、()\!\%\[\]\,\。]", "", joined)
    print(singerStr)
    # Close the image file as soon as its pixels are copied into the array.
    with Image.open('wc.jpg') as img:
        img_array = np.array(img)
    wordcloud = WordCloud(
        background_color='white',
        font_path=r'C:\Windows\Fonts\simhei.ttf',
        width=1280,
        height=960,
        max_words=150,
        max_font_size=200,
        scale=10,
        margin=2,
        mask=img_array,
        collocations=False,
    ).generate(singerStr)
    # Write the file before plt.show(): show() blocks until the window is
    # closed, and the image should exist on disk regardless.
    wordcloud.to_file('wordcloud.jpg')
    plt.imshow(wordcloud)
    plt.axis('Off')
    plt.show()
调用函数就可以绘制。
效果图: