zoukankan      html  css  js  c++  java
  • 爬取酷狗榜单并可视化词云 Python

    爬取酷狗音乐榜单并做一个词云展示。分为两部分:爬虫部分和可视化部分。爬虫部分用openpyxl保存数据,因为这一块一直是我的薄弱环节,借此练习一下用面向对象的方式把数据保存到excel里。

     1 import openpyxl
     2 import requests
     3 import parsel
     4 
     5 
     def create_workbook(filename='202201122test.xlsx'):
         """Create a new workbook that contains a single empty 'summary' sheet.

         Args:
             filename: Path the workbook is saved to. The default is the file
                 name this script has always written.
         """
         wb = openpyxl.Workbook()  # create the workbook
         wb.create_sheet(title='summary', index=0)
         wb.remove(wb['Sheet'])  # delete the original default sheet
         # Persist the workbook before releasing it.
         wb.save(filename)
         wb.close()
    12 
    13 
    14 # 打开工作簿并写入头
    def open_workbook_and_write_header(src='202201122test.xlsx', dst='anothertest.xlsx'):
        """Load a workbook, append the header row to 'summary', save a copy.

        Args:
            src: Path of the workbook to load.
            dst: Path the workbook with the header row is saved to.
        """
        wb = openpyxl.load_workbook(src)
        ws = wb['summary']  # select the worksheet
        ws.append(['歌名', '歌手', '链接', '时长'])
        # Persist the workbook before releasing it.
        wb.save(dst)
        wb.close()
    21 
    22 
    class kugouSpider():
        """Scrape one rank page and append its rows to 'anothertest.xlsx'.

        Each row is a 4-tuple ``(songName, singer, playPage, ttime)`` where the
        first two fields are the pieces of the link's ``title`` attribute
        before and after the first hyphen.

        NOTE(review): the reader functions in this file take singer names from
        the first column, so the title may actually be "singer - song" --
        confirm against the live page before relying on the field names.
        """

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        }
        # Class-level fallback so instances built without __init__ still have it.
        timeout = 10

        def __init__(self, url, headers=headers, timeout=10):
            """Store the request settings.

            Args:
                url: Address of the rank page to scrape.
                headers: HTTP headers sent with the request; defaults to the
                    class-level browser user-agent.
                timeout: Seconds to wait for the server before giving up.
            """
            self.url = url
            self.headers = headers
            self.timeout = timeout

        def get_html(self):
            """Download the page and return its text.

            Raises:
                requests.RequestException: On connection problems, a timeout,
                    or a 4xx/5xx response.
            """
            response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout)
            # Fail loudly instead of parsing an error page into zero rows.
            response.raise_for_status()
            return response.text

        @staticmethod
        def _split_title(title):
            """Return the first two hyphen-separated pieces of *title*.

            The pieces are returned unmodified (surrounding whitespace kept).
            A title without a hyphen yields the whole title and ''.
            """
            parts = title.split('-')
            return parts[0], (parts[1] if len(parts) > 1 else '')

        def parsel_data(self):
            """Fetch the page and return an iterator of row tuples.

            Returns:
                An iterator over ``(songName, singer, playPage, ttime)`` tuples,
                one per list item that carries a link title.
            """
            rows = []
            selector = parsel.Selector(self.get_html())
            for item in selector.xpath('//div[@id="rankWrap"]/div[2]/ul/li'):
                title = item.xpath('.//a/@title').get()
                if title is None:
                    # No titled link in this list item: nothing to record.
                    continue
                songName, singer = self._split_title(title)
                playPage = item.xpath('.//a/@href').get()
                # A missing duration becomes '' instead of raising on None.
                ttime = item.xpath('.//span[@class="pc_temp_time"]/text()').get(default='').strip()
                rows.append((songName, singer, playPage, ttime))
            return iter(rows)

        def save_to_excel(self):
            """Append this page's rows to the 'summary' sheet and save the file."""
            wb = openpyxl.load_workbook('anothertest.xlsx')
            ws = wb['summary']
            for songName, singer, playPage, ttime in self.parsel_data():
                print(songName, singer, playPage, ttime, sep=' | ')
                ws.append([songName, singer, playPage, ttime])
            # Save once per scraped page, then release the workbook.
            wb.save('anothertest.xlsx')
            wb.close()

        def run(self):
            """Scrape the page and store the result."""
            self.save_to_excel()
    71 
    if __name__ == "__main__":
        # Setup is split into two functions so each step of the demo is visible.
        create_workbook()  # create the workbook
        open_workbook_and_write_header()  # write the header row
        # Rank pages are numbered 1 through 20.
        for page in range(1, 21):
            url = f'https://www.kugou.com/yy/rank/home/{page}-8888.html?from=rank'
            app = kugouSpider(url=url)
            app.run()

    个人感觉,保存到excel比保存到csv麻烦不少。excel有打开和保存的过程,一不小心就会没保存到数据,或者在翻页爬取时只保存了最后一页的数据;csv则只要用a+模式打开文件,就可以完美解决翻页的问题。

    如果偏执xlsx格式文件而又想简单的写入数据保存到本地,可以先将数据保存成csv,然后用pandas将其转换成excel文件。

    数据转换部分:从excel读取数据可以用xlrd或pandas,两种方法都贴一下,很简单。(注意:xlrd 2.0及以上版本只支持.xls格式,读取.xlsx需要使用xlrd 1.2.0,或者改用openpyxl/pandas。)

     def singer_process_by_xlrd():
         """Read 'anothertest.xlsx' with xlrd and return first-column values.

         Returns the value of column 0 for every row of the 'summary' sheet
         below the header row, as a list.

         NOTE(review): callers treat these values as singer names, although
         the header row written by this script labels the first column
         '歌名' -- confirm which field the column really holds.
         NOTE(review): xlrd 2.x reportedly reads only .xls files -- confirm
         the installed xlrd version can open this .xlsx file.
         """
         workbook = xlrd.open_workbook('anothertest.xlsx')
         sheet = workbook.sheet_by_name('summary')
         # Row 0 is the header, so start from row 1.
         return [sheet.cell(row_index, 0).value for row_index in range(1, sheet.nrows)]

    上面的是xlrd的读取excel表格。下面用pandas。

    def singer_process_by_pandas():
        """Read 'anothertest.xlsx' with pandas and return the '歌名' column.

        NOTE(review): callers treat these values as singer names even though
        the column is labelled '歌名' -- confirm which field it really holds.
        """
        frame = pd.read_excel('anothertest.xlsx')  # load the first sheet
        column = frame['歌名']
        return column.tolist()

    数据处理完了,现在来进行绘制词云:

    第一种是用stylecloud制作:需要先用jieba进行分词,然后再进行绘制:

     def gen_style_words(font_path=r'C:\Windows\Fonts\simhei.ttf', stopwords_path=None):
         """Segment the names with jieba and render them to 'str.png'.

         Args:
             font_path: Font file used to draw the words; it must contain the
                 glyphs of the text (the default is a Windows font path).
             stopwords_path: Optional UTF-8 text file with one stop word per
                 line. When given, its words are passed to stylecloud as
                 ``custom_stopwords``; when None, no file is read.
         """
         # Keep only text cells; digits are stripped by the regex below anyway.
         names = [name for name in singer_process_by_pandas() if isinstance(name, str)]
         text = ' '.join(names)
         # Join the jieba tokens with spaces: joining with '' would rebuild the
         # input string and throw the segmentation away.
         segmented = ' '.join(jieba.cut(text))
         # Remove Latin letters, digits and the listed punctuation marks.
         cleaned = re.sub(r"[A-Z0-9-a-zăâ、()\!\%\[\]\,\。]", "", segmented)
         print(cleaned)

         options = {
             'text': cleaned,
             'size': 1280,
             'font_path': font_path,
             'max_font_size': 200,
             'max_words': 150,
             'output_name': 'str.png',
         }
         if stopwords_path is not None:
             with open(stopwords_path, mode='r', encoding='utf-8') as f:
                 options['custom_stopwords'] = f.read().split('\n')
         stylecloud.gen_stylecloud(**options)

    第二种是wordcloud,wordcloud分词的时候,只需要一个str,所以将列表转换成str直接进行绘制。

     def word_cloud_style(font_path=r'C:\Windows\Fonts\simhei.ttf'):
         """Render the names as a word cloud shaped by 'wc.jpg'.

         No jieba segmentation is done here: WordCloud only needs one string.
         For finer tokens, segment the text with jieba before generating.
         The image is written to 'wordcloud.jpg' and then displayed.

         Args:
             font_path: Font file used to draw the words; it must contain the
                 glyphs of the text (the default is a Windows font path).
         """
         # Keep only text cells; digits are stripped by the regex below anyway.
         singers = [name for name in singer_process_by_xlrd() if isinstance(name, str)]
         # Separate the names with spaces so each one is its own token.
         singerStr = re.sub(r"[A-Z0-9-a-zăâ、()\!\%\[\]\,\。]", "", ' '.join(singers))
         print(singerStr)
         # Close the mask image as soon as its pixels have been copied.
         with Image.open('wc.jpg') as img:
             img_array = np.array(img)
         wordcloud = WordCloud(
             background_color='white',
             font_path=font_path,
             width=1280,
             height=960,
             max_words=150,
             max_font_size=200,
             scale=10,
             margin=2,
             mask=img_array,
             collocations=False,
         ).generate(singerStr)
         # Write the file first: plt.show() blocks until the window is closed.
         wordcloud.to_file('wordcloud.jpg')
         plt.imshow(wordcloud)
         plt.axis('Off')
         plt.show()

    调用函数就可以绘制。

    效果图:

  • 相关阅读:
    js伪数组转数组内部实现
    Vuex核心部分学习参考地址
    vue中让异步代码变成同步的写法
    node.js中文件操作路径和模板标识路径问题
    如果不想安装cnpm又想使用淘宝的服务器来下载,怎么做?
    npm常用命令
    node中模块加载机制
    通过nodejs,简单模拟客户端和服务端进行通信
    vue中非父子组件的传值
    图论1-2
  • 原文地址:https://www.cnblogs.com/mafu/p/15791791.html
Copyright © 2011-2022 走看看