zoukankan      html  css  js  c++  java
  • CSIC_716_20191028【爬小破站】

    1、爬取小破站的弹幕

    2、展示爬取内容

    打开网页,按照教程中的方法找到视频的 cid 和请求头(header)

    import re

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    # Step 1: download the danmaku (bullet comments) of one video, keep only
    # the Chinese text, and save it to sign.txt for the word-cloud step.

    # Request headers; the User-Agent can be copied from the "Request Headers"
    # panel of the Chrome DevTools Network tab.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    }

    # Danmaku XML endpoint; cid=125507930 is used as the example here.
    url = 'http://comment.bilibili.com/125507930.xml'

    # Fetch the XML. Fail loudly on HTTP errors instead of parsing an error page.
    response = requests.get(url=url, headers=header, timeout=10)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    data = response.text

    # Parse the document; every comment is stored in a <d> element.
    soup = BeautifulSoup(data, 'lxml')
    dlist = soup.find_all('d')

    # One record per comment, collected into d_list.
    d_list = [{'弹幕': d.text} for d in dlist]

    # Passing `columns` keeps the '弹幕' column present even when no comment
    # was found, so the lookup below cannot raise KeyError.
    df = pd.DataFrame(d_list, columns=['弹幕'])

    # Keep only runs of Chinese characters; compile the pattern once, not per row.
    pat = re.compile(r'[一-龥]+')
    filter_data = [" ".join(pat.findall(p)) for p in df['弹幕'].values]

    # Save the filtered text to sign.txt. Comments are separated by a space so
    # that the last word of one comment does not fuse with the first of the next.
    with open('sign.txt', 'w', encoding='utf-8') as f:
        f.write(" ".join(filter_data))

    ------------------------------------------------------------------------------------------------------------------------------------------------------

    利用上面生成的 sign.txt 文件生成词云并进行展示

    import jieba
    import matplotlib.pyplot as plt
    from imageio import imread
    from wordcloud import WordCloud

    # Step 2: turn the text saved in sign.txt into a word-cloud image.

    # Read the filtered danmaku text; `with` closes the file even on error.
    with open('sign.txt', encoding='utf-8') as f:
        data = f.read()

    # WordCloud splits on whitespace, so segment the Chinese text with jieba
    # and re-join the words with spaces.
    result = " ".join(jieba.lcut(data))

    # Placeholder: replace 'XXXX.jpg' with the image whose shape masks the cloud.
    mask_color = imread('XXXX.jpg')

    wc = WordCloud(
        # Placeholder: replace with the path of a font file that contains
        # Chinese glyphs.
        font_path='font的路径',
        mask=mask_color,
        # Same canvas size as the merged script below.
        # NOTE(review): per the wordcloud docs the mask's shape takes
        # precedence over width/height when a mask is given - confirm.
        width=1920,
        height=1080,
    )
    wc.generate(result)

    # Placeholder output name; the image is also shown in a matplotlib window.
    wc.to_file('xxxx.png')
    plt.imshow(wc)
    plt.show()

    ----------------------------------------------------以下为正式代码将两者合并--------------------------------------------------------------------------------------------------

    如果不生成中间文件,爬完网页直接生成图片,代码合并,如下所示

     # coding: utf-8
     # Download the danmaku (bullet comments) of one video and render the
     # Chinese text as a word cloud, without writing an intermediate file.
     import re

     import jieba
     import matplotlib.pyplot as plt
     import pandas as pd
     import requests
     from bs4 import BeautifulSoup
     from imageio import imread
     from wordcloud import WordCloud

     header = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
     }
     # Change the numeric cid to scrape a different video.
     url = 'http://comment.bilibili.com/122593266.xml'

     # Fetch the XML. Fail loudly on HTTP errors instead of parsing an error page.
     response = requests.get(url=url, headers=header, timeout=10)
     response.raise_for_status()
     response.encoding = response.apparent_encoding
     data = response.text

     # Every comment is stored in a <d> element.
     soup = BeautifulSoup(data, 'lxml')
     list_filter = soup.find_all('d')
     plist = [{'弹幕': d.text} for d in list_filter]

     # Passing `columns` keeps the '弹幕' column present even when no comment
     # was found, so the lookup below cannot raise KeyError.
     df = pd.DataFrame(plist, columns=['弹幕'])

     # Keep only runs of Chinese characters; compile the pattern once, not per row.
     txtfilter = re.compile(r'[一-龥]+')
     reslist = [" ".join(txtfilter.findall(p)) for p in df['弹幕'].values]
     result = " ".join(reslist)

     # WordCloud splits on whitespace, so segment the Chinese text with jieba
     # and re-join the words with spaces.
     finalResult = " ".join(jieba.lcut(result))

     # The mask image can be replaced freely; keep it in the project directory.
     mask_color = imread('五角星.jpg')
     wc = WordCloud(
         # A font with Chinese glyphs. The backslashes were missing from the
         # original listing, which made the path invalid.
         font_path=r'C:\Windows\Fonts\STLITI.TTF',
         mask=mask_color,
         width=1920,
         height=1080,
         background_color='white',
     )
     wc.generate(finalResult)

     # Save the image and also show it in a matplotlib window.
     wc.to_file('hunt.png')
     plt.imshow(wc)
     plt.show()
  • 相关阅读:
    Asp.Net构架(Http请求处理流程)、(Http Handler 介绍)、(HttpModule 介绍)
    JQuery中的事件(三)
    关于asp.net mvc中的httpModules 与 httpHandler
    jQuery中的CSS(二)
    JQuery选择器(一)
    JavaScript中利用Ajax 实现客户端与服务器端通信(九)
    JavaScriptDom操作与高级应用(八)
    oracle(二)V$lock 视图中ID1 , ID2 列的含义
    关于static、内部类
    oracle(一)复习起航
  • 原文地址:https://www.cnblogs.com/csic716/p/11755298.html
Copyright © 2011-2022 走看看