zoukankan      html  css  js  c++  java
  • 爬虫

    利用爬虫爬取数据,并进行词频统计

    import re
    from collections import Counter

    import jieba
    import requests

    # Download one listing page and decode it as GBK before reading the text.
    response = requests.get(
        'http://www.haha56.net/xiaohua/gushi/list_1_2.html',
        timeout=10,  # fail instead of hanging forever on a stalled server
    )
    response.encoding = 'gbk'
    data = response.text

    # Collect the inner text of every <dd class="preview"> element.
    content_res = re.findall('<dd class="preview">(.*?)</dd>', data)

    # Join the fragments instead of calling str() on the list: the list repr
    # would add brackets, quotes, commas and escape sequences to the text, and
    # those would then be tokenized and counted as if they were words.
    res = ''.join(content_res)
    res_cut = jieba.lcut(res)

    # Count tokens, ignoring single-character tokens and the literal '...'.
    dic = Counter(i for i in res_cut if len(i) != 1 and i != '...')

    # (word, count) pairs, most frequent first.
    dic_list = dic.most_common()
    print(dic_list)
    

    词云

    import re

    import requests
    import wordcloud

    # Download one listing page and decode it as GBK before reading the text.
    response = requests.get(
        'http://www.haha56.net/xiaohua/gushi/list_1_2.html',
        timeout=10,  # fail instead of hanging forever on a stalled server
    )
    response.encoding = 'gbk'
    data = response.text

    # Collect the inner text of every <dd class="preview"> element and merge
    # the fragments into one string for the word cloud.
    content_res = re.findall('<dd class="preview">(.*?)</dd>', data)
    res = ''.join(content_res)

    # The original literal 'C:WindowsFontsLHANDW' had no path separators or
    # file extension, so it could not point at a font file.
    # NOTE(review): presumably the Lucida Handwriting file in the Windows font
    # directory was intended; confirm it exists on the target machine and that
    # it contains glyphs for the scraped text, or substitute another font.
    w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\LHANDW.TTF')
    w.generate(res)
    w.to_file("ciyun.png")
    

    爬取图片

    import re

    import requests

    # Download the gallery index page.
    response = requests.get(
        'http://www.nipic.com/design/acg/renwu/index.html?page=1',
        timeout=10,  # fail instead of hanging forever on a stalled server
    )
    data = response.text

    # Image addresses are taken from the page's data-src attributes.
    img_url_res = re.findall('data-src="(.*?)"', data)
    for i in img_url_res:
        # Name the local file after the last path segment of the image URL.
        img_name = i.split('/')[-1]
        if not img_name:
            # A URL ending in '/' yields no usable file name; skip it.
            continue

        img_response = requests.get(i, timeout=10)
        img_data = img_response.content

        # The context manager closes (and so flushes) the file on every
        # iteration, so no handles are leaked and no explicit flush is needed.
        with open(img_name, 'wb') as f:
            f.write(img_data)
    

    爬取视频

    import re

    import requests

    # Download the video index page and collect every link target on it.
    response = requests.get('http://www.mod.gov.cn/v/index.htm', timeout=10)
    data = response.text

    mp4_res2 = re.findall('<a href="(.*?)">', data)

    for i in mp4_res2:  # type:str
        # Keep the href up to its first 'htm'; links without one are skipped
        # rather than raising IndexError.
        page_matches = re.findall('(.*?htm)', i)
        if not page_matches:
            continue
        res = 'http://www.mod.gov.cn/v/' + page_matches[0]

        page_response = requests.get(res, timeout=10)
        page_data = page_response.text

        # The video address follows a '//Video ' marker in the page source.
        # Pages without such a marker are skipped rather than raising
        # IndexError. The dot before 'mp4' is escaped so that it only matches
        # a literal '.'.
        url_matches = re.findall(r'//Video (.*?\.mp4)', page_data)
        if not url_matches:
            continue
        url_res = url_matches[0]

        mp4_response = requests.get(url_res, timeout=30)
        mp4_data = mp4_response.content

        # Name each file after the last path segment of its URL; writing every
        # download to the same 'test.mp4' kept only the last video.
        mp4_name = url_res.split('/')[-1]
        with open(mp4_name, 'wb') as f:
            f.write(mp4_data)
    
  • 相关阅读:
    [HAOI 2007]上升序列
    转载:分布式与集群的区别究竟是什么?
    转载:5个顶级异步Python框架 https://geekflare.com/?s=python
    代码走读 airflow
    走读中学到的技巧 airflow
    sqlalchemy 相关
    pandas 筛选
    pandas IO
    服务端高并发分布式架构演进之路 转载,原文地址:https://segmentfault.com/a/1190000018626163
    pandas 6 时间
  • 原文地址:https://www.cnblogs.com/yushan1/p/11228055.html
Copyright © 2011-2022 走看看