zoukankan      html  css  js  c++  java
  • 爬虫

    利用爬虫爬出数据,词频统计

    import re
    from collections import Counter

    import jieba
    import requests

    # Word-frequency count over the joke previews scraped from one list page.
    PAGE_URL = 'http://www.haha56.net/xiaohua/gushi/list_1_2.html'

    response = requests.get(PAGE_URL, timeout=10)
    # Fail loudly on an HTTP error instead of counting words of an error page.
    response.raise_for_status()
    # Override the encoding guessed by requests so .text is decoded as GBK.
    response.encoding = 'gbk'
    data = response.text

    # Each preview snippet lives in a <dd class="preview"> element.
    content_res = re.findall(r'<dd class="preview">(.*?)</dd>', data)

    # Join the snippets themselves rather than taking str() of the list:
    # the list repr would feed brackets, quotes and escape sequences to the
    # tokenizer. A newline separator keeps words from fusing across snippets
    # (single-character tokens are dropped below anyway).
    res = '\n'.join(content_res)
    res_cut = jieba.lcut(res)

    # Ignore single-character tokens and the '...' truncation marker.
    dic = Counter(
        word for word in res_cut
        if len(word) > 1 and word != '...'
    )

    # (word, count) pairs, most frequent first.
    dic_list = dic.most_common()
    print(dic_list)
    

    词云

    import re

    import requests
    import wordcloud

    # Render a word-cloud image from the joke previews of one list page.
    PAGE_URL = 'http://www.haha56.net/xiaohua/gushi/list_1_2.html'

    response = requests.get(PAGE_URL, timeout=10)
    # Fail loudly on an HTTP error instead of rendering an error page.
    response.raise_for_status()
    # Override the encoding guessed by requests so .text is decoded as GBK.
    response.encoding = 'gbk'
    data = response.text

    content_res = re.findall(r'<dd class="preview">(.*?)</dd>', data)
    res = ''.join(content_res)

    # The path separators and file extension were missing from the original
    # literal ('C:WindowsFontsLHANDW'), which is not a loadable font file.
    # NOTE(review): LHANDW.TTF (Lucida Handwriting) presumably has no CJK
    # glyphs; if Chinese text renders as boxes, point this at a Chinese font
    # such as C:\Windows\Fonts\simhei.ttf -- confirm on the target machine.
    w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\LHANDW.TTF')
    w.generate(res)
    w.to_file("ciyun.png")
    

    爬取图片

    import re
    from urllib.parse import urljoin, urlsplit

    import requests

    # Download every image referenced by a data-src attribute on the page
    # into the current working directory.
    PAGE_URL = 'http://www.nipic.com/design/acg/renwu/index.html?page=1'

    response = requests.get(PAGE_URL, timeout=10)
    response.raise_for_status()
    data = response.text

    img_url_res = re.findall('data-src="(.*?)"', data)
    for i in img_url_res:
        # Resolve relative / protocol-relative URLs against the page URL;
        # absolute URLs pass through unchanged.
        img_url = urljoin(PAGE_URL, i)

        # Use the last path segment as the file name, without any query string.
        img_name = urlsplit(img_url).path.rsplit('/', 1)[-1]
        if not img_name:
            continue

        img_response = requests.get(img_url, timeout=10)
        if not img_response.ok:
            # Do not save an HTML error page under an image file name.
            print('skip', img_url, img_response.status_code)
            continue

        # The context manager flushes and closes the file for every image.
        with open(img_name, 'wb') as f:
            f.write(img_response.content)
    

    爬取视频

    import re
    from urllib.parse import urljoin, urlsplit

    import requests

    # Follow every .htm link on the index page, look for the "//Video <url>.mp4"
    # marker in the linked page, and download the referenced video.
    INDEX_URL = 'http://www.mod.gov.cn/v/index.htm'

    response = requests.get(INDEX_URL, timeout=10)
    response.raise_for_status()
    data = response.text

    mp4_res2 = re.findall('<a href="(.*?)">', data)

    for i in mp4_res2:  # type: str
        # Keep the href up to and including the first "htm"; links without
        # one are not detail pages, so skip them instead of raising.
        page_match = re.search(r'.*?htm', i)
        if page_match is None:
            continue
        # urljoin resolves relative hrefs under /v/ exactly like the previous
        # string concatenation, and also handles absolute hrefs correctly.
        res = urljoin(INDEX_URL, page_match.group(0))

        page_response = requests.get(res, timeout=10)
        if not page_response.ok:
            continue

        # Pages without a video marker are skipped rather than crashing.
        video_match = re.search(r'//Video (.*?\.mp4)', page_response.text)
        if video_match is None:
            continue
        url_res = urljoin(res, video_match.group(1))

        mp4_response = requests.get(url_res, timeout=60)
        if not mp4_response.ok:
            print('skip', url_res, mp4_response.status_code)
            continue

        # Name each file after the video URL so downloads no longer overwrite
        # one another; fall back to the old fixed name if the URL has no
        # usable last path segment.
        mp4_name = urlsplit(url_res).path.rsplit('/', 1)[-1] or 'test.mp4'
        with open(mp4_name, 'wb') as f:
            f.write(mp4_response.content)
    
  • 相关阅读:
    字节流
    A、B
    rollup
    使用nodejs提供动态javascript文件
    nodejs服务器部署
    A js 中加载Bjs

    01月05日22:14:32 学习进度笔记
    01月07日19:10:50 学习进度笔记
    01月07日18:53:49 学习进度笔记
  • 原文地址:https://www.cnblogs.com/yushan1/p/11228055.html
Copyright © 2011-2022 走看看