zoukankan      html  css  js  c++  java
  • 爬虫

    利用爬虫爬取数据,并进行词频统计

    import requests
    import re
    import jieba
    
    from collections import Counter

    # Fetch one listing page, extract the text inside every
    # <dd class="preview"> element, and print a word-frequency table.
    response = requests.get(
        'http://www.haha56.net/xiaohua/gushi/list_1_2.html', timeout=10)
    # Fail loudly on 4xx/5xx instead of counting the words of an error page.
    response.raise_for_status()
    # Decode the body as GBK rather than relying on the detected encoding.
    response.encoding = 'gbk'
    data = response.text

    # re.S lets '.' match newlines, so a preview spanning several lines
    # is still captured.
    content_res = re.findall(r'<dd class="preview">(.*?)</dd>', data, re.S)

    # Join the raw strings before segmenting. Passing str(content_res) would
    # tokenize the list's repr, leaking brackets, quotes and escape sequences
    # (e.g. "\u3000") into the counts. The newline separator keeps the end of
    # one preview from fusing with the start of the next into a bogus word.
    res = '\n'.join(content_res)
    res_cut = jieba.lcut(res)

    # Ignore single-character tokens and the literal '...' token
    # (presumably the marker of a truncated preview).
    dic = Counter(
        word for word in res_cut
        if len(word) > 1 and word != '...'
    )

    # (word, count) pairs ordered from most to least frequent.
    dic_list = dic.most_common()
    print(dic_list)
    

    词云

    import requests
    import re
    import wordcloud
    
    # Fetch one listing page, extract the text inside every
    # <dd class="preview"> element, and render it as a word-cloud image.
    response = requests.get(
        'http://www.haha56.net/xiaohua/gushi/list_1_2.html', timeout=10)
    # Fail loudly on 4xx/5xx instead of drawing a cloud of an error page.
    response.raise_for_status()
    # Decode the body as GBK rather than relying on the detected encoding.
    response.encoding = 'gbk'
    data = response.text

    # re.S lets '.' match newlines, so a preview spanning several lines
    # is still captured.
    content_res = re.findall(r'<dd class="preview">(.*?)</dd>', data, re.S)
    res = ''.join(content_res)

    # The font path had lost its backslashes ('C:WindowsFontsLHANDW'), which
    # is not a valid location for a font file.
    # NOTE(review): the file name/extension LHANDW.TTF is assumed — confirm it
    # exists on the target machine. If Chinese words render as empty boxes,
    # switch to a font that contains CJK glyphs.
    w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\LHANDW.TTF')
    w.generate(res)
    w.to_file("ciyun.png")
    

    爬取图片

    import requests
    import re
    
    from urllib.parse import urljoin

    # Download every image referenced by a data-src attribute on one listing
    # page and save each one in the current directory under its URL file name.
    page_url = 'http://www.nipic.com/design/acg/renwu/index.html?page=1'
    response = requests.get(page_url, timeout=10)
    response.raise_for_status()
    data = response.text

    img_url_res = re.findall('data-src="(.*?)"', data)
    for i in img_url_res:
        # Resolve protocol-relative ("//host/a.jpg") and relative URLs against
        # the page URL; absolute URLs pass through unchanged.
        img_url = urljoin(page_url, i)
        img_name = img_url.split('/')[-1]
        if not img_name:
            # URL ends with '/', so there is no file name to save under.
            continue

        img_response = requests.get(img_url, timeout=10)
        if not img_response.ok:
            # Do not store an HTTP error page as if it were an image.
            continue

        # The context manager flushes and closes the file even if the write
        # fails; the original left every handle open.
        with open(img_name, 'wb') as f:
            f.write(img_response.content)
    

    爬取视频

    import requests
    import re
    
    import os
    from urllib.parse import urljoin, urlsplit

    # Walk the links on the video index page, open each linked .htm page,
    # look for a "//Video <url>.mp4" marker in it, and download that video.
    response = requests.get('http://www.mod.gov.cn/v/index.htm', timeout=10)
    response.raise_for_status()
    data = response.text

    mp4_res2 = re.findall('<a href="(.*?)">', data)

    seen_pages = set()
    for i in mp4_res2:  # type:str
        # Links without "htm" are not detail pages; the original indexed [0]
        # on an empty result here and crashed with IndexError.
        page_match = re.search('(.*?htm)', i)
        if page_match is None:
            continue

        # urljoin handles relative, root-relative and absolute hrefs, unlike
        # plain string concatenation with the base URL.
        res = urljoin('http://www.mod.gov.cn/v/', page_match.group(1))
        if res in seen_pages:
            # The same page can be linked several times; fetch it only once.
            continue
        seen_pages.add(res)

        page_response = requests.get(res, timeout=10)
        if not page_response.ok:
            continue

        # The dot before "mp4" is escaped so that only a real ".mp4" suffix
        # matches. Pages without a video marker are skipped.
        video_match = re.search(r'//Video (.*?\.mp4)', page_response.text)
        if video_match is None:
            continue
        url_res = urljoin(res, video_match.group(1))

        mp4_response = requests.get(url_res, timeout=60)
        if not mp4_response.ok:
            # Do not store an HTTP error page as if it were a video.
            continue

        # Name each file after its URL so videos no longer overwrite one
        # another; the original wrote every video to 'test.mp4'.
        mp4_name = os.path.basename(urlsplit(url_res).path) or 'test.mp4'
        # The context manager closes the file even if the write fails.
        with open(mp4_name, 'wb') as f:
            f.write(mp4_response.content)
    
  • 相关阅读:
    osg::PagedLOD example
    osg::NodeVisitor example
    osg::NodeVisitor
    osg::NodeVisitor osg3.4.0
    Visual studio 正在从以下位置加载符号:Microsoft符号服务器 尝试取消禁用后续符号加载
    osgViewer::Viewer::Windows
    Inventor2018专业版软件安装激活教程
    osg osgUtil::LineSegmentIntersector
    Civil 3D百度云地址
    osg define shape(create box)
  • 原文地址:https://www.cnblogs.com/yushan1/p/11228055.html
Copyright © 2011-2022 走看看