zoukankan      html  css  js  c++  java
  • 用 Python 爬取校花网视频

    import requests
    import re
    import hashlib,time
    
    def get_index(url, timeout=10):
        """Download an index page and return its HTML text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body as text when the server answers with status
            200; ``None`` for any other status or when the request itself
            fails (connection error, timeout, ...).
        """
        try:
            # Without a timeout, requests may block forever on a stalled server.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print('Request failed:%s (%s)' % (url, exc))
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_index(res):
        """Extract the detail-page links from an index page.

        Args:
            res: HTML text of an index page, or ``None`` when the page could
                not be downloaded.

        Returns:
            A list holding the ``href`` value that follows each
            ``class="items"`` marker, in document order. The list is empty
            when ``res`` is ``None`` or an empty string.
        """
        if not res:
            # get_index returns None for a non-200 response; re.findall would
            # raise TypeError on it, so report "no links" instead.
            return []
        # re.S lets ".*?" span line breaks between the marker and the href.
        return re.findall(r'class="items".*?href="(.*?)"', res, re.S)
    
    def get_detail(urls):
        """Visit each detail page and download the video it embeds.

        Args:
            urls: Iterable of detail-page links. Links that do not start
                with ``http`` are treated as paths on www.xiaohuar.com.
        """
        for link in urls:
            # Site-relative links need the host prepended.
            if link.startswith('http'):
                page_url = link
            else:
                page_url = 'http://www.xiaohuar.com%s' % link
            page = requests.get(page_url)
            if page.status_code != 200:
                continue
            sources = re.findall(r'id="media".*?src="(.*?)"', page.text, re.S)
            if not sources:
                continue
            # Only the first matching media source is downloaded.
            save(sources[0])
    
    def save(url, save_dir=r'D:\爬虫视频'):
        """Download a video and store it under a generated ``.mp4`` file name.

        The file name is the MD5 hex digest of the URL combined with the
        current timestamp. Nothing is written when the server does not
        answer with status 200.

        Args:
            url: Direct address of the video file.
            save_dir: Directory that receives the file; created if missing.
        """
        import os  # local import: only this function needs it

        print('Download:%s' % url)
        # stream=True keeps the video out of memory; it is written chunk by
        # chunk below. The timeout stops a stalled server from blocking forever.
        with requests.get(url, stream=True, timeout=30) as r2:
            if r2.status_code != 200:
                return
            m = hashlib.md5()
            m.update(url.encode('utf-8'))
            m.update(str(time.time()).encode('utf-8'))
            filename = '%s.mp4' % m.hexdigest()
            # open() fails when the target directory does not exist yet.
            os.makedirs(save_dir, exist_ok=True)
            file_path = os.path.join(save_dir, filename)
            with open(file_path, 'wb') as f:
                for chunk in r2.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
    
    def main(pages=5):
        """Crawl the video list pages one by one and download every video.

        Args:
            pages: Number of list pages to crawl, numbered from 0.
        """
        for i in range(pages):
            res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)
            if res1 is None:
                # get_index yields None for a non-200 answer; skip this page
                # instead of letting the regex search fail on None.
                continue
            get_detail(parse_index(res1))
    
    # Start the crawl only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()

    基于上面的代码,使用线程池开启多线程爬取视频,优化下载速度

    # 异步,多线程优化下载速度
    
    import requests
    import re
    import hashlib,time
    from concurrent.futures import ThreadPoolExecutor
    
    # Module-level thread pool created with 30 as its worker limit; main()
    # and parse_index() both submit their tasks to it.
    p=ThreadPoolExecutor(30)
    
    def get_index(url, timeout=10):
        """Fetch one index page; intended to run as a thread-pool task.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The page HTML as text for a status-200 response; ``None`` for
            any other status or when the request fails (connection error,
            timeout, ...).
        """
        try:
            # A request without a timeout could tie up a worker thread forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print('Request failed:%s (%s)' % (url, exc))
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_index(res):
        """Parse a finished index download and queue one task per video page.

        Args:
            res: Future whose result is the index page HTML, or ``None``
                when the page could not be downloaded. ``res.result()``
                blocks until the download has finished.
        """
        html = res.result()
        if not html:
            # get_index yields None on a non-200 response; nothing to parse.
            return
        urls = re.findall(r'class="items".*?href="(.*?)"', html, re.S)
        for url in urls:
            # get_detail iterates over its argument, so hand it a one-item
            # list; a bare string would be walked character by character.
            p.submit(get_detail, [url])
    
    def get_detail(urls):
        """Visit detail pages and download the video each one embeds.

        Args:
            urls: A single detail-page link as a string, or an iterable of
                links. Links that do not start with ``http`` are treated as
                paths on www.xiaohuar.com.
        """
        if isinstance(urls, str):
            # The thread pool submits one link per task. Iterating a bare
            # string would yield single characters instead of the link.
            urls = [urls]
        for url in urls:
            if not url.startswith('http'):
                url = 'http://www.xiaohuar.com%s' % url
            try:
                # The timeout keeps a stalled server from blocking the worker.
                r1 = requests.get(url, timeout=10)
            except requests.RequestException as exc:
                print('Request failed:%s (%s)' % (url, exc))
                continue
            if r1.status_code != 200:
                continue
            url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
            if url_list:
                # Only the first matching media source is downloaded.
                save(url_list[0])
    
    def save(url, save_dir=r'D:\爬虫视频'):
        """Download a video and write it to a generated ``.mp4`` file.

        The file name is the MD5 hex digest of the URL combined with the
        current timestamp. Nothing is written when the server does not
        answer with status 200.

        Args:
            url: Direct address of the video file.
            save_dir: Directory that receives the file; created if missing.
        """
        import os  # local import: only this function needs it

        print('Download:%s' % url)
        # Many downloads run at once in the pool, so stream each video to
        # disk in chunks instead of buffering it whole in memory.
        with requests.get(url, stream=True, timeout=30) as r2:
            if r2.status_code != 200:
                return
            m = hashlib.md5()
            m.update(url.encode('utf-8'))
            m.update(str(time.time()).encode('utf-8'))
            filename = '%s.mp4' % m.hexdigest()
            # exist_ok=True makes this safe when several threads get here
            # at the same time.
            os.makedirs(save_dir, exist_ok=True)
            file_path = os.path.join(save_dir, filename)
            with open(file_path, 'wb') as f:
                for chunk in r2.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
    
    def main(pages=5):
        """Crawl the list pages concurrently and wait for every download.

        The module-level pool ``p`` is shut down at the end, so it cannot
        accept new tasks after this function returns.

        Args:
            pages: Number of list pages to crawl, numbered from 0.
        """
        # Start all index downloads at once.
        index_futures = [
            p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
            for i in range(pages)
        ]
        for future in index_futures:
            try:
                # parse_index waits on future.result() and submits the
                # get_detail tasks. Running it here rather than as a
                # done-callback guarantees every task is queued before the
                # pool shuts down; a callback that fires after main() has
                # returned can be refused with RuntimeError.
                parse_index(future)
            except Exception as exc:
                # Top-level boundary: report the page and keep crawling.
                print('Index page skipped:%s' % exc)
        # Block until all queued detail/download tasks have finished.
        p.shutdown(wait=True)
    
    # Start the threaded crawl only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    ZooKeeper概述(转)
    ZooKeeper典型应用场景(转)
    部署与管理ZooKeeper(转)
    Hbase分布式安装部署过程
    HBase安装
    使用info命令查看Redis信息和状态
    java定时调度器解决方案分类及特性介绍
    谈谈应用层切面设计
    七层协议和四层协议
    HTTP协议详解
  • 原文地址:https://www.cnblogs.com/shenbuer/p/7824422.html
Copyright © 2011-2022 走看看