zoukankan      html  css  js  c++  java
  • 使用线程池进行爬虫

    import requests #pip3 install requests
    import re
    import hashlib
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    # Shared worker pool; the fetch, parse and download steps below all submit to it.
    pool = ThreadPoolExecutor(max_workers=50)
    # Location prefix combined with a generated file name when a video is saved.
    # NOTE(review): 'C:mp4' has no separator after the drive letter - confirm
    # this is the intended target directory.
    movie_path = r'C:mp4'
    
    def get_page(url, timeout=10):
        """Fetch ``url`` with an HTTP GET and return the body as text.

        Args:
            url: Address to request.
            timeout: Seconds to wait on the server before giving up, so a
                stalled connection cannot occupy a pool worker indefinitely.

        Returns:
            The response text when the status code is 200; ``None`` when the
            request fails or any other status code comes back.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure instead of hiding it, and
            # let the caller skip this page.
            print('%s request failed: %s' % (url, exc))
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    def parse_index(index_page):
        """Done-callback for an index-page fetch: queue each linked detail page.

        Args:
            index_page: Completed future whose result is the index page HTML,
                or ``None`` when the fetch produced no page.
        """
        html = index_page.result()
        if not html:
            # A failed fetch yields None, and re.findall would raise TypeError
            # on it inside the callback; there is nothing to parse anyway.
            return
        urls = re.findall('class="items".*?href="(.*?)"', html, re.S)
        for detail_url in urls:
            # Relative links are resolved against the site root.
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            pool.submit(get_page, detail_url).add_done_callback(parse_detail)
    
    def parse_detail(detail_page):
        """Done-callback for a detail-page fetch: queue the page's mp4 download.

        Args:
            detail_page: Completed future whose result is the detail page
                HTML, or ``None`` when the fetch produced no page.
        """
        html = detail_page.result()
        if not html:
            # A failed fetch yields None, and re.findall would raise TypeError
            # on it inside the callback; there is nothing to parse anyway.
            return
        sources = re.findall('id="media".*?src="(.*?)"', html, re.S)
        if not sources:
            return
        # Only the first media source on the page is considered.
        movie_url = sources[0]
        if movie_url.endswith('mp4'):
            pool.submit(get_movie, movie_url)
    
    def get_movie(url, timeout=60):
        """Download the video at ``url`` and save it under ``movie_path``.

        The file name is the MD5 hex digest of the current timestamp followed
        by the URL, with an ``.mp4`` extension. Nothing is written unless the
        server answers with status code 200.

        Args:
            url: Address of the video file.
            timeout: Seconds to wait on the server before giving up, so a
                stalled download cannot occupy a pool worker indefinitely.

        Returns:
            None. Success and failure are both reported on stdout.
        """
        import os  # local import: this function is the only user of os.path

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print('%s download failed: %s' % (url, exc))
            return
        if response.status_code != 200:
            return

        digest = hashlib.md5()
        digest.update(str(time.time()).encode('utf-8'))
        digest.update(url.encode('utf-8'))
        # os.path.join replaces the old '%s\%s.mp4' format string, whose '\%'
        # is an invalid escape sequence.
        filepath = os.path.join(movie_path, '%s.mp4' % digest.hexdigest())
        try:
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except OSError as exc:
            print('%s could not be saved to %s: %s' % (url, filepath, exc))
            return
        print('%s 下载成功' % url)
    
    def main():
        """Queue the five index pages (page numbers 0-4) for fetching.

        Each fetch is submitted to the shared pool with ``parse_index``
        attached as its done-callback. The function returns once the pages
        are queued; it does not wait for the queued work to finish.
        """
        template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        page_urls = [template.format(page_num=number) for number in range(5)]
        for page_url in page_urls:
            pending = pool.submit(get_page, page_url)
            pending.add_done_callback(parse_index)
    
    # Start the crawl only when this file is run as a script, not when imported.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    服务器运维
    mysql配置
    PHP-FPM进程数的设定
    vsftpd 安装配置详细教程
    php-fpm性能优化
    如果不知道MySQL当前使用配置文件(my.cnf)的路径的解决方法
    搭建linux+nginx+mysql+php环境
    PHP 页面编码声明方法详解(header或meta)
    Linux内核的一些知识。
    Connector框架笔记
  • 原文地址:https://www.cnblogs.com/ldq1996/p/8306015.html
Copyright © 2011-2022 走看看