zoukankan      html  css  js  c++  java
  • 异步线程池爬取 校花网视频

    import re
    import requests
    
    # Exploratory run: fetch the video index page and collect the first href that
    # follows each `<div class="items">` marker.
    response = requests.get("http://www.xiaohuar.com/v/")

    url_s = re.compile('<div class="items">.*?href="(.*?)"', re.S).findall(response.text)
    for url in url_s:
        res = requests.get(url)
        # Every src attribute found after a <video id="media"> tag on that page;
        # the value is not used inside this loop.
        result = re.compile('<video id="media".*?src="(.*?)"', re.S).findall(res.text)
    
    def get_page(url, timeout=10):
        """Fetch *url* and return the response body as text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response text when the server answers with HTTP 200, otherwise
            ``None`` (non-200 status or a failed request).
        """
        try:
            # Without a timeout a stalled server would block the caller forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best-effort crawl: a page that cannot be fetched is skipped.
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    
    def parse_data(text):
        """Yield the detail-page links found in a listing page.

        Args:
            text: HTML of a listing page; ``None`` or an empty string (for
                example after a failed fetch) yields nothing.

        Yields:
            The first ``href`` value that follows each ``<div class="items">``
            marker. Empty values are skipped and links starting with ``/`` are
            prefixed with the site host so they can be requested directly.
        """
        if not text:
            return
        for url in re.findall('<div class="items">.*?href="(.*?)"', text, re.S):
            if not url:
                continue
            if url.startswith("/"):
                # Site-relative link: requests needs an absolute address.
                url = "http://www.xiaohuar.com" + url
            yield url
    
    def parse_detail(text):
        """Return the ``.mp4`` address of the video embedded in a detail page.

        Args:
            text: HTML of a detail page; falsy or non-text values are ignored.

        Returns:
            The first ``src`` found after a ``<video id="media"`` tag when it
            ends with ``.mp4``; otherwise ``None``.
        """
        if not text:
            return None
        try:
            movie_url_list = re.findall('<video id="media".*?src="(.*?)"', text, re.S)
        except TypeError:
            # `text` was not a string/bytes object, so there is nothing to parse.
            return None
        if movie_url_list and movie_url_list[0].endswith(".mp4"):
            return movie_url_list[0]
        return None
    import uuid
    def download_movie(movie_url, timeout=30):
        """Download one video and store it under a random file name.

        Args:
            movie_url: Direct address of the video file; falsy values are ignored.
            timeout: Seconds to wait for the server before giving up.

        Network and file-system failures are deliberately ignored so that one
        bad video does not stop the crawl; a partially written file is removed.
        """
        import os

        if not movie_url:
            return
        # NOTE(review): there is no separator after "D:", so on Windows this is
        # resolved relative to the current directory of drive D -- confirm that
        # this is the intended location.
        path = r"D:spider1movies\%s.mp4" % uuid.uuid4()
        try:
            with requests.get(movie_url, stream=True, timeout=timeout) as response:
                # Do not store an HTML error page under an .mp4 name.
                response.raise_for_status()
                with open(path, "wb") as f:
                    # Stream in chunks instead of holding the whole video in memory.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        except (requests.RequestException, OSError):
            try:
                os.remove(path)
            except OSError:
                pass
    
    
    
    
    
    
    
    if __name__ == '__main__':
        # Sequential crawl: listing page -> detail pages -> video download.
        base_url = "http://www.xiaohuar.com/list-3-{}.html"
        for line in range(1):
            url = base_url.format(line)
            # 1. Request the listing page.
            index_text = get_page(url)
            if not index_text:
                # get_page returns None when the fetch fails; nothing to parse.
                continue
            # 2. Extract the detail-page links.
            urls = parse_data(index_text)

            for url in urls:
                # Fetch the detail page text.
                detail_text = get_page(url)
                if not detail_text:
                    continue

                movie_url = parse_detail(detail_text)
                if not movie_url:
                    # No .mp4 source on this page.
                    continue
                # Save the video.
                download_movie(movie_url)
    
    
    from concurrent.futures import ThreadPoolExecutor
    # Shared worker pool used by the callbacks defined below.
    pool = ThreadPoolExecutor(max_workers=50)

    # Exploratory run: fetch the video index page and collect the first href that
    # follows each `<div class="items">` marker.
    response = requests.get("http://www.xiaohuar.com/v/")

    url_s = re.compile('<div class="items">.*?href="(.*?)"', re.S).findall(response.text)
    for url in url_s:
        res = requests.get(url)
        # Every src attribute found after a <video id="media"> tag on that page;
        # the value is not used inside this loop.
        result = re.compile('<video id="media".*?src="(.*?)"', re.S).findall(res.text)
    
    def get_page(url, timeout=10):
        """Fetch *url* and return the response body as text.

        The address is printed before the request is sent.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response text when the server answers with HTTP 200, otherwise
            ``None`` (non-200 status or a failed request).
        """
        print(url)
        try:
            # Without a timeout a stalled server would tie up a worker forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Best-effort crawl: a page that cannot be fetched is skipped.
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    
    
    def parse(res):
        """Done-callback for a listing-page fetch: queue a fetch per detail link.

        ``res`` must expose ``result()`` returning the page text (or a falsy
        value, in which case nothing happens). Each non-empty link is submitted
        to ``pool`` via ``get_page`` with ``parse_detail`` as its callback; links
        starting with ``/`` are prefixed with the site host first.
        """
        text = res.result()
        if not text:
            return
        for link in re.findall('<div class="items">.*?href="(.*?)"', text, re.S):
            if not link:
                continue
            target = "http://www.xiaohuar.com" + link if link.startswith("/") else link
            pool.submit(get_page, target).add_done_callback(parse_detail)
    
    def parse_detail(res):
        """Done-callback for a detail-page fetch: queue the video download.

        Args:
            res: Object exposing ``result()`` that returns the detail page text;
                falsy or non-text results are ignored.

        When the first ``src`` found after a ``<video id="media"`` tag ends with
        ``.mp4``, ``download_movie`` is submitted to ``pool`` with that address.
        """
        text = res.result()
        if not text:
            return
        try:
            movie_url_list = re.findall('<video id="media".*?src="(.*?)"', text, re.S)
        except TypeError:
            # The result was not a string/bytes object, so there is nothing to parse.
            return
        if movie_url_list and movie_url_list[0].endswith(".mp4"):
            pool.submit(download_movie, movie_url_list[0])
    import uuid
    def download_movie(movie_url, timeout=30):
        """Download one video and store it under a random file name.

        Args:
            movie_url: Direct address of the video file; falsy values are ignored.
            timeout: Seconds to wait for the server before giving up.

        Network and file-system failures are deliberately ignored so that one
        bad video does not stop the crawl; a partially written file is removed.
        """
        import os

        if not movie_url:
            return
        # NOTE(review): there is no separator after "D:", so on Windows this is
        # resolved relative to the current directory of drive D -- confirm that
        # this is the intended location.
        path = r"D:spider1movies\%s.mp4" % uuid.uuid4()
        try:
            with requests.get(movie_url, stream=True, timeout=timeout) as response:
                # Do not store an HTML error page under an .mp4 name.
                response.raise_for_status()
                with open(path, "wb") as f:
                    # Stream in chunks: many workers may download at once, so
                    # whole videos should not be held in memory.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
        except (requests.RequestException, OSError):
            try:
                os.remove(path)
            except OSError:
                pass
    
    
    
    
    
    
    
    if __name__ == '__main__':
        # Threaded crawl: queue one listing-page fetch per page index and let the
        # `parse` callback fan out from there.
        # NOTE(review): nothing here waits for the queued work; confirm that the
        # callbacks can still submit to `pool` once the main thread has finished
        # on the targeted Python version.
        base_url = "http://www.xiaohuar.com/list-3-{}.html"
        for line in range(2):
            url = base_url.format(line)
            # 1. Send the request on a worker thread.
            future = pool.submit(get_page, url)
            future.add_done_callback(parse)
  • 相关阅读:
    Laravel + Vue 之 OPTIONS 请求的处理
    Vue2.0 keep-alive 组件的最佳实践
    Vue.js 登录注册实现
    数仓项目04:环境搭建(MysqlHA+Hive)
    CentOS7_JDK安装和环境变量配置
    改环境变量改出问题了,vi,ls这些命令都不能用了,怎么办
    CentOS7配置网络
    curl命令下载jdk
    CentOS7设置IP地址
    ODPS
  • 原文地址:https://www.cnblogs.com/tangda/p/10932916.html
Copyright © 2011-2022 走看看