zoukankan      html  css  js  c++  java
  • day1之校花网小试牛刀

    一 利用生成器来完成爬取校花网视频

      

    import requests 
    import re
    import os
    import hashlib
    import time
    
    # Directory that download() joins with the generated file name.
    # NOTE(review): 'D:DOWNLOAD' has no separator after the drive letter, which
    # Windows treats as a drive-relative path; a backslash may have been lost
    # when this snippet was published -- confirm the intended directory.
    DOWLOAD_PATH=r'D:DOWNLOAD'
    
    def get_page(url):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to request.

        Returns:
            The decoded body when the server answers with HTTP 200, otherwise
            ``None`` (non-200 status or a failed request).
        """
        try:
            # A timeout keeps one unresponsive server from stalling the crawl forever.
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure and let the caller skip this page.
            print('request failed', url, exc)
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    def parse_index(index_contents):
        """Yield the absolute detail-page URLs found in an index page.

        Args:
            index_contents: HTML text of an index page, or ``None``/empty when
                the page could not be fetched.

        Yields:
            One URL per ``class="items"`` link; links that do not start with
            ``http`` are prefixed with ``http://www.xiaohuar.com``.
        """
        if not index_contents:
            # get_page() returns None on failure; re.findall would raise TypeError on it.
            return
        detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
        for detail_url in detail_urls:
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            yield detail_url
    
    def parse_detail(detail_contents):
        """Yield the video URL embedded in a detail page, if it is an mp4.

        Args:
            detail_contents: HTML text of a detail page, or ``None``/empty when
                the page could not be fetched.

        Yields:
            The first ``src`` following ``id="media"`` when it ends with
            ``mp4``; nothing otherwise.
        """
        if not detail_contents:
            # get_page() returns None on failure; re.findall would raise TypeError on it.
            return
        movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
        if movie_urls:
            # Only the first match is considered.
            movie_url = movie_urls[0]
            if movie_url.endswith('mp4'):
                yield movie_url
    
    def download(movie_url):
        """Download ``movie_url`` into ``DOWLOAD_PATH`` under a hashed file name.

        The file name is the MD5 hex digest of the current time plus the URL,
        with an ``.mp4`` suffix. Failures are reported and skipped so that one
        bad video does not abort the crawl.

        Args:
            movie_url: Address of the video file.
        """
        print(movie_url)
        try:
            # A timeout keeps one unresponsive server from blocking the crawl forever.
            response = requests.get(movie_url, timeout=30)
        except requests.RequestException as exc:
            print('download failed', movie_url, exc)
            return
        if response.status_code != 200:
            return

        digest = hashlib.md5()
        digest.update(str(time.time()).encode('utf-8'))
        digest.update(movie_url.encode('utf-8'))
        filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % digest.hexdigest())
        try:
            # Create the target directory on first use instead of failing silently.
            os.makedirs(DOWLOAD_PATH, exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except OSError as exc:
            print('could not save', filepath, exc)
            return
        print('下载成功', movie_url)
    
    def main():
        """Crawl the index pages numbered 0 to 4 and download every video found."""
        index_template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        for page_num in range(5):
            # Request the index page and extract the detail-page links from it.
            index_html = get_page(index_template.format(page_num=page_num))

            # Request each detail page and extract the address of its video.
            for detail_url in parse_index(index_html):
                detail_html = get_page(detail_url)

                # Download the video(s) found on the detail page.
                for movie_url in parse_detail(detail_html):
                    download(movie_url)
    
    
    
    if __name__ == '__main__':
        # Run the crawl and print the elapsed wall-clock time in seconds.
        started_at = time.time()
        main()
        print(time.time() - started_at)

    二 利用多线程优化上述代码

      

    import requests #pip install requests
    import re
    import os
    import hashlib
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    # Shared thread pool (at most 50 workers) that every page fetch and video
    # download in this script is submitted to.
    pool=ThreadPoolExecutor(50)
    # Directory that download() joins with the generated file name.
    # NOTE(review): 'D:DOWNLOAD' has no separator after the drive letter, which
    # Windows treats as a drive-relative path; a backslash may have been lost
    # when this snippet was published -- confirm the intended directory.
    DOWLOAD_PATH=r'D:DOWNLOAD'
    
    def get_page(url):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to request.

        Returns:
            The decoded body when the server answers with HTTP 200, otherwise
            ``None`` (non-200 status or a failed request).
        """
        try:
            # A timeout keeps one unresponsive server from tying up a worker forever.
            response = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure and let the callback skip this page.
            print('request failed', url, exc)
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    def parse_index(index_contents):
        """Done-callback: schedule a fetch for every detail page linked from an index page.

        Args:
            index_contents: Completed future whose result is the index-page
                HTML returned by ``get_page`` (``None`` when the fetch failed).
        """
        index_contents = index_contents.result()
        if not index_contents:
            # A failed fetch yields None; re.findall would raise TypeError inside the callback.
            return
        detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
        for detail_url in detail_urls:
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            # Fetch the detail page on the pool and parse it once the fetch completes.
            pool.submit(get_page, detail_url).add_done_callback(parse_detail)
    
    def parse_detail(detail_contents):
        """Done-callback: schedule the download of the mp4 embedded in a detail page.

        Args:
            detail_contents: Completed future whose result is the detail-page
                HTML returned by ``get_page`` (``None`` when the fetch failed).
        """
        detail_contents = detail_contents.result()
        if not detail_contents:
            # A failed fetch yields None; re.findall would raise TypeError inside the callback.
            return
        movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
        if movie_urls:
            # Only the first match is considered.
            movie_url = movie_urls[0]
            if movie_url.endswith('mp4'):
                pool.submit(download, movie_url)
    
    def download(movie_url):
        """Download ``movie_url`` into ``DOWLOAD_PATH`` under a hashed file name.

        The file name is the MD5 hex digest of the current time plus the URL,
        with an ``.mp4`` suffix. Failures are reported and skipped so that one
        bad video does not affect the other downloads.

        Args:
            movie_url: Address of the video file.
        """
        try:
            # A timeout keeps one unresponsive server from tying up a worker forever.
            response = requests.get(movie_url, timeout=30)
        except requests.RequestException as exc:
            print('download failed', movie_url, exc)
            return
        if response.status_code != 200:
            return

        digest = hashlib.md5()
        digest.update(str(time.time()).encode('utf-8'))
        digest.update(movie_url.encode('utf-8'))
        filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % digest.hexdigest())
        try:
            # Create the target directory on first use instead of failing silently.
            os.makedirs(DOWLOAD_PATH, exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(response.content)
        except OSError as exc:
            print('could not save', filepath, exc)
            return
        print('下载成功', movie_url)
    
    def main():
        """Submit the index pages numbered 0 to 4 to the pool for fetching.

        Each fetch is handed to ``parse_index`` through a done-callback. This
        function only queues the work; it does not wait for the submitted
        tasks to finish.
        """
        index_template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        for page_num in range(5):
            # Request the index page; parse_index extracts the detail-page links.
            index_future = pool.submit(get_page, index_template.format(page_num=page_num))
            index_future.add_done_callback(parse_index)
    
    
    if __name__ == '__main__':
        # NOTE(review): main() only queues tasks on the pool, so the figure printed
        # here is the time spent submitting them, not the time the downloads take.
        started_at = time.time()
        main()
        print(time.time() - started_at)

    牛逼的代码

    三 自己根据egon讲的grep命令,类似的道理,写的爬取校花网图片的代码

      

    import requests,re,os
    def init(f):
        """Decorator that primes a generator-based coroutine.

        The decorated function is called, the resulting generator is advanced
        to its first ``yield`` with ``next()``, and the primed generator is
        returned so callers can ``send()`` to it immediately.

        Args:
            f: Generator function to wrap.

        Returns:
            A wrapper with the same call signature that returns the primed generator.
        """
        from functools import wraps  # local import: functools is not imported at file level

        @wraps(f)  # keep the decorated function's name and docstring
        def inner(*args, **kwargs):
            g = f(*args, **kwargs)
            next(g)
            return g
        return inner
    def get(url):
        """Request ``url`` immediately and return a zero-argument reader for its text.

        The returned callable sets the response encoding to ``gbk`` and
        returns the decoded body.
        """
        response = requests.get(url)

        def inner():
            response.encoding = 'gbk'
            return response.text

        return inner
    # Download the page once at module level; search() runs its regex over
    # the decoded text stored in xiaohua_contend.
    xiaohua=get('http://www.xiaohuar.com/2014.html')
    xiaohua_contend=xiaohua()
    def search(target):
        """Send every ``(name, src)`` pair matched in the downloaded page to ``target``.

        Args:
            target: Object with a ``send`` method (here a primed generator)
                that receives one ``(name, src)`` tuple per matched image link.
        """
        pattern = '<a href=.*? target=.*?><img width=.*?  alt="(?P<name>.*?)" src="(?P<src>.*?)" /></a>'
        for match in re.finditer(pattern, xiaohua_contend, re.S):
            pair = (match.group('name'), match.group('src'))
            target.send(pair)
    @init
    def handle(target):
        """Coroutine stage: make each image address absolute and forward it.

        Receives ``(name, src)`` tuples via ``send``. A ``src`` that does not
        start with ``http`` is prefixed with ``http://www.xiaohuar.com`` before
        the pair is sent on to ``target``.
        """
        while True:
            name, src = yield
            if not src.startswith('http'):
                src = 'http://www.xiaohuar.com' + src
            target.send((name, src))
    @init
    def download():
        """Coroutine sink: save every received image as ``<name>.jpg``.

        Receives ``(name, src)`` tuples via ``send``, requests ``src`` and
        writes the response body to a file named after ``name``.
        """
        while True:
            name, src = yield
            r = requests.get(src)
            # The published line joined the parts with '\' written as a one-character
            # string literal, which is an unterminated escape and does not parse;
            # os.path.join supplies the separator instead.
            # NOTE(review): 'D:校花网' has no separator after the drive letter
            # (drive-relative on Windows) -- confirm the intended directory.
            with open(os.path.join(r'D:校花网', name + '.jpg'), 'wb') as f:
                f.write(r.content)
    # Wire the pipeline: search() feeds handle(), which feeds download().
    search(handle(download()))

     总结:

    egon授课。   

    生成器与协程有紧密的联系。

    生成器可以通过yield接收参数,通过send传值。

    生成器与多线程也有关系吗?没有吧。

    普通函数爬取视频也是可以用到多线程的。

    优化的余地:可以加上进度条,利用类实现。大概就是这个样式,copy的。

    def download_file(url, path):
        """Stream ``url`` to ``path`` in chunks, reporting progress after each chunk.

        Args:
            url: Address of the file to download.
            path: Destination file path; opened in binary write mode.

        Raises:
            KeyError: If the response has no ``content-length`` header.
        """
        from contextlib import closing  # local import: not imported at file level

        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024*10
            content_size = int(r.headers['content-length'])
            # print() call instead of the Python 2 print statement, matching the
            # rest of the file.
            print('下载开始')
            with open(path, "wb") as f:
                p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    p.output()
    class ProgressData(object):
        """Console progress reporter for a chunked download.

        Sizes are passed in bytes and stored divided by 1000, so the printed
        figures are in units of 1000 bytes, labelled with ``unit``.
        """

        def __init__(self, block, size, unit, file_name='', ):
            """Store the chunk size, total size and labels, and start the clock.

            Args:
                block: Chunk size in bytes; each ``output()`` call accounts for one chunk.
                size: Total size in bytes.
                unit: Label printed after sizes and speed.
                file_name: Optional name printed at the start of each message.
            """
            self.file_name = file_name
            self.block = block/1000.0
            self.size = size/1000.0
            self.unit = unit
            self.count = 0
            self.start = time.time()

        def output(self):
            """Count one more chunk and print a progress line or the completion message."""
            self.end = time.time()
            self.count += 1
            elapsed = self.end - self.start
            speed = self.block/elapsed if elapsed > 0 else 0
            # Restart the clock so the next call measures only the next chunk.
            self.start = time.time()
            loaded = self.count*self.block
            # A zero total size counts as complete instead of dividing by zero.
            progress = round(loaded/self.size, 4) if self.size else 1.0
            if loaded >= self.size:
                print('%s下载完成\n' % self.file_name)
            else:
                print('{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'.format(
                    self.file_name, loaded, self.unit,
                    self.size, self.unit, progress, speed, self.unit))
                # Bar of '/' for the remaining share, right-aligned in 50 columns.
                print('%50s' % ('/'*int((1-progress)*50)))
  • 相关阅读:
    pip安装pyinstaller失败的解决方法
    导药仪端子接线方式
    聊聊信号的回勾和过冲(转)
    玩不好触发,就不算会用示波器
    TI DS125BR401A 官方DEMO板鉴赏+学习+分析
    导药仪射频卡连接线制作方式
    解决ISE14.7在win10中不稳定的问题
    VS2013编译VTK7.1.1
    QT5.8.0与VS2013环境配置
    The Architecture of Open Source Applications---VTK
  • 原文地址:https://www.cnblogs.com/654321cc/p/8254180.html
Copyright © 2011-2022 走看看