zoukankan      html  css  js  c++  java
  • day1之校花网小试牛刀

    一 利用生成器来完成爬取校花网视频

      

    import requests 
    import re
    import os
    import hashlib
    import time
    
    # Directory where downloaded videos are saved.
    # NOTE(review): r'D:DOWNLOAD' has no path separator, so it is a
    # drive-relative path on Windows — probably meant r'D:\DOWNLOAD'; confirm.
    DOWLOAD_PATH=r'D:DOWNLOAD'
    
    def get_page(url):
        """Fetch *url* and return the response body as text.

        Returns None when the request fails or the server answers with a
        non-200 status; callers must be prepared for a None result.
        """
        try:
            # A timeout keeps one dead server from hanging the whole crawl.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException as e:
            # Best-effort crawl: report and skip pages that cannot be fetched
            # instead of silently swallowing every exception.
            print('get_page failed for %s: %s' % (url, e))
        return None
    
    def parse_index(index_contents):
        """Yield absolute detail-page URLs scraped from an index page.

        *index_contents* may be None (get_page returns None on a failed
        fetch); in that case the generator yields nothing instead of
        crashing inside re.findall.
        """
        if not index_contents:
            return
        detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
        for detail_url in detail_urls:
            # Relative links need the site prefix to be fetchable.
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            yield detail_url
    
    def parse_detail(detail_contents):
        """Yield the first video URL found on a detail page, if it is an mp4.

        *detail_contents* may be None (failed fetch); the generator then
        yields nothing instead of crashing inside re.findall.
        """
        if not detail_contents:
            return
        movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
        if movie_urls:
            movie_url = movie_urls[0]
            # Some pages embed flash or images; only .mp4 is downloadable here.
            if movie_url.endswith('mp4'):
                yield movie_url
    
    def download(movie_url):
        """Download *movie_url* and save it under DOWLOAD_PATH with a hashed name."""
        print(movie_url)
        try:
            response = requests.get(movie_url, timeout=30)
            if response.status_code == 200:
                data = response.content
                # Hash time + URL so repeated downloads never collide on disk.
                m = hashlib.md5()
                m.update(str(time.time()).encode('utf-8'))
                m.update(movie_url.encode('utf-8'))
                filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % m.hexdigest())
                # Create the target directory on first use instead of crashing.
                os.makedirs(DOWLOAD_PATH, exist_ok=True)
                with open(filepath, 'wb') as f:
                    f.write(data)
                    f.flush()
                    print('下载成功', movie_url)
        except (requests.RequestException, OSError) as e:
            # Best-effort: report the failure and move on to the next video
            # instead of silently swallowing every exception.
            print('download failed for %s: %s' % (movie_url, e))
    
    def main():
        """Crawl the first five index pages and download every mp4 found."""
        base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        for page in range(5):
            # Fetch the index page, then walk its detail-page links.
            index_html = get_page(base_url.format(page_num=page))
            for detail_url in parse_index(index_html):
                # Fetch each detail page and pull out the video URL(s).
                detail_html = get_page(detail_url)
                for movie_url in parse_detail(detail_html):
                    download(movie_url)
    
    
    
    if __name__ == '__main__':
        # Time the whole crawl for a rough sequential benchmark.
        t1=time.time()
        main()
        print(time.time()-t1)

    二 利用多线程优化上述代码

      

    import requests #pip install requests
    import re
    import os
    import hashlib
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    # Shared thread pool: 50 workers handle both page fetches and downloads.
    pool=ThreadPoolExecutor(50)
    # NOTE(review): r'D:DOWNLOAD' lacks a path separator (drive-relative
    # path on Windows) — probably meant r'D:\DOWNLOAD'; confirm.
    DOWLOAD_PATH=r'D:DOWNLOAD'
    
    def get_page(url):
        """Fetch *url* and return the response body as text.

        Returns None when the request fails or the server answers with a
        non-200 status; downstream callbacks must handle a None result.
        """
        try:
            # A timeout keeps one dead server from tying up a pool worker.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException as e:
            # Best-effort crawl: report and skip pages that cannot be fetched
            # instead of silently swallowing every exception.
            print('get_page failed for %s: %s' % (url, e))
        return None
    
    def parse_index(index_contents):
        """Future callback: extract detail links and submit their fetches.

        Receives the finished index-page future; a failed fetch resolves to
        None, in which case there is nothing to parse. An uncaught exception
        here would vanish inside the callback thread, so guard explicitly.
        """
        index_contents = index_contents.result()
        if not index_contents:
            return
        detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
        for detail_url in detail_urls:
            # Relative links need the site prefix to be fetchable.
            if not detail_url.startswith('http'):
                detail_url = 'http://www.xiaohuar.com' + detail_url
            pool.submit(get_page, detail_url).add_done_callback(parse_detail)
    
    def parse_detail(detail_contents):
        """Future callback: extract the mp4 URL and submit its download.

        Receives the finished detail-page future; a failed fetch resolves to
        None, in which case there is nothing to parse. An uncaught exception
        here would vanish inside the callback thread, so guard explicitly.
        """
        detail_contents = detail_contents.result()
        if not detail_contents:
            return
        movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
        if movie_urls:
            movie_url = movie_urls[0]
            # Only .mp4 embeds are downloadable; skip flash/images.
            if movie_url.endswith('mp4'):
                pool.submit(download, movie_url)
    
    def download(movie_url):
        """Download *movie_url* and save it under DOWLOAD_PATH with a hashed name."""
        try:
            response = requests.get(movie_url, timeout=30)
            if response.status_code == 200:
                data = response.content
                # Hash time + URL so concurrent downloads never collide on disk.
                m = hashlib.md5()
                m.update(str(time.time()).encode('utf-8'))
                m.update(movie_url.encode('utf-8'))
                filepath = os.path.join(DOWLOAD_PATH, '%s.mp4' % m.hexdigest())
                # Create the target directory on first use instead of crashing.
                os.makedirs(DOWLOAD_PATH, exist_ok=True)
                with open(filepath, 'wb') as f:
                    f.write(data)
                    f.flush()
                    print('下载成功', movie_url)
        except (requests.RequestException, OSError) as e:
            # Best-effort: report the failure and move on to the next video
            # instead of silently swallowing every exception.
            print('download failed for %s: %s' % (movie_url, e))
    
    def main():
        """Submit the five index-page fetches to the pool.

        All further work (parsing, detail fetches, downloads) continues
        asynchronously through the add_done_callback chain.
        """
        url_template = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        for page in range(5):
            future = pool.submit(get_page, url_template.format(page_num=page))
            future.add_done_callback(parse_index)
    
    
    if __name__ == '__main__':
        # Timing only covers submission here — the pool keeps working after
        # main() returns, so this prints well before downloads finish.
        t1=time.time()
        main()
        print(time.time()-t1)

    牛逼的代码

    三 自己根据egon讲的grep命令,类似的道理,写的爬取校花网图片的代码

      

    import requests,re,os
    def init(f):
        """Decorator that primes a coroutine-style generator.

        Creates the generator and advances it to its first ``yield`` so the
        caller can immediately use ``send()`` without the priming ``next()``.
        """
        from functools import wraps

        @wraps(f)  # preserve the wrapped generator's name/docstring
        def inner(*args, **kwargs):
            g = f(*args, **kwargs)
            next(g)  # advance to the first yield so send() works right away
            return g
        return inner
    def get(url):
        """Fetch *url* eagerly and return a zero-argument closure for its text.

        The HTTP request fires here, at call time; the closure only forces
        the encoding to gbk and returns the decoded body.
        """
        response = requests.get(url)

        def read_text():
            response.encoding = 'gbk'
            return response.text
        return read_text
    # NOTE(review): this fires an HTTP request at import time — a side
    # effect better moved under an `if __name__ == '__main__':` guard.
    xiaohua=get('http://www.xiaohuar.com/2014.html')
    xiaohua_contend=xiaohua()
    def search(target):
        """Scan the fetched page for image links; push (name, src) into *target*."""
        pattern = '<a href=.*? target=.*?><img width=.*?  alt="(?P<name>.*?)" src="(?P<src>.*?)" /></a>'
        for match in re.finditer(pattern, xiaohua_contend, re.S):
            target.send((match.group('name'), match.group('src')))
    @init
    def handle(target):
        """Pipeline stage: make relative image srcs absolute, forward downstream."""
        while True:
            name, src = yield
            if not src.startswith('http'):
                src = 'http://www.xiaohuar.com' + src
            target.send((name, src))
    @init
    def download():
        """Pipeline sink: fetch each (name, src) pair and write it to disk.

        The original had a broken escape (`'\'` swallowed the closing quote,
        a syntax error); the path is now built with os.path.join instead.
        """
        while True:
            name, src = yield
            r = requests.get(src)
            with open(os.path.join(r'D:校花网', name + '.jpg'), 'wb') as f:
                f.write(r.content)
    # Wire the pipeline: search -> handle (normalize URLs) -> download (sink).
    search(handle((download())))

     总结:

    egon授课。   

    生成器与协程有紧密的联系。

    生成器可以通过yield接收参数,通过send传值。

    生成器与多线程也有关系吗?没有吧。

    普通函数爬取视频也是可以用到多线程的。

    优化的余地:可以加上进度条,利用类实现。大概就是这个样式,copy的。

    def download_file(url, path):
        """Stream *url* to *path* in 10 KB chunks, printing progress per chunk.

        Converted from the copied Python 2 snippet: `print` statements became
        function calls, and the missing `closing` import is pulled in locally.
        """
        from contextlib import closing
        with closing(requests.get(url, stream=True)) as r:
            chunk_size = 1024 * 10
            content_size = int(r.headers['content-length'])
            print('下载开始')
            with open(path, "wb") as f:
                p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
                for chunk in r.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    p.output()
    class ProgressData(object):
        """Tracks and prints download progress for fixed-size chunks.

        The copied original was Python 2 with line-broken prints (an
        unterminated string and a dangling `.` continuation); this is the
        reconstructed Python 3 equivalent with the same format strings.
        """

        def __init__(self, block, size, unit, file_name=''):
            self.file_name = file_name
            self.block = block / 1000.0  # chunk size, scaled by 1000 (in `unit`)
            self.size = size / 1000.0    # total size, same scale
            self.unit = unit
            self.count = 0               # chunks written so far
            self.start = time.time()

        def output(self):
            """Print one progress line; call once per chunk written."""
            self.end = time.time()
            self.count += 1
            # Guard against a zero time delta on very fast chunks.
            speed = self.block / (self.end - self.start) if (self.end - self.start) > 0 else 0
            self.start = time.time()
            loaded = self.count * self.block
            progress = round(loaded / self.size, 4)
            if loaded >= self.size:
                print('%s下载完成\n' % self.file_name)
            else:
                print('{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'
                      .format(self.file_name, loaded, self.unit,
                              self.size, self.unit, progress, speed, self.unit))
                print('%50s' % ('/' * int((1 - progress) * 50)))
  • 相关阅读:
    POJ 1328 Radar Installation
    POJ 1700 Crossing River
    POJ 1700 Crossing River
    poj 3253 Fence Repair (贪心,优先队列)
    poj 3253 Fence Repair (贪心,优先队列)
    poj 3069 Saruman's Army(贪心)
    poj 3069 Saruman's Army(贪心)
    Redis 笔记与总结2 String 类型和 Hash 类型
    数据分析方法有哪些_数据分析方法
    数据分析方法有哪些_数据分析方法
  • 原文地址:https://www.cnblogs.com/654321cc/p/8254180.html
Copyright © 2011-2022 走看看