zoukankan      html  css  js  c++  java
  • 从爬虫看多进程开发

    简介

    因为写英文应用文与写作需要参考新闻信息,但是,我脑子里除了报纸没有其他更好的信息整合平台。遂打算下载人民日报。

    参考链接

    https://www.liaoxuefeng.com/wiki/1016959663602400/1017628290184064
    https://blog.csdn.net/qq_38161040/article/details/88366427
    https://blog.csdn.net/baidu_28479651/article/details/76158051?utm_source=blogxgwz7

    code 第一版

    70%手动 30%自动 需要频繁地创建文件夹和更改下载次数

    # coding=UTF-8
    # 爬取自己编写的html链接中的PDF文档,网址:file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
    
    import urllib.request
    import re
    import os
    
    # open the url and read
    def getHtml(url):
        """Fetch ``url`` and return the whole response body as bytes.

        Args:
            url: Any URL that ``urllib.request.urlopen`` accepts.

        Returns:
            The undecoded response body.

        Raises:
            urllib.error.URLError: If the request fails.
        """
        # The context manager closes the response even when read() raises,
        # which the previous explicit close() call did not guarantee.
        with urllib.request.urlopen(url) as page:
            return page.read()
    
    # compile the regular expressions and find
    # all stuff we need
    def getUrl(html):
        """Return every ``<uppercase letter><digits>`` identifier found in ``html``.

        Args:
            html: UTF-8 encoded page content as bytes.

        Returns:
            A list of matched strings (e.g. ``"G176200001"``) in document
            order; empty when nothing matches.

        Raises:
            UnicodeDecodeError: If ``html`` is not valid UTF-8.
        """
        # The pattern needs ``\d`` (a digit); the previous ``d+`` matched the
        # literal letter "d" and therefore never matched ids like G176200001.
        reg = r'([A-Z]\d+)'
        url_re = re.compile(reg)
        return url_re.findall(html.decode('UTF-8'))
    
    def getFile(url):
        """Download ``url`` into the current working directory.

        The local file name is the last ``/``-separated segment of ``url``.
        The body is copied in 8192-byte chunks so the whole file is never
        held in memory at once.

        Args:
            url: Address of the file to download.

        Raises:
            OSError: If the request fails or the local file cannot be written
                (``urllib.error.URLError`` is an ``OSError`` subclass).
        """
        file_name = url.split('/')[-1]
        block_sz = 8192

        # Both handles are released on every exit path. The URL is opened
        # first, so a failed request does not leave an empty local file.
        with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break
                f.write(buffer)
        print ("Sucessful to download" + " " + file_name)
    
    
    
    
    if __name__ == '__main__':
        # Download pages 01..20 of the 2020-03-07 issue into the current
        # directory. The date is hard-coded and must be edited by hand for
        # another issue. The first failed download raises and stops the run.
        base = "http://paper.people.com.cn/rmrb/page/2020-03/07/"
        for page in range(1, 21):
            # Page numbers are always two digits in both the directory and
            # the file name, e.g. .../2020-03/07/02/rmrb2020030702.pdf
            page_no = "{:02d}".format(page)
            getFile(base + page_no + "/rmrb20200307" + page_no + ".pdf")
    

    code 第二版 自动创建文件夹版本

    下载速度较慢需要等待

    # coding=UTF-8
    # 爬取自己编写的html链接中的PDF文档,网址:file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
    
    import urllib.request
    import re
    import os
    import shutil
    
    # open the url and read
    def getHtml(url):
        """Fetch ``url`` and return the whole response body as bytes.

        Args:
            url: Any URL that ``urllib.request.urlopen`` accepts.

        Returns:
            The undecoded response body.

        Raises:
            urllib.error.URLError: If the request fails.
        """
        # The context manager closes the response even when read() raises,
        # which the previous explicit close() call did not guarantee.
        with urllib.request.urlopen(url) as page:
            return page.read()
    
    # compile the regular expressions and find
    # all stuff we need
    def getUrl(html):
        """Return every ``<uppercase letter><digits>`` identifier found in ``html``.

        Args:
            html: UTF-8 encoded page content as bytes.

        Returns:
            A list of matched strings (e.g. ``"G176200001"``) in document
            order; empty when nothing matches.

        Raises:
            UnicodeDecodeError: If ``html`` is not valid UTF-8.
        """
        # The pattern needs ``\d`` (a digit); the previous ``d+`` matched the
        # literal letter "d" and therefore never matched ids like G176200001.
        reg = r'([A-Z]\d+)'
        url_re = re.compile(reg)
        return url_re.findall(html.decode('UTF-8'))
    
    def getFile(url):
        """Download ``url`` into the current working directory.

        The local file name is the last ``/``-separated segment of ``url``.
        The body is copied in 8192-byte chunks so the whole file is never
        held in memory at once.

        Args:
            url: Address of the file to download.

        Returns:
            The name of the file that was written (relative to the current
            working directory).

        Raises:
            OSError: If the request fails or the local file cannot be written
                (``urllib.error.URLError`` is an ``OSError`` subclass).
        """
        file_name = url.split('/')[-1]
        block_sz = 8192

        # Both handles are released on every exit path. The URL is opened
        # first, so a failed request does not leave an empty local file.
        with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break
                f.write(buffer)
        print ("Sucessful to download" + " " + file_name)
        return file_name
    
    if __name__ == '__main__':
        # Download every page of every February 2020 issue (days 01..29),
        # placing each day's PDFs in a folder named "02DD".
        for day in range(1, 30):
            data = "{:02d}".format(day)
            folderName = "02" + data
            # exist_ok lets the script be re-run; a plain mkdir raised
            # FileExistsError as soon as the first folder already existed.
            os.makedirs(folderName, exist_ok=True)

            # Up to 20 pages are tried per day.
            for page in range(1, 21):
                page_no = "{:02d}".format(page)
                fileName = ("http://paper.people.com.cn/rmrb/page/2020-02/"
                            + data + "/" + page_no
                            + "/rmrb202002" + data + page_no + ".pdf")
                try:
                    # getFile saves into the current directory; the file is
                    # then moved into the folder for its day.
                    tmp = getFile(fileName)
                    shutil.move(tmp, folderName)
                except OSError as err:
                    # Best effort: a page that cannot be fetched or moved is
                    # reported and skipped instead of being ignored silently.
                    print("Skipped " + fileName + ": " + str(err))
    

    code 多进程下载

    超级爽

    # coding=UTF-8
    # 爬取自己编写的html链接中的PDF文档,网址:file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
    
    import urllib.request
    import re
    import os
    import shutil
    from multiprocessing import Pool
    import time
    # open the url and read
    def getHtml(url):
        """Fetch ``url`` and return the whole response body as bytes.

        Args:
            url: Any URL that ``urllib.request.urlopen`` accepts.

        Returns:
            The undecoded response body.

        Raises:
            urllib.error.URLError: If the request fails.
        """
        # The context manager closes the response even when read() raises,
        # which the previous explicit close() call did not guarantee.
        with urllib.request.urlopen(url) as page:
            return page.read()
    
    # compile the regular expressions and find
    # all stuff we need
    def getUrl(html):
        """Return every ``<uppercase letter><digits>`` identifier found in ``html``.

        Args:
            html: UTF-8 encoded page content as bytes.

        Returns:
            A list of matched strings (e.g. ``"G176200001"``) in document
            order; empty when nothing matches.

        Raises:
            UnicodeDecodeError: If ``html`` is not valid UTF-8.
        """
        # The pattern needs ``\d`` (a digit); the previous ``d+`` matched the
        # literal letter "d" and therefore never matched ids like G176200001.
        reg = r'([A-Z]\d+)'
        url_re = re.compile(reg)
        return url_re.findall(html.decode('UTF-8'))
    
    def getFile(url):
        """Download ``url`` into the current working directory.

        The local file name is the last ``/``-separated segment of ``url``.
        The body is copied in 8192-byte chunks so the whole file is never
        held in memory at once.

        Args:
            url: Address of the file to download.

        Returns:
            The name of the file that was written (relative to the current
            working directory).

        Raises:
            OSError: If the request fails or the local file cannot be written
                (``urllib.error.URLError`` is an ``OSError`` subclass).
        """
        file_name = url.split('/')[-1]
        block_sz = 8192

        # Both handles are released on every exit path. The URL is opened
        # first, so a failed request does not leave an empty local file.
        with urllib.request.urlopen(url) as u, open(file_name, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break
                f.write(buffer)
        print ("Sucessful to download" + " " + file_name)
        return file_name
    def download(i):
        """Download all pages of one January 2020 issue into its own folder.

        Args:
            i: Zero-based day index; ``i + 1`` is the day of the month, so
                ``0`` means 2020-01-01.

        The PDFs are first saved in the current directory by ``getFile`` and
        then moved into a folder named ``"01DD"``. Pages that cannot be
        fetched or moved are reported and skipped.
        """
        data = "{:02d}".format(i + 1)
        folderName = "01" + data
        # exist_ok lets the script be re-run; a plain mkdir raised
        # FileExistsError whenever the folder already existed.
        os.makedirs(folderName, exist_ok=True)

        # Up to 20 pages are tried per day.
        for page in range(1, 21):
            page_no = "{:02d}".format(page)
            fileName = ("http://paper.people.com.cn/rmrb/page/2020-01/"
                        + data + "/" + page_no
                        + "/rmrb202001" + data + page_no + ".pdf")
            try:
                tmp = getFile(fileName)
                shutil.move(tmp, folderName)
            except OSError as err:
                # Best effort: report the failed page instead of swallowing
                # the error silently, then carry on with the next page.
                print("Skipped " + fileName + ": " + str(err))
    
    if __name__ == '__main__':
        # One worker process per day of January 2020 (31 days).
        p = Pool(31)
        results = [p.apply_async(download, args = (i,)) for i in range(31)]
        p.close()
        p.join()
        # apply_async stores a worker's exception in its result object; it
        # is only raised by get(), so check every result to avoid losing
        # failures silently.
        for i, result in enumerate(results):
            try:
                result.get()
            except Exception as exc:
                print('Day index {} failed: {!r}'.format(i, exc))
        print('All subprocesses done.')
    
    Hope is a good thing, maybe the best of things, and no good thing ever dies. — Andy Dufresne
  • 相关阅读:
    什么是X86和X86-64
    PyQt 5 的学习引言
    GraphQL实战篇(一)
    GraphQL基础篇
    .net之设计模式
    .net面向对象设计原则
    Entity Framework (EF) Core工具创建一对多和多对多的关系
    NLog组件
    给dao层注入jdbcTemplate时的一个强行bug(jdbcDaoSupport不要随便用!用了要记得!)
    使用工厂模式解耦和IoC思想
  • 原文地址:https://www.cnblogs.com/eat-too-much/p/12615094.html
Copyright © 2011-2022 走看看