  • Multiprocess Development Through the Lens of a Web Scraper

    Introduction

    I needed news material to draw on for English practical writing, and the only information aggregation platform that came to mind was the newspaper, so I decided to download the People's Daily (Renmin Ribao).

    References

    https://www.liaoxuefeng.com/wiki/1016959663602400/1017628290184064
    https://blog.csdn.net/qq_38161040/article/details/88366427
    https://blog.csdn.net/baidu_28479651/article/details/76158051?utm_source=blogxgwz7

    Code: version 1

    About 70% manual, 30% automatic: every issue requires creating folders and adjusting the download count by hand.

    # coding: utf-8
    # Scrape the PDF documents linked from a hand-written HTML page:
    # file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
    
    import urllib.request
    import re
    import os
    
    # open the url and read
    def getHtml(url):
        page = urllib.request.urlopen(url)
        html = page.read()
        page.close()
        return html
    
    # Compile the regular expression and collect every match
    # (not used in this script).
    def getUrl(html):
        reg = r'([A-Z]\d+)'  # matches strings like G176200001
        url_re = re.compile(reg)
        url_lst = url_re.findall(html.decode('UTF-8'))  # return the list of matches
        return url_lst
    
    def getFile(url):
        file_name = url.split('/')[-1]
        u = urllib.request.urlopen(url)
        f = open(file_name, 'wb')
    
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
    
            f.write(buffer)
        f.close()
        print ("Sucessful to download" + " " + file_name)
    
    
    
    
    if __name__ == '__main__':
        # Example page URLs (date and page number are both zero-padded):
        # http://paper.people.com.cn/rmrb/page/2020-03/26/02/rmrb2020032602.pdf
        # http://paper.people.com.cn/rmrb/page/2020-03/26/03/rmrb2020032603.pdf
        for i in range(20):
            if i + 1 < 10:
                getFile("http://paper.people.com.cn/rmrb/page/2020-03/07/0" + str(i + 1) + "/rmrb202003070" + str(i + 1) + ".pdf")
            else:
                getFile("http://paper.people.com.cn/rmrb/page/2020-03/07/" + str(i + 1) + "/rmrb20200307" + str(i + 1) + ".pdf")
    

    Code: version 2, with automatic folder creation

    Downloads are still sequential, so this version is slow; be prepared to wait.

    # coding: utf-8
    # Scrape the PDF documents linked from a hand-written HTML page:
    # file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
    
    import urllib.request
    import re
    import os
    import shutil
    
    # open the url and read
    def getHtml(url):
        page = urllib.request.urlopen(url)
        html = page.read()
        page.close()
        return html
    
    # Compile the regular expression and collect every match
    # (not used in this script).
    def getUrl(html):
        reg = r'([A-Z]\d+)'  # matches strings like G176200001
        url_re = re.compile(reg)
        url_lst = url_re.findall(html.decode('UTF-8'))  # return the list of matches
        return url_lst
    
    def getFile(url):
        file_name = url.split('/')[-1]
        u = urllib.request.urlopen(url)
        f = open(file_name, 'wb')
    
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
    
            f.write(buffer)
        f.close()
        print ("Sucessful to download" + " " + file_name)
        return file_name
    
    if __name__ == '__main__':
        # One folder per day of February 2020: "0201" .. "0229".
        for i in range(29):
            data = str(i + 1)
            if i + 1 < 10:
                data = "0" + data
            folderName = "02" + data
            os.mkdir(folderName)  # raises OSError if the folder already exists

            # Up to 20 pages per issue; a missing page raises an HTTPError
            # (a subclass of OSError), which is skipped silently.
            for j in range(20):
                try:
                    if j + 1 < 10:
                        fileName = "http://paper.people.com.cn/rmrb/page/2020-02/" + data + "/0" + str(j + 1) + "/rmrb202002" + data + "0" + str(j + 1) + ".pdf"
                    else:
                        fileName = "http://paper.people.com.cn/rmrb/page/2020-02/" + data + "/" + str(j + 1) + "/rmrb202002" + data + str(j + 1) + ".pdf"
                    tmp = getFile(fileName)
                    shutil.move(tmp, folderName)
                except OSError:
                    pass
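
    Two rough edges are worth noting: os.mkdir raises if the folder already exists, so a re-run dies immediately, and every PDF is first downloaded into the working directory and then moved. A minimal sketch that writes straight into the target folder instead (get_file_into is a hypothetical helper, not part of the original script):

    # Sketch: download a PDF directly into its destination folder;
    # "get_file_into" is a hypothetical replacement for getFile + shutil.move.
    import os
    import urllib.request

    def get_file_into(url, folder):
        os.makedirs(folder, exist_ok=True)  # no error if the folder exists
        path = os.path.join(folder, url.split('/')[-1])
        with urllib.request.urlopen(url) as u, open(path, 'wb') as f:
            while True:
                buffer = u.read(8192)
                if not buffer:
                    break
                f.write(buffer)
        return path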
    

    Code: multiprocess download

    Extremely satisfying: every day of the month downloads in parallel.

    # coding: utf-8
    # Scrape the PDF documents linked from a hand-written HTML page:
    # file:///E:/ZjuTH/Documents/pythonCode/pythontest.html
    
    import urllib.request
    import re
    import os
    import shutil
    from multiprocessing import Pool
    import time

    # open the url and read
    def getHtml(url):
        page = urllib.request.urlopen(url)
        html = page.read()
        page.close()
        return html
    
    # Compile the regular expression and collect every match
    # (not used in this script).
    def getUrl(html):
        reg = r'([A-Z]\d+)'  # matches strings like G176200001
        url_re = re.compile(reg)
        url_lst = url_re.findall(html.decode('UTF-8'))  # return the list of matches
        return url_lst
    
    def getFile(url):
        file_name = url.split('/')[-1]
        u = urllib.request.urlopen(url)
        f = open(file_name, 'wb')
    
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
    
            f.write(buffer)
        f.close()
        print ("Sucessful to download" + " " + file_name)
        return file_name

    def download(i):
        # Download every page of the issue for day i+1 of January 2020
        # into its own folder: "0101" .. "0131".
        data = str(i + 1)
        if i + 1 < 10:
            data = "0" + data
        folderName = "01" + data
        os.mkdir(folderName)  # raises OSError if the folder already exists

        # Up to 20 pages per issue; a missing page raises an HTTPError
        # (a subclass of OSError), which is skipped silently.
        for j in range(20):
            try:
                if j + 1 < 10:
                    fileName = "http://paper.people.com.cn/rmrb/page/2020-01/" + data + "/0" + str(j + 1) + "/rmrb202001" + data + "0" + str(j + 1) + ".pdf"
                else:
                    fileName = "http://paper.people.com.cn/rmrb/page/2020-01/" + data + "/" + str(j + 1) + "/rmrb202001" + data + str(j + 1) + ".pdf"
                tmp = getFile(fileName)
                shutil.move(tmp, folderName)
            except OSError:
                pass
    
    if __name__ == '__main__':
        # One worker process per day of January 2020.
        p = Pool(31)
        for i in range(31):
            p.apply_async(download, args=(i,))
        p.close()
        p.join()
        print('All subprocesses done.')
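
    A pool of 31 workers, one per day, is fine for an I/O-bound job like this, but the pool size does not have to match the task count: apply_async only queues the tasks, and a smaller pool works through the queue. A minimal sketch with a bounded pool (the size of 8 is an arbitrary assumption, not a tuned value):

    # Sketch: all 31 days are queued, but at most POOL_SIZE downloads
    # run at once (POOL_SIZE = 8 is an assumption, not a tuned value).
    from multiprocessing import Pool

    POOL_SIZE = 8

    if __name__ == '__main__':
        with Pool(POOL_SIZE) as p:
            p.map(download, range(31))  # blocks until every day is done
        print('All subprocesses done.')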
    