  • Crawl all the links of one or more websites with a single process, a thread pool, and plain multi-threading, then open every link in a browser and save a screenshot (Python)
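
    All three scripts below are Python 2 (print statements, the ConfigParser and Queue modules) and share the same helper functions. Each expects, next to the script, a urls.txt listing the sites to crawl (one URL per line) and a browserAndDriver.ini pointing at the Selenium browser drivers (a plausible sample of the .ini follows the first script). A minimal urls.txt, with placeholder sites, might look like:

        http://www.cnblogs.com
        https://www.baidu.com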

    #coding=utf-8
    import requests
    import re,os,time,ConfigParser
    from selenium import webdriver

    ###### Single process ######

    # Create the directory where screenshots will be saved (named after today's date)

    def createImagesPath():
        dirname=os.path.dirname(os.path.abspath(__file__))
        images_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(images_path):
                os.mkdir(images_path)
            print "Screenshot directory:",images_path
            return images_path
        except Exception as e:
            print e

    # Read the site URLs to crawl from a file (one URL per line)
    def getWebUrls(web_urls_file_path):
        web_urls=[]
        try:
            with open(web_urls_file_path) as fp:
                lines=fp.readlines()
            for line in lines:
                if line.strip():
                    web_urls.append(line.strip())
            return web_urls
        except Exception as e:
            print e

    # Collect all valid links from a single page
    def getLinks(web_url):
        try:
            response=requests.get(web_url)
            html=response.text
            links=re.findall(r'href="(.*?)"',html)
            valid_links=[]
            invalid_links=[]
            for link in links:
                link=link.strip()
                if link.startswith("//"):
                    # protocol-relative URL: prepend a scheme
                    valid_links.append("http:"+link)
                elif link=="" or link=="#" or link=="/" or "javascript" in link or "mailto:" in link:
                    invalid_links.append(link)
                # skip static resources and single-slash site-relative paths
                elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$",link) or re.match(r'/[^/].*',link):
                    invalid_links.append(link)
                else:
                    valid_links.append(link)
            valid_links=list(set(valid_links))  # de-duplicate
            return valid_links
        except Exception as e:
            print e
            return []  # callers concatenate results, so never return None
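
    # For example, given these href values, the filter above would:
    #   "//static.example.com/x"       -> keep, as "http://static.example.com/x"
    #   "#", "/", "javascript:void(0)", "mailto:a@b.com" -> drop
    #   "/css/site.css", "/about"      -> drop (static resource / single-slash relative path)
    #   "http://www.example.com/news"  -> keep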

    # Save the valid links to links.txt under the dated directory
    def saveLinks(links):
        dirname=os.path.dirname(os.path.abspath(__file__))
        links_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(links_path):
                os.mkdir(links_path)
            links_file_path=os.path.join(links_path,"links.txt")
            print "Links saved to:",links_file_path
            with open(links_file_path,"w") as fp:
                fp.writelines([link+"\n" for link in links])  # one link per line
        except Exception as e:
            print e

    # Drive a real browser: open each link and save a screenshot
    class OpenLinkAndSaveImg(object):
        def __init__(self,browser_type):
            try:
                # build the config path with os.path.join ("\b" in a string literal is a backspace escape)
                configFilePath=os.path.join(os.path.dirname(os.path.abspath(__file__)),"browserAndDriver.ini")
                print "Browser driver config file:",configFilePath
                cf=ConfigParser.ConfigParser()
                cf.read(configFilePath)
                browser_type=browser_type.strip().lower()
                # the .ini stores the path as a quoted string literal, hence the eval() below
                driver_path=cf.get("browser_driver",browser_type).strip()
                print "Browser: %s, driver path: %s"%(browser_type,driver_path)
                if browser_type=="ie":
                    self.driver=webdriver.Ie(executable_path=eval(driver_path))
                elif browser_type=="chrome":
                    self.driver=webdriver.Chrome(executable_path=eval(driver_path))
                elif browser_type=="firefox":
                    self.driver=webdriver.Firefox(executable_path=eval(driver_path))
                else:
                    print "invalid browser!"
            except Exception as e:
                print e

        # Open one link and save its screenshot as <index+1>.png
        def openLinkAndSaveImg(self,link_index_imgspath):
            try:
                link,index,imgspath=link_index_imgspath
                self.driver.get(link)
                self.driver.maximize_window()
                self.driver.get_screenshot_as_file(os.path.join(imgspath,str(index+1)+".png"))
            except Exception as e:
                print e

        def end(self):
            self.driver.quit()


    if __name__=="__main__":

        # Single process
        imgs_path=createImagesPath()
        weburls=getWebUrls(os.path.join(os.path.dirname(os.path.abspath(__file__)),"urls.txt"))
        links=[]
        start_time=time.time()
        for weburl in weburls:
            links+=getLinks(weburl)
        print "Link count: %s; collecting all links took: %s"%(len(links),time.time()-start_time)
        saveLinks(links)
        start_time1=time.time()
        open_link_and_save_img=OpenLinkAndSaveImg("ie")
        for i in range(len(links)):
            open_link_and_save_img.openLinkAndSaveImg((links[i],i,imgs_path))
        open_link_and_save_img.end()
        print "Single process: opening all links and taking screenshots took:",time.time()-start_time1
       

    ###### Multi-threading with a thread pool (concurrent execution) ######


    #coding=utf-8
    import requests
    import re,os,time,ConfigParser
    from selenium import webdriver
    from multiprocessing.dummy import Pool

    # Create the directory where screenshots will be saved (named after today's date)
    def createImagesPath():
        dirname=os.path.dirname(os.path.abspath(__file__))
        images_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(images_path):
                os.mkdir(images_path)
            print "Screenshot directory:",images_path
            return images_path
        except Exception as e:
            print e

    # Read the site URLs to crawl from a file (one URL per line)
    def getWebUrls(web_urls_file_path):
        web_urls=[]
        try:
            with open(web_urls_file_path) as fp:
                lines=fp.readlines()
            for line in lines:
                if line.strip():
                    web_urls.append(line.strip())
            return web_urls
        except Exception as e:
            print e

    # Collect all valid links from a single page
    def getLinks(web_url):
        try:
            response=requests.get(web_url)
            html=response.text
            links=re.findall(r'href="(.*?)"',html)
            valid_links=[]
            invalid_links=[]
            for link in links:
                link=link.strip()
                if link.startswith("//"):
                    # protocol-relative URL: prepend a scheme
                    valid_links.append("http:"+link)
                elif link=="" or link=="#" or link=="/" or "javascript" in link or "mailto:" in link:
                    invalid_links.append(link)
                # skip static resources and single-slash site-relative paths
                elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$",link) or re.match(r'/[^/].*',link):
                    invalid_links.append(link)
                else:
                    valid_links.append(link)
            valid_links=list(set(valid_links))  # de-duplicate
            return valid_links
        except Exception as e:
            print e
            return []  # callers concatenate results, so never return None

    # Save the valid links to links.txt under the dated directory
    def saveLinks(links):
        dirname=os.path.dirname(os.path.abspath(__file__))
        links_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(links_path):
                os.mkdir(links_path)
            links_file_path=os.path.join(links_path,"links.txt")
            print "Links saved to:",links_file_path
            with open(links_file_path,"w") as fp:
                fp.writelines([link+"\n" for link in links])  # one link per line
        except Exception as e:
            print e

    # Resolve the browser type and driver path from the config file
    def getBrowserAndDriver(browser_type):
        try:
            configFilePath=os.path.join(os.path.dirname(os.path.abspath(__file__)),"browserAndDriver.ini")
            print "Browser driver config file:",configFilePath
            cf=ConfigParser.ConfigParser()
            cf.read(configFilePath)
            browser_type=browser_type.strip().lower()
            driver_path=cf.get("browser_driver",browser_type).strip()
            print "Browser: %s, driver path: %s"%(browser_type,driver_path)
            return browser_type,driver_path
        except Exception as e:
            print e

    # Open one link in a fresh browser, save a screenshot, then quit the browser
    def openLinkAndSaveImg(browser_driver_link_index_imgspath):
        try:
            browser,driverpath,link,index,imgspath=browser_driver_link_index_imgspath
            # driverpath is a quoted string literal from the .ini, so the built
            # expression, e.g. webdriver.Ie(executable_path="..."), evals cleanly
            command="webdriver."+browser.capitalize()+"(executable_path="+driverpath+")"
            driver=eval(command)
            driver.get(link)
            driver.maximize_window()
            driver.get_screenshot_as_file(os.path.join(imgspath,str(index+1)+".png"))
            driver.quit()
        except Exception as e:
            print e

    if __name__=="__main__":
        imgs_path=createImagesPath()
        weburls=getWebUrls(os.path.join(os.path.dirname(os.path.abspath(__file__)),"urls.txt"))
        p=Pool(5)
        start_time1=time.time()
        links_list=p.map(getLinks,weburls)
        end_time1=time.time()
        links=[]
        for link_list in links_list:
            links+=link_list
        saveLinks(links)
        print "Link count: %s; collecting all links took: %s"%(len(links),end_time1-start_time1)
        browser,driver=getBrowserAndDriver("ie")
        browser_driver_link_index_imgspath=zip([browser]*len(links),[driver]*len(links),links,range(len(links)),[imgs_path]*len(links))
        start_time2=time.time()
        p.map(openLinkAndSaveImg,browser_driver_link_index_imgspath)
        p.close()
        p.join()
        print "Thread pool: opening all links and taking screenshots took:",time.time()-start_time2

    ###### Multi-threading (threading + Queue) ######

    #coding=utf-8
    import requests
    import re,os,time,ConfigParser
    from selenium import webdriver
    from multiprocessing.dummy import Pool
    import Queue
    import threading

    # Create the directory where screenshots will be saved (named after today's date)
    def createImagesPath():
        dirname=os.path.dirname(os.path.abspath(__file__))
        images_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(images_path):
                os.mkdir(images_path)
            print "Screenshot directory:",images_path
            return images_path
        except Exception as e:
            print e

    # Read the site URLs to crawl from a file (one URL per line)
    def getWebUrls(web_urls_file_path):
        web_urls=[]
        try:
            with open(web_urls_file_path) as fp:
                lines=fp.readlines()
            for line in lines:
                if line.strip():
                    web_urls.append(line.strip())
            return web_urls
        except Exception as e:
            print e

    # Collect all valid links from a single page
    def getLinks(web_url):
        try:
            response=requests.get(web_url)
            html=response.text
            links=re.findall(r'href="(.*?)"',html)
            valid_links=[]
            invalid_links=[]
            for link in links:
                link=link.strip()
                if link.startswith("//"):
                    # protocol-relative URL: prepend a scheme
                    valid_links.append("http:"+link)
                elif link=="" or link=="#" or link=="/" or "javascript" in link or "mailto:" in link:
                    invalid_links.append(link)
                # skip static resources and single-slash site-relative paths
                elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$",link) or re.match(r'/[^/].*',link):
                    invalid_links.append(link)
                else:
                    valid_links.append(link)
            valid_links=list(set(valid_links))  # de-duplicate
            return valid_links
        except Exception as e:
            print e
            return []  # callers concatenate results, so never return None

    # Save the valid links to links.txt under the dated directory
    def saveLinks(links):
        dirname=os.path.dirname(os.path.abspath(__file__))
        links_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(links_path):
                os.mkdir(links_path)
            links_file_path=os.path.join(links_path,"links.txt")
            print "Links saved to:",links_file_path
            with open(links_file_path,"w") as fp:
                fp.writelines([link+"\n" for link in links])  # one link per line
        except Exception as e:
            print e

    # Worker thread: owns one browser instance and consumes links from a shared queue
    class MyThread(threading.Thread):
        def __init__(self,browser,queue):
            threading.Thread.__init__(self)
            self.queue=queue
            try:
                configFilePath=os.path.join(os.path.dirname(os.path.abspath(__file__)),"browserAndDriver.ini")
                cf=ConfigParser.ConfigParser()
                cf.read(configFilePath)
                browser_type=browser.strip().lower()
                # the .ini stores the path as a quoted string literal, hence the eval() below
                driver_path=cf.get("browser_driver",browser_type).strip()
                if browser_type=="ie":
                    self.driver=webdriver.Ie(executable_path=eval(driver_path))
                elif browser_type=="chrome":
                    self.driver=webdriver.Chrome(executable_path=eval(driver_path))
                elif browser_type=="firefox":
                    self.driver=webdriver.Firefox(executable_path=eval(driver_path))
                else:
                    print "invalid browser!"
            except Exception as e:
                print e

        def run(self):
            print "Starting "+self.name
            openLinkAndSaveImg(self.driver,self.queue)
            self.driver.quit()

    # Worker loop: pull links off the shared queue, open each, save a screenshot
    def openLinkAndSaveImg(driver,queue):
        while True:
            # check and get under the same lock so two threads cannot both pass
            # the empty() test and then block on get() for one remaining item
            queueLock.acquire()
            if queue.empty():
                queueLock.release()
                break
            link_index_imgspath=queue.get()
            queueLock.release()
            try:
                link,index,imgspath=link_index_imgspath
                driver.get(link)
                driver.maximize_window()
                driver.get_screenshot_as_file(os.path.join(imgspath,str(index+1)+".png"))
            except Exception as e:
                print e

    if __name__=="__main__":
        # Multi-threading
        imgs_path=createImagesPath()
        weburls=getWebUrls(os.path.join(os.path.dirname(os.path.abspath(__file__)),"urls.txt"))
        p=Pool(5)
        start_time1=time.time()
        links_list=p.map(getLinks,weburls)
        p.close()
        p.join()
        end_time1=time.time()
        links=[]
        for link_list in links_list:
            links+=link_list
        saveLinks(links)
        print "Link count: %s; collecting all links took: %s"%(len(links),end_time1-start_time1)

        link_index_imgspath=zip(links,range(len(links)),[imgs_path]*len(links))

        queueLock=threading.Lock()
        threads=[]
        link_index_imgspath_Queue=Queue.Queue(len(links))
        for element in link_index_imgspath:
            link_index_imgspath_Queue.put(element)

        start_time2=time.time()

        # five worker threads, each with its own IE instance
        for i in range(5):
            thread=MyThread("ie",link_index_imgspath_Queue)
            thread.start()
            threads.append(thread)

        for t in threads:
            t.join()

        print "Multi-threading: opening all links and taking screenshots took:",time.time()-start_time2

        print "end!"

  • Original article: https://www.cnblogs.com/reyinever/p/9250455.html