  • Crawl all the links of one or more websites with a single process, with a thread pool, and with multiple threads; open every link in a browser and save a screenshot (Python)
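
    Both scripts below expect two input files in the script's own directory. Their contents are not shown in the original, so the following is an assumed sketch: urls.txt holds one site URL per line, and browserAndDriver.ini stores each driver path as a quoted string under a [browser_driver] section (the quoting is why the code calls eval() on the value it reads; the paths here are hypothetical examples).

    ;browserAndDriver.ini (assumed example)
    [browser_driver]
    ie="C:\\drivers\\IEDriverServer.exe"
    chrome="C:\\drivers\\chromedriver.exe"
    firefox="C:\\drivers\\geckodriver.exe"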

    #coding=utf-8
    import requests
    import re,os,time,ConfigParser
    from selenium import webdriver
    from multiprocessing.dummy import Pool

    ###### Single process ######

    #Create the directory for saving screenshots (./<YYYYMMDD>/)

    def createImagesPath():
        dirname=os.path.dirname(os.path.abspath(__file__))
        images_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(images_path):
                os.mkdir(images_path)
            print "Screenshot directory:",images_path
            return images_path
        except Exception,e:
            print e

    #Read the list of site URLs to crawl from a file
    def getWebUrls(web_urls_file_path):
        web_urls=[]
        try:
            with open(web_urls_file_path) as fp:  
                lines=fp.readlines() 
            for line in lines: 
                if line.strip():
                    web_urls.append(line.strip()) 
            return web_urls 
        except Exception,e:
            print e

    #Collect all valid links from a single page
    def getLinks(web_url):
        try:
            response=requests.get(web_url) 
            #print response
            html=response.text
            links=re.findall(r'href="(.*?)"',html)
            valid_links=[]
            invalid_links=[]
            for link in links:
                link=link.strip()
                if link.startswith("//"):
                    #protocol-relative link: prepend a scheme
                    valid_links.append("http:"+link)
                elif link in ("","#","/") or "javascript" in link or "mailto:" in link:
                    invalid_links.append(link)
                elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$",link) or re.match(r"/[^/]",link):
                    #static resources and site-relative paths are skipped
                    invalid_links.append(link)
                else:
                    valid_links.append(link)
            valid_links=list(set(valid_links))  #dedupe
            return valid_links
        except Exception,e:
            print e

    #Save the valid links to a .txt file
    def saveLinks(links):
        dirname=os.path.dirname(os.path.abspath(__file__))
        links_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(links_path):
                os.mkdir(links_path)
            links_file_path=os.path.join(links_path,"links.txt")
            print "Links saved to:",links_file_path
            with open(links_file_path,"w") as fp:
                fp.writelines([link+"\n" for link in links])  #one link per line
        except Exception,e:
            print e

    #Open links in a browser (via Selenium) and save screenshots
    class OpenLinkAndSaveImg(object):
        def __init__(self,browser_type):
            try:
                configFilePath=os.path.join(os.path.dirname(os.path.abspath(__file__)),"browserAndDriver.ini")
                print "Driver config file:",configFilePath
                cf=ConfigParser.ConfigParser()
                cf.read(configFilePath)
                browser_type=browser_type.strip().lower()
                driver_path=cf.get("browser_driver",browser_type).strip()
                print "Browser: %s, driver path: %s"%(browser_type,driver_path)
                #the ini value is a quoted string, so eval() yields the path itself
                if browser_type=="ie":
                    self.driver=webdriver.Ie(executable_path=eval(driver_path))
                elif browser_type=="chrome":
                    self.driver=webdriver.Chrome(executable_path=eval(driver_path))
                elif browser_type=="firefox":
                    self.driver=webdriver.Firefox(executable_path=eval(driver_path))
                else:
                    print "invalid browser!"
            except Exception,e:
                print e
       
        #Open one link and save a numbered screenshot
        def openLinkAndSaveImg(self,link_index_imgspath): 
            try:
                link,index,imgspath=link_index_imgspath   
                self.driver.get(link)   
                self.driver.maximize_window()   
                self.driver.get_screenshot_as_file(os.path.join(imgspath,str(index+1)+".png"))
            except Exception,e:
                print e
     
        def end(self):
            self.driver.quit()


    if __name__=="__main__":

        #single process: one WebDriver instance opens every link in turn
        imgs_path=createImagesPath()
        #weburls=getWebUrls(r"e:\urls.txt")
        weburls=getWebUrls(os.path.join(os.path.dirname(os.path.abspath(__file__)),"urls.txt"))
        links=[]    
        start_time=time.time()
        for weburl in weburls:
            links+=getLinks(weburl)
        print u"链接数:%s ;获取所有链接耗时:%s"%(len(links),time.time()-start_time)
        saveLinks(links)
        start_time1=time.time()
        open_link_and_save_img=OpenLinkAndSaveImg("ie")
        for i in range(len(links)):
            open_link_and_save_img.openLinkAndSaveImg((links[i],i,imgs_path))
        open_link_and_save_img.end()
        print u"单进程打开所有链接并截图耗时耗时:",time.time()-start_time1
       

    ###### Thread pool (concurrent execution with multiprocessing.dummy) ######


    #coding=utf-8
    import requests
    import re,os,time,ConfigParser
    from selenium import webdriver
    from multiprocessing.dummy import Pool

    #Create the directory for saving screenshots (./<YYYYMMDD>/)
    def createImagesPath():
        dirname=os.path.dirname(os.path.abspath(__file__))
        images_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(images_path):
                os.mkdir(images_path)
            print "Screenshot directory:",images_path
            return images_path
        except Exception,e:
            print e

    #Read the list of site URLs to crawl from a file
    def getWebUrls(web_urls_file_path):
        web_urls=[]
        try:
            with open(web_urls_file_path) as fp:
                lines=fp.readlines() 
            for line in lines: 
                if line.strip():
                    web_urls.append(line.strip())  
            return web_urls 
        except Exception,e:
            print e

    #Collect all valid links from a single page
    def getLinks(web_url):
        try:
            response=requests.get(web_url) 
            #print response
            html=response.text
            links=re.findall(r'href="(.*?)"',html)
            valid_links=[]
            invalid_links=[]
            for link in links:
                link=link.strip()
                if link.startswith("//"):
                    #protocol-relative link: prepend a scheme
                    valid_links.append("http:"+link)
                elif link in ("","#","/") or "javascript" in link or "mailto:" in link:
                    invalid_links.append(link)
                elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$",link) or re.match(r"/[^/]",link):
                    #static resources and site-relative paths are skipped
                    invalid_links.append(link)
                else:
                    valid_links.append(link)
            valid_links=list(set(valid_links))  #dedupe
            return valid_links
        except Exception,e:
            print e

    #Save the valid links to a .txt file
    def saveLinks(links):
        dirname=os.path.dirname(os.path.abspath(__file__))
        links_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(links_path):
                os.mkdir(links_path)
            links_file_path=os.path.join(links_path,"links.txt")
            print "Links saved to:",links_file_path
            with open(links_file_path,"w") as fp:
                fp.writelines([link+"\n" for link in links])  #one link per line
        except Exception,e:
            print e

    #Resolve the browser type and driver path from the config file
    def getBrowserAndDriver(browser_type):
        try:
            configFilePath=os.path.join(os.path.dirname(os.path.abspath(__file__)),"browserAndDriver.ini")
            print "Driver config file:",configFilePath
            cf=ConfigParser.ConfigParser()
            cf.read(configFilePath)
            browser_type=browser_type.strip().lower()
            driver_path=cf.get("browser_driver",browser_type).strip()
            print "Browser: %s, driver path: %s"%(browser_type,driver_path)
            return browser_type,driver_path
        except Exception,e:
            print e

    #Open one link in a fresh browser instance and save a numbered screenshot
    def openLinkAndSaveImg(browser_driver_link_index_imgspath):
        try:
            browser,driverpath,link,index,imgspath=browser_driver_link_index_imgspath
            #look up the driver class (Ie/Chrome/Firefox) by name; the ini value
            #is a quoted string, so eval() yields the path itself
            driver_class=getattr(webdriver,browser.capitalize())
            driver=driver_class(executable_path=eval(driverpath))
            driver.get(link)
            driver.maximize_window()
            driver.get_screenshot_as_file(os.path.join(imgspath,str(index+1)+".png"))
            driver.quit()
        except Exception,e:
            print e

    if __name__=="__main__":
        imgs_path=createImagesPath()
        #weburls=getWebUrls(r"e:\urls.txt")
        weburls=getWebUrls(os.path.join(os.path.dirname(os.path.abspath(__file__)),"urls.txt"))
        p=Pool(5)  #multiprocessing.dummy.Pool: the Pool API backed by threads
        start_time1=time.time()
        links_list=p.map(getLinks,weburls)
        end_time1=time.time()
        links=[]
        for link_list in links_list:
            links+=link_list
        saveLinks(links)
        print u"链接数:%s ;获取所有链接耗时:%s"%(len(links),end_time1-start_time1)
        browser,driver=getBrowserAndDriver("ie")
        #one argument tuple per link, since Pool.map passes a single iterable
        browser_driver_link_index_imgspath=zip([browser]*len(links),[driver]*len(links),links,range(len(links)),[imgs_path]*len(links))
        start_time2=time.time()
        p.map(openLinkAndSaveImg,browser_driver_link_index_imgspath)
        p.close()
        p.join()
        print u"多线程打开所有链接并截图耗时:",time.time()-start_time2

    ###### Multiple threads (threading + Queue) ######

    #coding=utf-8
    import requests
    import re,os,time,ConfigParser
    from selenium import webdriver
    from multiprocessing.dummy import Pool
    import Queue
    import threading

    #Create the directory for saving screenshots (./<YYYYMMDD>/)
    def createImagesPath():
        dirname=os.path.dirname(os.path.abspath(__file__))
        images_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(images_path):
                os.mkdir(images_path)
            print "Screenshot directory:",images_path
            return images_path
        except Exception,e:
            print e

    #Read the list of site URLs to crawl from a file
    def getWebUrls(web_urls_file_path):
        web_urls=[]
        try:
            with open(web_urls_file_path) as fp:
                lines=fp.readlines()  
            for line in lines: 
                if line.strip():
                    web_urls.append(line.strip())
            return web_urls 
        except Exception,e:
            print e

    #Collect all valid links from a single page
    def getLinks(web_url):
        try:
            response=requests.get(web_url) 
            #print response
            html=response.text
            links=re.findall(r'href="(.*?)"',html)
            valid_links=[]
            invalid_links=[]
            for link in links:
                link=link.strip()
                if link.startswith("//"):
                    #protocol-relative link: prepend a scheme
                    valid_links.append("http:"+link)
                elif link in ("","#","/") or "javascript" in link or "mailto:" in link:
                    invalid_links.append(link)
                elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$",link) or re.match(r"/[^/]",link):
                    #static resources and site-relative paths are skipped
                    invalid_links.append(link)
                else:
                    valid_links.append(link)
            valid_links=list(set(valid_links))  #dedupe
            return valid_links
        except Exception,e:
            print e

    #Save the valid links to a .txt file (./<YYYYMMDD>/links.txt)
    def saveLinks(links):
        dirname=os.path.dirname(os.path.abspath(__file__))
        links_path=os.path.join(dirname,time.strftime("%Y%m%d"))
        try:
            if not os.path.exists(links_path):
                os.mkdir(links_path)
            links_file_path=os.path.join(links_path,"links.txt")
            print "Links saved to:",links_file_path
            with open(links_file_path,"w") as fp:
                fp.writelines([link+"\n" for link in links])  #one link per line
        except Exception,e:
            print e

    #Worker thread: owns one WebDriver instance and consumes work from the shared queue
    class MyThread(threading.Thread):
        def __init__(self,browser,queue):
            threading.Thread.__init__(self)
            self.queue=queue
            try:
                configFilePath=os.path.join(os.path.dirname(os.path.abspath(__file__)),"browserAndDriver.ini")
                cf=ConfigParser.ConfigParser()
                cf.read(configFilePath)
                browser_type=browser.strip().lower()
                driver_path=cf.get("browser_driver",browser_type).strip()
                #the ini value is a quoted string, so eval() yields the path itself
                if browser_type=="ie":
                    self.driver=webdriver.Ie(executable_path=eval(driver_path))
                elif browser_type=="chrome":
                    self.driver=webdriver.Chrome(executable_path=eval(driver_path))
                elif browser_type=="firefox":
                    self.driver=webdriver.Firefox(executable_path=eval(driver_path))
                else:
                    print "invalid browser!"

            except Exception,e:
                print e

        def run(self):
            print "Starting"+self.name
            openLinkAndSaveImg(self.driver,self.queue)
            self.driver.quit()

    #Open links from the queue and save screenshots. The empty() check and the
    #get() must happen under the same lock, otherwise another thread can drain
    #the queue between the two calls and get() would block forever.
    def openLinkAndSaveImg(driver,queue):
        while True:
            queueLock.acquire()
            if queue.empty():
                queueLock.release()
                break
            link_index_imgspath=queue.get()
            queueLock.release()
            try:
                link,index,imgspath=link_index_imgspath
                driver.get(link)
                driver.maximize_window()
                driver.get_screenshot_as_file(os.path.join(imgspath,str(index+1)+".png"))
            except Exception,e:
                print e

    if __name__=="__main__":
        #multiple threads: a thread pool gathers the links, then worker threads take screenshots
        imgs_path=createImagesPath()
        #weburls=getWebUrls(r"e:\urls.txt")
        weburls=getWebUrls(os.path.join(os.path.dirname(os.path.abspath(__file__)),"urls.txt"))
        p=Pool(5)
        start_time1=time.time()
        links_list=p.map(getLinks,weburls)
        end_time1=time.time()
        links=[]
        for link_list in links_list:
            links+=link_list
        saveLinks(links)
        print u"链接数:%s ;获取所有链接耗时:%s"%(len(links),end_time1-start_time1)

        #one (link, index, imgs_path) work item per link
        link_index_imgspath=zip(links,range(len(links)),[imgs_path]*len(links))
        
        queueLock=threading.Lock()
        threads=[]
        link_index_imgspath_Queue=Queue.Queue(len(links))
        for element in link_index_imgspath:
            link_index_imgspath_Queue.put(element)

        start_time2=time.time()

        for i in range(5):  #five worker threads, each with its own browser instance
            thread=MyThread("ie",link_index_imgspath_Queue)
            thread.start()
            threads.append(thread)

        for t in threads:
            t.join()

        print u"多线程打开所有链接并截图耗时:",time.time()-start_time2

        print "end!"

  • Original article: https://www.cnblogs.com/reyinever/p/9250455.html