zoukankan      html  css  js  c++  java
  • selenium爬取煎蛋网

    selenium爬取煎蛋网

    直接上代码

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions  as ES
    import requests
    import urllib.request
    import os
    from lxml import etree
    t = 0   # Legacy module-level counter; Custer no longer uses it, the name is kept for compatibility.
    class Custer(object):
        """Crawl http://jandan.net/ooxx with Selenium and save every image found.

        The crawler opens the first page in Chrome, downloads the images on it,
        then follows the pager link titled "Older Comments" until that link is
        no longer offered.
        """

        # NOTE(review): path separators restored (the published snippet had lost
        # its backslashes) -- confirm this matches the local chromedriver location.
        driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
        # Directory (relative to the working directory) that receives the images.
        image_dir = "images"
        # Last link of the pager; it leads to the next page while its title
        # contains "Older Comments".
        pager_xpath = "//div[@class='cp-pagenavi']/a[last()]"
        # Any image inside a comment body.
        image_xpath = "//div[@class='text']//img"

        def __init__(self):
            self.driver = webdriver.Chrome(executable_path=self.driver_path)
            self.url = "http://jandan.net/ooxx"
            # File names already handed out, so later pages never overwrite
            # images saved from earlier ones.
            self._used_names = set()

        def run(self):
            """Walk through all pages, saving the images of each one.

            The browser is always closed when the crawl ends, so ``run`` can
            be called only once per instance.
            """
            # Imported locally: selenium is already a dependency of this file,
            # but these names are not part of its top-level imports.
            from selenium.common.exceptions import TimeoutException, WebDriverException

            try:
                self.driver.get(self.url)
                while True:
                    # Wait for the images BEFORE taking the snapshot; waiting
                    # afterwards cannot affect the HTML that gets parsed.
                    try:
                        WebDriverWait(self.driver, 10).until(
                            ES.presence_of_element_located((By.XPATH, self.image_xpath))
                        )
                    except TimeoutException:
                        pass  # page without images: parse whatever is there
                    self.xqy(etree.HTML(self.driver.page_source))

                    try:
                        Btn = WebDriverWait(self.driver, 10).until(
                            ES.presence_of_element_located((By.XPATH, self.pager_xpath))
                        )
                        # get_attribute returns None when the attribute is absent.
                        if "Older Comments" not in (Btn.get_attribute("title") or ""):
                            break
                        Btn.click()
                    except WebDriverException:
                        # Stop instead of retrying forever on the same page.
                        print("出现异常")
                        break
            finally:
                self.driver.quit()

        def xqy(self, html):
            """Download every image referenced by one parsed page.

            Authors and images are paired positionally, as before; each image
            is stored as ``<author><extension>`` inside ``image_dir`` with a
            numeric suffix added whenever that name was already used.

            Args:
                html: lxml tree of the page, as returned by ``etree.HTML``.
            """
            from urllib.parse import urlsplit

            all_author = html.xpath("//div[@class='author']/strong/text()")
            all_img = html.xpath("//div[@class='text']//img//@src")

            os.makedirs(self.image_dir, exist_ok=True)
            for author, src in zip(all_author, all_img):
                link = self._absolute_url(self.url, src)
                # Take the extension from the URL path only, so a query string
                # does not end up in the file name.
                hz = os.path.splitext(urlsplit(link).path)[1]
                filename = self._unique_filename(author, hz, self._used_names)
                try:
                    urllib.request.urlretrieve(link, os.path.join(self.image_dir, filename))
                except (OSError, ValueError) as err:
                    # One broken image must not abort the whole crawl.
                    print("download failed:", link, err)

        @staticmethod
        def _absolute_url(base, src):
            """Resolve ``src`` (absolute, scheme-relative or relative) against ``base``."""
            from urllib.parse import urljoin

            return urljoin(base, src.strip())

        @staticmethod
        def _unique_filename(author, ext, used):
            """Return a file name for ``author`` + ``ext`` not yet present in ``used``.

            Characters that are path separators or are not allowed in Windows
            file names are replaced by ``_``; the chosen name is added to ``used``.
            """
            safe = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in author).strip() or "image"
            candidate = safe + ext
            suffix = 0
            while candidate in used:
                suffix += 1
                candidate = safe + str(suffix) + ext
            used.add(candidate)
            return candidate
    
    def main():
        """Build a Custer crawler and run it."""
        crawler = Custer()
        crawler.run()


    if __name__ == '__main__':
        main()

    爬取的图片

    进阶

    我加了多线程，但不确定是否真正实现了多线程爬取；感觉爬取速度快了很多。

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions  as ES
    import requests
    import threading
    import urllib.request
    import os
    from lxml import etree
    t = 0   # Legacy module-level counter; Custer no longer uses it, the name is kept for compatibility.
    gCondition = threading.Condition()      # serialises every access to the shared browser
    class Custer(threading.Thread):
        """Worker thread that crawls http://jandan.net/ooxx through one shared browser.

        All workers share a single Chrome instance. While holding ``gCondition``
        a worker advances the browser by one page and takes a snapshot of its
        HTML; it then releases the lock and downloads that page's images, so
        downloads of different pages overlap while the browser itself is only
        ever driven by one thread at a time.
        """

        # NOTE(review): path separators restored (the published snippet had lost
        # its backslashes) -- confirm this matches the local chromedriver location.
        driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
        url = "http://jandan.net/ooxx"
        # Directory (relative to the working directory) that receives the images.
        image_dir = "images"
        # Last link of the pager; it leads to the next page while its title
        # contains "Older Comments".
        pager_xpath = "//div[@class='cp-pagenavi']/a[last()]"
        # Any image inside a comment body.
        image_xpath = "//div[@class='text']//img"

        # Shared browser. Started lazily by the first worker instead of at
        # class-definition time, so importing the module opens no window.
        driver = None
        # Set once the last page was handed out or the browser failed.
        _finished = False
        # File names already handed out (shared by all workers) and its guard.
        _used_names = set()
        _names_lock = threading.Lock()

        def run(self):
            """Repeatedly claim the next page and download its images."""
            # Imported locally: selenium is already a dependency of this file,
            # but this name is not part of its top-level imports.
            from selenium.common.exceptions import WebDriverException

            while True:
                # ``with`` releases the lock on every exit path; the manual
                # acquire/release left it held on ``break`` and on errors.
                with gCondition:
                    if Custer._finished:
                        return
                    try:
                        source = self._load_next_page()
                    except WebDriverException:
                        print("出现异常")
                        source = None
                    if source is None:
                        Custer._finished = True
                        self._close_browser()
                        return
                # Parsing and downloading do not touch the browser, so they
                # run outside the lock.
                self.xqy(etree.HTML(source))

        def _load_next_page(self):
            """Advance the shared browser one page and return its HTML.

            Must be called while holding ``gCondition``. Returns ``None`` when
            the pager no longer offers an "Older Comments" link.
            """
            from selenium.common.exceptions import TimeoutException

            if Custer.driver is None:
                Custer.driver = webdriver.Chrome(executable_path=self.driver_path)
                Custer.driver.get(self.url)
            else:
                Btn = WebDriverWait(Custer.driver, 10).until(
                    ES.presence_of_element_located((By.XPATH, self.pager_xpath))
                )
                # get_attribute returns None when the attribute is absent.
                if "Older Comments" not in (Btn.get_attribute("title") or ""):
                    return None
                Btn.click()

            # Wait for the images before taking the snapshot.
            try:
                WebDriverWait(Custer.driver, 10).until(
                    ES.presence_of_element_located((By.XPATH, self.image_xpath))
                )
            except TimeoutException:
                pass  # page without images: return whatever is there
            return Custer.driver.page_source

        @staticmethod
        def _close_browser():
            """Quit the shared browser if it was started."""
            if Custer.driver is not None:
                try:
                    Custer.driver.quit()
                finally:
                    Custer.driver = None

        def xqy(self, html):
            """Download every image referenced by one parsed page.

            Authors and images are paired positionally, as before; each image
            is stored as ``<author><extension>`` inside ``image_dir`` with a
            numeric suffix added whenever that name was already used.

            Args:
                html: lxml tree of the page, as returned by ``etree.HTML``.
            """
            from urllib.parse import urlsplit

            all_author = html.xpath("//div[@class='author']/strong/text()")
            all_img = html.xpath("//div[@class='text']//img//@src")

            os.makedirs(self.image_dir, exist_ok=True)
            for author, src in zip(all_author, all_img):
                link = self._absolute_url(self.url, src)
                # Take the extension from the URL path only, so a query string
                # does not end up in the file name.
                hz = os.path.splitext(urlsplit(link).path)[1]
                # Reserve the name under a lock: several workers may be
                # saving images by the same author at the same time.
                with self._names_lock:
                    filename = self._unique_filename(author, hz, self._used_names)
                try:
                    urllib.request.urlretrieve(link, os.path.join(self.image_dir, filename))
                except (OSError, ValueError) as err:
                    # One broken image must not kill the worker.
                    print("download failed:", link, err)

        @staticmethod
        def _absolute_url(base, src):
            """Resolve ``src`` (absolute, scheme-relative or relative) against ``base``."""
            from urllib.parse import urljoin

            return urljoin(base, src.strip())

        @staticmethod
        def _unique_filename(author, ext, used):
            """Return a file name for ``author`` + ``ext`` not yet present in ``used``.

            Characters that are path separators or are not allowed in Windows
            file names are replaced by ``_``; the chosen name is added to ``used``.
            """
            safe = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in author).strip() or "image"
            candidate = safe + ext
            suffix = 0
            while candidate in used:
                suffix += 1
                candidate = safe + str(suffix) + ext
            used.add(candidate)
            return candidate
    
    def main():
        """Create and start nine Custer threads, one after another."""
        worker_count = 9
        for _ in range(worker_count):
            Custer().start()


    if __name__ == '__main__':
        main()

     

  • 相关阅读:
    为什么要有binary-to-text encoding?
    海量网络存储系统原理与设计(三)
    Java中的Inner Class (一)
    海量网络存储系统原理与设计(二)
    海量网络存储系统原理与设计(一)
    [JavaScript]顺序的异步执行
    [PAT]素因子分解(20)
    [PAT]求集合数据的均方差(15)
    [PAT]数列求和(20)
    【C-001】printf理解
  • 原文地址:https://www.cnblogs.com/c-pyday/p/10609812.html
Copyright © 2011-2022 走看看