使用 Selenium 爬取煎蛋网
直接上代码
# Crawl http://jandan.net/ooxx with Selenium and save every posted image
# under images/, one page at a time, following the "Older Comments" link.
import os
import re
import urllib.parse
import urllib.request

import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ES
from selenium.webdriver.support.ui import WebDriverWait

# Running counter appended to an author name that is already taken.
t = 0

# Characters that must not end up in a file name (path separators, Windows
# reserved characters, control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class Custer(object):
    """Single-browser crawler that downloads the images of every comment page."""

    # NOTE(review): the path separators appear to have been lost when this
    # script was pasted; they are restored here -- confirm against the local
    # chromedriver location.
    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    # Directory the images are written to (relative to the working directory).
    image_dir = "images"

    def __init__(self):
        # NOTE: `executable_path` is the Selenium 3 spelling; newer Selenium
        # releases expect a Service object instead.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        self.url = "http://jandan.net/ooxx"
        # File stems already used in this crawl, so that a later page never
        # overwrites an image saved from an earlier one.
        self._used_names = set()

    def run(self):
        """Open the start page and process pages until no older page exists."""
        self.driver.get(self.url)
        while True:
            # Wait *before* taking the snapshot: waiting after page_source
            # has been parsed cannot change what was parsed.
            self._wait_for_images()
            html = etree.HTML(self.driver.page_source)
            self.xqy(html)
            if not self._go_to_older_page():
                break

    def _wait_for_images(self):
        """Give the page up to 10 s to show at least one comment image."""
        try:
            WebDriverWait(self.driver, 10).until(
                ES.presence_of_element_located((By.XPATH, "//div[@class='text']//img"))
            )
        except TimeoutException:
            # A page without images is not fatal; parse whatever is there.
            pass

    def _go_to_older_page(self):
        """Click the "Older Comments" link.

        Returns True when the click was issued, False when there is no such
        link or the browser reported an error (the crawl then stops instead
        of retrying the same page forever).
        """
        try:
            btn = WebDriverWait(self.driver, 10).until(
                ES.presence_of_element_located(
                    (By.XPATH, "//div[@class='cp-pagenavi']/a[last()]")
                )
            )
            # get_attribute returns None when the attribute is missing.
            title = btn.get_attribute("title") or ""
            if "Older Comments" not in title:
                return False
            btn.click()
        except WebDriverException as exc:
            # TimeoutException / NoSuchElementException are subclasses.
            print("出现异常", exc)
            return False
        return True

    def _unique_name(self, author):
        """Return a file-system-safe stem for *author* not used before."""
        global t
        base = _UNSAFE_FILENAME_CHARS.sub("_", author).strip() or "_"
        name = base
        # Without this, two comments by the same author would map to the
        # same file and all but the last image would be lost.
        while name in self._used_names:
            t += 1
            name = base + str(t)
        self._used_names.add(name)
        return name

    def xqy(self, html):
        """Download the images of one parsed page.

        html: lxml element tree of the page (None is tolerated and ignored).
        Files are written to `image_dir` as <author><extension>.
        """
        if html is None:
            return
        # The XPaths are absolute, so they are evaluated on the document
        # directly instead of on an arbitrary row element.
        authors = html.xpath("//div[@class='author']/strong/text()")  # author list
        sources = html.xpath("//div[@class='text']//img//@src")  # image list
        os.makedirs(self.image_dir, exist_ok=True)
        # NOTE(review): authors and images are paired by document order, which
        # assumes exactly one image per comment -- confirm against the page.
        for author, src in zip(authors, sources):
            # Resolves protocol-relative ("//host/x.jpg") and relative URLs
            # against the start page, and leaves absolute URLs (including
            # https) untouched.
            img_url = urllib.parse.urljoin(self.url, src)
            # Take the extension from the URL path only, not the query string.
            suffix = os.path.splitext(urllib.parse.urlparse(img_url).path)[1]
            target = os.path.join(self.image_dir, self._unique_name(author) + suffix)
            try:
                urllib.request.urlretrieve(img_url, target)
            except (OSError, ValueError) as exc:
                # One broken image must not abort the whole crawl.
                print("出现异常", img_url, exc)


def main():
    """Run one crawl and always close the browser afterwards."""
    rea = Custer()
    try:
        rea.run()
    finally:
        rea.driver.quit()


if __name__ == '__main__':
    main()
爬取的图片
进阶
我在上面代码的基础上加了多线程,虽然不确定这样写是否真正实现了多线程爬取,但感觉爬取速度快了很多。
# Threaded variant: one crawler thread drives the browser through the comment
# pages of http://jandan.net/ooxx while a pool of worker threads downloads the
# images it finds.
import os
import re
import threading
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ES
from selenium.webdriver.support.ui import WebDriverWait

# Running counter appended to an author name that is already taken.
t = 0
# Guards `t` and `_used_names`, which are shared by every Custer thread.
gCondition = threading.Condition()
# File stems already handed out in this process.
_used_names = set()

# Characters that must not end up in a file name (path separators, Windows
# reserved characters, control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class Custer(threading.Thread):
    """Crawler thread: owns one browser and fans downloads out to a pool.

    A WebDriver instance must not be driven from several threads at once, so
    only this thread touches the browser; the parallelism comes from the
    download workers.
    """

    # NOTE(review): the path separators appear to have been lost when this
    # script was pasted; they are restored here -- confirm against the local
    # chromedriver location.
    driver_path = r"D:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
    # Created in run(); creating it here would launch a browser at import
    # time and share it between every instance.
    driver = None
    url = "http://jandan.net/ooxx"
    # Directory the images are written to (relative to the working directory).
    image_dir = "images"
    # Number of concurrent download threads.
    download_workers = 9

    def __init__(self, *args, **kwargs):
        # Accept and forward the standard threading.Thread arguments.
        super().__init__(*args, **kwargs)
        self._pool = None

    def run(self):
        """Crawl every page, then wait for outstanding downloads and quit."""
        # NOTE: `executable_path` is the Selenium 3 spelling; newer Selenium
        # releases expect a Service object instead.
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
        try:
            # Leaving the `with` block waits for all queued downloads.
            with ThreadPoolExecutor(max_workers=self.download_workers) as pool:
                self._pool = pool
                self.driver.get(self.url)
                while True:
                    # Wait *before* taking the snapshot: waiting after
                    # page_source has been parsed cannot change the result.
                    self._wait_for_images()
                    html = etree.HTML(self.driver.page_source)
                    self.xqy(html)
                    if not self._go_to_older_page():
                        break
        finally:
            self._pool = None
            self.driver.quit()

    def _wait_for_images(self):
        """Give the page up to 10 s to show at least one comment image."""
        try:
            WebDriverWait(self.driver, 10).until(
                ES.presence_of_element_located((By.XPATH, "//div[@class='text']//img"))
            )
        except TimeoutException:
            # A page without images is not fatal; parse whatever is there.
            pass

    def _go_to_older_page(self):
        """Click the "Older Comments" link.

        Returns True when the click was issued, False when there is no such
        link or the browser reported an error (the crawl then stops instead
        of retrying the same page forever).
        """
        try:
            btn = WebDriverWait(self.driver, 10).until(
                ES.presence_of_element_located(
                    (By.XPATH, "//div[@class='cp-pagenavi']/a[last()]")
                )
            )
            # get_attribute returns None when the attribute is missing.
            title = btn.get_attribute("title") or ""
            if "Older Comments" not in title:
                return False
            btn.click()
        except WebDriverException as exc:
            # TimeoutException / NoSuchElementException are subclasses.
            print("出现异常", exc)
            return False
        return True

    @staticmethod
    def _unique_name(author):
        """Return a file-system-safe stem for *author* not used before."""
        global t
        base = _UNSAFE_FILENAME_CHARS.sub("_", author).strip() or "_"
        # `with` releases the lock on every path, including exceptions.
        with gCondition:
            name = base
            # Without this, two comments by the same author would map to the
            # same file and all but the last image would be lost.
            while name in _used_names:
                t += 1
                name = base + str(t)
            _used_names.add(name)
        return name

    @staticmethod
    def _download(img_url, target):
        """Fetch one image to *target*; report failures instead of raising."""
        try:
            urllib.request.urlretrieve(img_url, target)
        except (OSError, ValueError) as exc:
            # One broken image must not abort the whole crawl.
            print("出现异常", img_url, exc)

    def xqy(self, html):
        """Queue the images of one parsed page for download.

        html: lxml element tree of the page (None is tolerated and ignored).
        Files are written to `image_dir` as <author><extension>. When called
        outside run() (no pool available) the images are fetched inline.
        """
        if html is None:
            return
        # The XPaths are absolute, so they are evaluated on the document
        # directly instead of on an arbitrary row element.
        authors = html.xpath("//div[@class='author']/strong/text()")  # author list
        sources = html.xpath("//div[@class='text']//img//@src")  # image list
        os.makedirs(self.image_dir, exist_ok=True)
        # NOTE(review): authors and images are paired by document order, which
        # assumes exactly one image per comment -- confirm against the page.
        for author, src in zip(authors, sources):
            # Resolves protocol-relative ("//host/x.jpg") and relative URLs
            # against the start page, and leaves absolute URLs (including
            # https) untouched.
            img_url = urllib.parse.urljoin(self.url, src)
            # Take the extension from the URL path only, not the query string.
            suffix = os.path.splitext(urllib.parse.urlparse(img_url).path)[1]
            target = os.path.join(self.image_dir, self._unique_name(author) + suffix)
            if self._pool is None:
                self._download(img_url, target)
            else:
                self._pool.submit(self._download, img_url, target)


def main():
    """Start the crawler thread and wait for it to finish.

    A single crawler is started: several crawlers would all walk the same
    pages and download the same files. The concurrency lives in the
    crawler's download pool (`Custer.download_workers` threads).
    """
    rea = Custer()
    rea.start()
    rea.join()


if __name__ == '__main__':
    main()