上次的代码只能抓取一个网页上的链接,本次的代码可以自主设定抓取的页面个数。
代码如下:
import os
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException


class DownloadFiles():
    """Walk a paginated announcement listing and print every PDF link found.

    The listing is opened in a PhantomJS session; on each page all ``<a>``
    elements are inspected and those whose ``href`` ends with ``pdf`` are
    printed (link text first, then the URL).  The actual download step is
    currently disabled.
    """

    # Seconds to wait after clicking "next" before reading the page again.
    PAGE_LOAD_WAIT = 5

    def __init__(self, times=7):
        """Store the start URL, the base directory and the page count.

        Args:
            times: Number of listing pages to scan.  Defaults to 7, the value
                that used to be hard-coded.
        """
        self.url = 'http://www.neeq.com.cn/disclosure/announcement.html'
        # Absolute path, so that a later os.chdir() cannot change what the
        # base directory resolves to.
        self.basePath = os.path.dirname(os.path.abspath(__file__))
        self.times = times  # number of pages to scan

    def makedir(self, name):
        """Ensure the directory ``name`` exists under ``basePath`` and enter it.

        Prints whether the directory had to be created, then makes it the
        current working directory.

        Args:
            name: Directory name, joined onto ``self.basePath``.
        """
        path = os.path.join(self.basePath, name)
        if os.path.exists(path):
            print('The file is existed.')
        else:
            # exist_ok guards against the directory appearing between the
            # existence check and the creation.
            os.makedirs(path, exist_ok=True)
            print('File has been created.')
        # Switch into the directory.
        os.chdir(path)

    def connect(self, url):
        """Open ``url`` in a new PhantomJS session and return the driver.

        The caller owns the returned driver and is responsible for quitting it.

        Args:
            url: Address of the page to load.

        Returns:
            The driver object with the page loaded.
        """
        # NOTE(review): newer Selenium releases appear to have dropped
        # PhantomJS support -- confirm against the installed version.
        driver = webdriver.PhantomJS()
        driver.get(url)
        return driver

    def nextPage(self, driver):
        """Click the element with class ``next`` and wait for the page to load.

        Args:
            driver: The driver whose current page should be advanced.
        """
        next_button = driver.find_element_by_class_name('next')
        next_button.click()
        # Pause after every click so the next page can finish loading.
        time.sleep(self.PAGE_LOAD_WAIT)

    def getFiles(self):
        """Scan ``self.times`` pages and print the text and URL of each PDF link.

        The browser session is always shut down when scanning ends, whether
        it finished normally or an error was raised.
        """
        driver = self.connect(self.url)
        try:
            self.makedir('Files')
            # Page through the listing automatically.
            for i in range(self.times):
                print('第' + str(i+1) + '页:')
                for anchor in driver.find_elements_by_tag_name('a'):
                    try:
                        link = anchor.get_attribute('href')
                        # Anchors without an href yield None; skip them
                        # instead of relying on a swallowed AttributeError.
                        if not link or not link.endswith('pdf'):
                            continue
                        title = anchor.text
                    except WebDriverException as exc:
                        # Best effort: one unreadable element must not abort
                        # the whole scan.
                        print('Skipped an unreadable link: ' + str(exc))
                        continue
                    print(title)
                    print(link)
                    # Download is currently disabled:
                    # urlretrieve(link, title + '.pdf')
                # No need to click "next" (and wait) after the final page.
                if i < self.times - 1:
                    self.nextPage(driver=driver)
        finally:
            # Release the browser process even when scanning fails.
            driver.quit()


if __name__ == '__main__':
    obj = DownloadFiles()
    obj.getFiles()