# Written for fun a long time ago; recently dug it out and it still works. Will optimize it sometime later.
# Scrape emoticon images from doutula.com:
#   open a list page -> collect the href of every <a class="list-group-item"> ->
#   open each article url -> locate the <img> elements -> download each image
#   to a local folder via urllib.
import time
import os
import re
import urllib.request
import uuid
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

# Local folder the downloaded images are written to.
DOWNLOAD_DIR = r"C:\Downloads"  # was "C:\Downloads": \D is an invalid escape


def generateFileName():
    """Return a unique file-name string (UUID1-based)."""
    return str(uuid.uuid1())


def createFileWithFileName(localPathParam, fileName):
    """Create an empty file *fileName* under *localPathParam* if it does not
    already exist, and return its full path.
    """
    # BUG FIX: the original built the path with +'\'+, where the backslash
    # escaped the closing quote (a syntax error). os.path.join is correct
    # and portable.
    totalPath = os.path.join(localPathParam, fileName)
    if not os.path.exists(totalPath):
        # 'a+' creates the file without truncating an existing one;
        # `with` guarantees the handle is closed (original leaked it on error).
        with open(totalPath, 'a+'):
            pass
    return totalPath


def getAndSaveImg(imgUrl, img_name):
    """Download *imgUrl* into DOWNLOAD_DIR, named after *img_name* + '.jpg'.

    Characters that are illegal in Windows file names are replaced with '-'.
    A failed download is reported and skipped so the crawl continues
    (best-effort, as in the original).
    """
    if not imgUrl:
        return
    fileName = re.sub(r'[/:*?"<>|]', '-', img_name + '.jpg')
    try:
        urllib.request.urlretrieve(imgUrl,
                                   createFileWithFileName(DOWNLOAD_DIR, fileName))
    except Exception:  # was a bare `except:`; keep best-effort but don't
        print("这图我没法下载")  # swallow SystemExit/KeyboardInterrupt


def get_list():
    """Append the href of every .list-group-item on the current page to the
    module-level ``list_info`` list (side effect).

    NOTE(review): relies on the globals ``driver`` and ``list_info`` set up
    by the __main__ block.
    """
    for item in driver.find_elements_by_class_name("list-group-item"):
        href = item.get_attribute("href")  # renamed: `list` shadowed the builtin
        print(href)
        list_info.append(href)


if __name__ == "__main__":
    driver = webdriver.PhantomJS()
    driver.set_window_size(1400, 900)
    try:
        for m in range(28, 50):
            list_info = []
            url = "http://www.doutula.com/article/list?page=" + str(m + 1)
            driver.get(url)
            # BUG FIX: `WebDriverWait(driver, 2)` alone waits for nothing
            # without .until(); a plain sleep actually gives the page 2 s
            # to load, which is what the original comment intended.
            time.sleep(2)
            # Fills list_info with every article url on this list page.
            get_list()
            for j, article_url in enumerate(list_info):
                driver.get(article_url)
                time.sleep(2)
                url_info = driver.find_elements_by_xpath(
                    "//div[@class='artile_des']/table/tbody")
                for x, cell in enumerate(url_info):
                    # Locate the <img> once (original queried it twice).
                    img = cell.find_element_by_tag_name("img")
                    img_url = img.get_attribute("src")
                    img_name = img.get_attribute("alt")
                    print("坐标" + str(m + 1) + ":" + str(j) + ":" + str(x))
                    print(img_url + "----->" + img_name)
                    getAndSaveImg(img_url, img_name)
    finally:
        # BUG FIX: the original never quit the browser, leaking the
        # PhantomJS process on every run.
        driver.quit()