环境:
selenium-java 3.9.1
firefox 57.0
geckodriver 0.19.1
1.大概的思路就是模拟用户的点击行为。关于滚动条的问题,我是通过模拟按下键盘的向下方向键来滚动页面,否则每个相册只能爬到30个链接
2.多开标签页的原因是因为爬取多个相册时,当你爬完第一个相册无论采取什么方式总会导致当前原来的相册列表刷新,从而导致selenium的元素附着失败的异常,所以我的思路是一个相册一个标签页,全部爬取完成后再统一关闭,最开始打开的页面并没有直接用于爬取第一个相册,如果你额外新打开了标签页注意修改for循环中句柄的index
3.使用selenium提取链接效率低下,因为总是要让程序等待页面加载,切换frame,js打开新标签页,句柄切换等页面跳转行为非常耗费时间,链接将按相册名进行保存
4.代码仅供测试,写得并不健壮。严格地讲,只要定位元素就应当try catch,因为driver的findElement如果找不到元素会直接抛出NoSuchElementException,而不是返回null,所以没法再去判断元素是否为null
5.使用前请更改用户名和密码。首次点击登录后如果仍停留在登录页,程序会等待5s后自动再点击一次登录;如果仍然登录失败,请重新执行程序
1 package selenium.ff;
2
3 import java.io.File;
4 import java.io.FileWriter;
5 import java.io.IOException;
6 import java.util.ArrayList;
7 import java.util.List;
8 import java.util.concurrent.TimeUnit;
9 import org.apache.commons.io.IOUtils;
10 import org.openqa.selenium.By;
11 import org.openqa.selenium.JavascriptExecutor;
12 import org.openqa.selenium.Keys;
13 import org.openqa.selenium.WebDriver;
14 import org.openqa.selenium.WebElement;
15 import org.openqa.selenium.firefox.FirefoxDriver;
16 import org.openqa.selenium.firefox.FirefoxOptions;
17 import org.openqa.selenium.interactions.Actions;
18
19 /**
20 * 模拟登录qq空间并保存相册的图片链接
21 * @author tele
22 *
23 */
24 public class QZImage {
25 static final int pageSize = 98;
26 public static void main(String[] args) throws Exception {
27 System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
28
29 FirefoxOptions options = new FirefoxOptions();
30 options.setBinary("F:/ff/firefox.exe");
31
32 WebDriver driver = new FirefoxDriver(options);
33 driver.manage().window().maximize();
34 // 超时
35 try {
36 driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
37 driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
38 driver.get("https://i.qq.com/");
39 } catch (Exception e) {
40 System.out.println("所需元素已出现,停止加载页面");
41 } finally {
42 // 切换到登录login
43 driver.switchTo().frame("login_frame");
44
45 WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
46 System.out.println(switcher_plogin.getText());
47 if (switcher_plogin.isDisplayed()) {
48 switcher_plogin.click();
49 }
50 // 用户名
51 driver.findElement(By.id("u")).clear();
52 driver.findElement(By.id("u")).sendKeys("******");
53
54 // 密码
55 driver.findElement(By.id("p")).clear();
56 driver.findElement(By.id("p")).sendKeys("******");
57
58 // 登录
59 try {
60 driver.findElement(By.id("login_button")).click();
61 Thread.sleep(3000);
62 } catch (Exception e) {
63 e.printStackTrace();
64 } finally {
65 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
66 System.out.println("登录失败!5秒后再次尝试登录");
67 Thread.sleep(5000);
68 driver.findElement(By.id("login_button")).click();
69 }
70 }
71
72 // 退出frame
73 driver.switchTo().defaultContent();
74
75 System.out.println(driver.getCurrentUrl());
76
77 // 点击相册
78 driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
79
80 Thread.sleep(1000);
81
82 // 切换到frame
83 driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
84
85 JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
86
87 // 获得相册列表
88 List<WebElement> photoList = driver.findElements(By.xpath("//ul[@class='js-album-list-ul']/li"));
89 if (photoList == null || photoList.size() == 0) {
90 throw new RuntimeException("定位相册列表元素失败!");
91 }
92
93 // 构造不同相册的xpath路径
94 List<String> xpathList = new ArrayList<String>();
95 for (int i = 0; i < photoList.size(); i++) {
96 xpathList.add("//ul[@class='js-album-list-ul']/li[" + (i + 1) + "]");
97 }
98
99 // 窗口句柄
100 List<String> allHandles = new ArrayList<String>(driver.getWindowHandles());
101
102 // 遍历xpath
103 String newUrl = driver.getCurrentUrl();
104 for (int i = 0; i < xpathList.size(); i++) {
105 // 打开新标签页
106 String js = "window.open('" + newUrl + "');";
107 jsExecutor.executeScript(js);
108 allHandles = new ArrayList<String>(driver.getWindowHandles());
109
110 Thread.sleep(2000);
111 String xpath = xpathList.get(i);
112
113 // 句柄切换需要时间
114 driver.switchTo().window(allHandles.get(i + 1));
115 Thread.sleep(2000);
116 saveImageUrl(driver, xpath);
117 }
118
119 System.out.println("所有相册图片链接提取完毕,退出浏览器");
120 driver.quit();
121
122 }
123
124 }
125
126 /**
127 * 提取图片url
128 *
129 * @param driver
130 * @param xpath
131 * @throws InterruptedException
132 * @throws IOException
133 */
134 public static void saveImageUrl(WebDriver driver, String xpath) throws InterruptedException, IOException {
135
136 // 点击相册
137 driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
138
139 Thread.sleep(3000);
140
141 // 切换到图片的frame
142 driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
143
144 // 获得相册名称
145 String photo_name = driver.findElement(By.xpath(xpath + "//a[@class='c-tx2 js-album-desc-a']")).getText();
146
147 //// 文件夹检测
148 File imageUrl = new File("f:/qz/" + photo_name + ".txt");
149 if (!imageUrl.getParentFile().exists()) {
150 imageUrl.mkdirs();
151 } else {
152 imageUrl.delete();
153 }
154
155 // 获得图片总数,每页最多98张图片
156 WebElement span = driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a" + "/span"));
157 String text = span.getText();
158 int count = Integer.parseInt(text);
159
160 // 进入列表
161 driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a")).click();
162 Thread.sleep(3000);
163
164 // 计算页数
165 int totalPage = (int) Math.ceil((double) count / (double) pageSize);
166 System.out.println(photo_name + "图片总数为----" + count + "张,共计---" + totalPage + "页");
167
168 FileWriter fileWriter = new FileWriter(imageUrl, true);
169
170 for (int i = 0; i < totalPage; i++) {
171
172 // 模拟按键加载图片
173 Actions actions = new Actions(driver);
174 for (int j = 0; j < 50; j++) {
175 if (j % 5 == 0) {
176 Thread.sleep(1000);
177 }
178 actions.sendKeys(Keys.ARROW_DOWN).perform();
179 }
180
181 // 提取本页的image链接
182 List<WebElement> list = driver.findElements(By.xpath("//img[@class='j-pl-photoitem-img']"));
183 if (list == null || list.size() == 0) {
184 // 相册无权限访问或定位失败
185 throw new RuntimeException("无法提取图片链接!");
186 }
187 for (WebElement element : list) {
188 String src = element.getAttribute("src") + "
";
189 IOUtils.write(src, fileWriter);
190 }
191 System.out.println("第" + (i + 1) + "页图片链接提取完毕");
192
193 Thread.sleep(1000);
194
195 // 跳转到下一页
196 if ((i + 2) <= totalPage) {
197 driver.findElement(By.xpath("//a[@id='pager_num_1_" + (i + 2) + "']")).click();
198 }
199 }
200
201 fileWriter.close();
202 }
203 }