1 package cn.hb.util;
2
3 import java.io.File;
4 import java.io.FileNotFoundException;
5 import java.io.FileWriter;
6 import java.io.IOException;
7 import java.util.ArrayList;
8 import java.util.List;
9 import java.util.Set;
10 import java.util.UUID;
11 import java.util.concurrent.TimeUnit;
12
13 import org.apache.commons.io.IOUtils;
14 import org.openqa.selenium.By;
15 import org.openqa.selenium.Cookie;
16 import org.openqa.selenium.JavascriptExecutor;
17 import org.openqa.selenium.Keys;
18 import org.openqa.selenium.WebDriver;
19 import org.openqa.selenium.WebElement;
20 import org.openqa.selenium.firefox.FirefoxDriver;
21 import org.openqa.selenium.firefox.FirefoxOptions;
22 import org.openqa.selenium.interactions.Actions;
23 import cn.edu.hfut.dmic.webcollector.conf.Configuration;
24 import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
25 import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
26 import cn.edu.hfut.dmic.webcollector.model.Page;
27 import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
28 import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
29 import cn.edu.hfut.dmic.webcollector.util.FileUtils;
30
31 /**
32 * 爬取空间图片 selenium登录后提取链接给webcollector处理即可
33 *
34 * @author tele
35 *
36 */
37 public class QZoneCrawler extends BreadthCrawler {
38 static String url = "https://user.qzone.qq.com/qq号";
39 static String cookies = "";
40 static final int pageSize = 98;
41 static List<String> crawdataList = new ArrayList<String>();
42 static File baseDir = new File("F:/qz/image");
43
44 public QZoneCrawler(String crawlPath, boolean autoParse) {
45 super(crawlPath, autoParse);
46 }
47
48 @Override
49 public void visit(Page page, CrawlDatums next) {
50 try {
51 Thread.sleep(3000);
52 } catch (InterruptedException e) {
53 e.printStackTrace();
54 }
55 String name = UUID.randomUUID().toString() + ".jpg";
56 try {
57 FileUtils.write(new File(baseDir, name), page.content());
58 } catch (FileNotFoundException e) {
59 e.printStackTrace();
60 } catch (IOException e) {
61 e.printStackTrace();
62 }
63 }
64
65 String userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0";
66
67 // 设置cookies
68 @Override
69 public Page getResponse(CrawlDatum crawlDatum) throws Exception {
70 HttpRequest request = new HttpRequest(crawlDatum);
71 request.setCookie(cookies);
72 request.setUserAgent(userAgent);
73 return request.responsePage();
74 }
75
76 public static void main(String[] args) throws Exception {
77
78 QZoneCrawler qz = new QZoneCrawler("F:/qz/image/webcollector", true);
79
80 Configuration conf = Configuration.copyDefault();
81 conf.setAutoDetectImg(true);
82 conf.setConnectTimeout(5000);
83 conf.setReadTimeout(10000);
84
85 // 线程爬取间隔
86 conf.setExecuteInterval(5000);
87 qz.setConf(conf);
88 qz.setThreads(100);
89
90 login();
91 qz.addSeed(crawdataList);
92 qz.start(1);
93
94 }
95
96 /**
97 * 登录
98 *
99 * @throws InterruptedException
100 * @throws IOException
101 */
102 public static void login() throws InterruptedException, IOException {
103 System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
104
105 FirefoxOptions options = new FirefoxOptions();
106 options.setBinary("F:/ff/firefox.exe");
107
108 WebDriver driver = new FirefoxDriver(options);
109 driver.manage().window().maximize();
110 // 超时
111 try {
112 driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
113 driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
114 driver.get(url);
115 } catch (Exception e) {
116 System.out.println("所需元素已出现,停止加载页面");
117 } finally {
118 // 切换到登录login
119 driver.switchTo().frame("login_frame");
120
121 WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
122 System.out.println(switcher_plogin.getText());
123 if (switcher_plogin.isDisplayed()) {
124 switcher_plogin.click();
125 }
126 // 用户名
127 driver.findElement(By.id("u")).clear();
128 driver.findElement(By.id("u")).sendKeys("账号");
129
130 // 密码
131 driver.findElement(By.id("p")).clear();
132 driver.findElement(By.id("p")).sendKeys("密码");
133
134 // 登录
135 try {
136 driver.findElement(By.id("login_button")).click();
137 Thread.sleep(3000);
138 } catch (Exception e) {
139 e.printStackTrace();
140 } finally {
141 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
142 System.out.println("登录失败!5秒后再次尝试登录");
143 Thread.sleep(5000);
144 driver.findElement(By.id("login_button")).click();
145 }
146 }
147
148 // 退出frame
149 driver.switchTo().defaultContent();
150
151 System.out.println(driver.getCurrentUrl());
152
153 JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
154
155 // 如果有亲密度提示
156 /*
157 * try { WebElement fs_guide = driver.findElement(By.xpath(
158 * "//div[@id='friendship_promote_layer']/table[@class='tbl-fs-guide']//a"
159 * )); if(fs_guide != null && fs_guide.isDisplayed()) {
160 * fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
161 * }finally {
162 *
163 * }
164 */
165
166 // 点击相册
167 driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
168
169 Thread.sleep(2000);
170
171 // 切换到frame
172 driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
173
174 // 进入图片列表(说说相册)
175 // driver.findElement(By.xpath("//ul[@class='js-album-list-ul']/li[1]/div[1]/div[1]/a")).click();
176
177 // 拼接cookie
178 StringBuilder builder = new StringBuilder();
179 Set<Cookie> cookieSet = driver.manage().getCookies();
180 cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));
181 cookies = builder.toString();
182
183 // 获得相册列表
184 List<WebElement> photoList = driver.findElements(By.xpath("//ul[@class='js-album-list-ul']/li"));
185 if (photoList == null || photoList.size() == 0) {
186 throw new RuntimeException("定位相册列表元素失败!");
187 }
188
189 // 构造不同相册的xpath路径
190 List<String> xpathList = new ArrayList<String>();
191 for (int i = 0; i < photoList.size(); i++) {
192 xpathList.add("//ul[@class='js-album-list-ul']/li[" + (i + 1) + "]");
193 }
194
195 // 窗口句柄
196 List<String> allHandles = new ArrayList<String>(driver.getWindowHandles());
197
198 // 遍历xpath
199 String newUrl = driver.getCurrentUrl();
200 for (int i = 0; i < xpathList.size(); i++) {
201 // 打开新标签页
202 jsExecutor.executeScript("window.open('" + newUrl + "');");
203 allHandles = new ArrayList<String>(driver.getWindowHandles());
204
205 Thread.sleep(2000);
206 String xpath = xpathList.get(i);
207
208 // 句柄切换需要时间
209 driver.switchTo().window(allHandles.get(i + 1));
210 Thread.sleep(2000);
211
212 List<String> urlList = getImageUrl(driver, xpath);
213 if (urlList == null) {
214 break;
215 }
216 crawdataList.addAll(urlList);
217 }
218
219 System.out.println("所有相册图片链接提取完毕,退出浏览器");
220 driver.quit();
221
222 }
223 }
224
225 /**
226 * 提取图片url
227 *
228 * @param driver
229 * @param xpath
230 * @throws InterruptedException
231 * @throws IOException
232 */
233 public static List<String> getImageUrl(WebDriver driver, String xpath) throws InterruptedException, IOException {
234 List<String> urlList = new ArrayList<String>();
235
236 // 点击相册
237 driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_4>a")).click();
238
239 // 切换到图片的frame
240 driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
241 Thread.sleep(1000);
242
243 // 获得相册名称
244 String photo_name = driver.findElement(By.xpath(xpath + "//a[@class='c-tx2 js-album-desc-a']")).getText();
245
246 //// 文件夹检测
247 File imageUrl = new File("f:/qz/" + photo_name + ".txt");
248 if (!imageUrl.getParentFile().exists()) {
249 imageUrl.mkdirs();
250 } else {
251 imageUrl.delete();
252 }
253
254 // 获得图片总数,每页最多98张图片
255 WebElement span = driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a" + "/span"));
256 String text = span.getText();
257 int count = Integer.parseInt(text);
258
259 // 进入列表
260 driver.findElement(By.xpath(xpath + "/div[1]/div[1]/a")).click();
261 Thread.sleep(3000);
262
263 // 计算页数
264 int totalPage = (int) Math.ceil((double) count / (double) pageSize);
265 System.out.println(photo_name + "图片总数为----" + count + "张,共计---" + totalPage + "页");
266
267 FileWriter fileWriter = new FileWriter(imageUrl, true);
268 Actions actions = new Actions(driver);
269 for (int i = 0; i < totalPage; i++) {
270
271 // 模拟按键加载图片
272 // Actions actions = new Actions(driver);
273 for (int j = 0; j < 50; j++) {
274 if (j % 5 == 0) {
275 Thread.sleep(1000);
276 }
277 actions.sendKeys(Keys.ARROW_DOWN).perform();
278 }
279
280 // 提取本页的image链接
281 List<WebElement> list = driver.findElements(
282 By.xpath("//a[@class='item-cover j-pl-photoitem-imgctn']/img[@class='j-pl-photoitem-img']"));
283 if (list == null || list.size() == 0) {
284 // 相册无权限访问或定位失败
285 System.out.println("无法提取图片链接!");
286 return null;
287 }
288 for (WebElement element : list) {
289 String src = element.getAttribute("src") + "
";
290 IOUtils.write(src, fileWriter);
291 System.out.println(src);
292 // 添加链接
293 urlList.add(src);
294 }
295 System.out.println("第" + (i + 1) + "页图片链接提取完毕");
296 Thread.sleep(1000);
297 // 跳转到下一页
298 if ((i + 2) <= totalPage) {
299 driver.findElement(By.xpath("//a[@id='pager_num_1_" + (i + 2) + "']")).click();
300 ;
301 }
302 }
303
304 fileWriter.close();
305 return urlList;
306 }
307
308 }
运行环境与上篇博文相同(见 https://www.cnblogs.com/tele-share/p/9595265.html)。爬取结果: