zoukankan      html  css  js  c++  java
  • selenium 爬取空间说说

      1 package cn.hb.util;
      2 
      3 import java.io.File;
      4 import java.io.FileWriter;
      5 import java.io.IOException;
      6 import java.util.Set;
      7 import java.util.concurrent.TimeUnit;
      8 import org.openqa.selenium.By;
      9 import org.openqa.selenium.Cookie;
     10 import org.openqa.selenium.JavascriptExecutor;
     11 import org.openqa.selenium.Keys;
     12 import org.openqa.selenium.WebDriver;
     13 import org.openqa.selenium.WebElement;
     14 import org.openqa.selenium.firefox.FirefoxDriver;
     15 import org.openqa.selenium.firefox.FirefoxOptions;
     16 import org.openqa.selenium.interactions.Actions;
     17 
     18 /**
     19  * 爬取说说写入到txt中,爬取100条
     20  * 
     21  * @author tele
     22  *
     23  */
     24 public class QZTwitterCrawler {
     25     static String url = "https://user.qzone.qq.com/1350560858";
     26     static int maxSize = 100;
     27     static int pageSize = 20;
     28     static String userName="qq"; 
     29     static String pwd = "密码";
     30     public static void main(String[] args) throws InterruptedException, IOException {
     31         login();
     32     }
     33 
     34     /**
     35      * 登录
     36      * 
     37      * @throws InterruptedException
     38      * @throws IOException
     39      */
     40     public static void login() throws InterruptedException, IOException {
     41         System.setProperty("webdriver.gecko.driver", "D:/browserdriver/geckodriver.exe");
     42 
     43         FirefoxOptions options = new FirefoxOptions();
     44         options.setBinary("F:/ff/firefox.exe");
     45 
     46         WebDriver driver = new FirefoxDriver(options);
     47         driver.manage().window().maximize();
     48         // 超时
     49         try {
     50             driver.manage().timeouts().pageLoadTimeout(3, TimeUnit.SECONDS);
     51             driver.manage().timeouts().setScriptTimeout(3, TimeUnit.SECONDS);
     52             driver.get(url);
     53         } catch (Exception e) {
     54             System.out.println("所需元素已出现,停止加载页面");
     55         } finally {
     56             // 切换到登录login
     57             driver.switchTo().frame("login_frame");
     58 
     59             WebElement switcher_plogin = driver.findElement(By.id("switcher_plogin"));
     60             System.out.println(switcher_plogin.getText());
     61             if (switcher_plogin.isDisplayed()) {
     62                 switcher_plogin.click();
     63             }
     64             // 用户名
     65             driver.findElement(By.id("u")).clear();
     66             driver.findElement(By.id("u")).sendKeys(userName);
     67 
     68             // 密码
     69             driver.findElement(By.id("p")).clear();
     70             driver.findElement(By.id("p")).sendKeys(pwd);
     71 
     72             // 登录
     73             try {
     74                 driver.findElement(By.id("login_button")).click();
     75                 Thread.sleep(3000);
     76             } catch (Exception e) {
     77                 e.printStackTrace();
     78             } finally {
     79                 if ("https://i.qq.com/".equals(driver.getCurrentUrl())) {
     80                     System.out.println("登录失败!5秒后再次尝试登录");
     81                     Thread.sleep(5000);
     82                     driver.findElement(By.id("login_button")).click();
     83                 }
     84             }
     85 
     86             // 退出frame
     87             driver.switchTo().defaultContent();
     88 
     89             System.out.println(driver.getCurrentUrl());
     90 
     91             JavascriptExecutor jsExecutor = (JavascriptExecutor) driver;
     92             // 如果有亲密度提示
     93             
     94               try { WebElement fs_guide = driver.findElement(By.xpath(
     95               "//div[@id='friendship_promote_layer']/table[@class='tbl-fs-guide']//a"
     96              )); if(fs_guide != null && fs_guide.isDisplayed()) {
     97               fs_guide.click(); } } catch (Exception e) { e.printStackTrace();
     98               }finally {
     99              
    100               }
    101              
    102 
    103             // 点击说说
    104             driver.findElement(By.cssSelector("#menuContainer ul.head-nav-menu>li.menu_item_311>a")).click();
    105 
    106             Thread.sleep(2000);
    107 
    108             // 切换到frame
    109             driver.switchTo().frame(driver.findElement(By.className("app_canvas_frame")));
    110 
    111             Thread.sleep(5000);
    112 
    113             // 拼接cookie
    114         /*    StringBuilder builder = new StringBuilder();
    115             Set<Cookie> cookieSet = driver.manage().getCookies();
    116             cookieSet.forEach(c -> builder.append(c.getName()).append("=").append(c.getValue()).append("; "));
    117             cookies = builder.toString();*/
    118 
    119             // 保存
    120             saveTwitter(driver);
    121 
    122             System.out.println("内容提取完毕,退出浏览器");
    123             driver.quit();
    124 
    125         }
    126     }
    127 
    128     /**
    129      * 序列化
    130      * @param driver
    131      * @return
    132      * @throws InterruptedException
    133      * @throws IOException
    134      */
    135     public static void saveTwitter(WebDriver driver) throws InterruptedException, IOException {
    136         File file = new File("f:/qz/twitter.txt");
    137 
    138         // 文件夹检测
    139         if (!file.getParentFile().exists()) {
    140             file.mkdirs();
    141         } else {
    142             file.delete();
    143         }
    144 
    145         
    146         
    147         FileWriter fileWriter = new FileWriter(file, true);
    148 
    149         String xpath;
    150         // 模拟按键进行滚动
    151         Actions actions = new Actions(driver);
    152 
    153         
    154         //说说总量
    155         String totalNumStr = driver.findElement(By.xpath("//div[@class='feed_num']/a")).getText();
    156         int totalNum = Integer.parseInt(totalNumStr);
    157         
    158         // 计算页数
    159         int totalPage = (int) Math.ceil((double)Math.min(maxSize, totalNum) / (double) pageSize);
    160 
    161         // 构造xpath
    162         for (int i = 0; i < totalPage; i++) {
    163 
    164             for (int j = 0; j < pageSize; j++) {
    165                 xpath = "//ol[@id='msgList']/li[" + (j + 1) + "]/div[3]/div[2]/pre[@class='content']";
    166                 // 获取说说内容
    167                 try {
    168                     WebElement element = driver.findElement(By.xpath(xpath));
    169                     String text = element.getText();
    170                     System.out.println("本页第" + (j + 1) + "条   :" + text);
    171                     fileWriter.write(text, 0, text.length());
    172 
    173                 } catch (Exception e) {
    174                     e.printStackTrace();
    175                 } finally {
    176 
    177                 }
    178                 if (j % 2 == 0) {
    179                     actions.sendKeys(Keys.ARROW_DOWN).perform();
    180                 }
    181             }
    182             System.out.println("" + (i + 1) + "页说说爬取完毕");
    183             // 分页
    184             if ((i + 2) <= totalPage) {
    185                 driver.findElement(By.xpath("//a[@id='pager_num_" + i + "_" + (i + 2) + "']")).click();
    186                 // 等待页面加载
    187                 Thread.sleep(3000);
    188             }
    189         }
    190 
    191         if (fileWriter != null) {
    192             fileWriter.close();
    193         }
    194     }
    195 
    196 }

    比爬取相册简单一些,唯一有点绕的地方是页码的构造。我写的这个程序只支持获取说说的文字内容,可以用来生成词云。

  • 相关阅读:
    四套读写方案
    如何保证ArrayList线程安全
    异常总结<经典例题>
    java.移位运算符
    java反射机制
    面试题:return和finally执行
    Spring_通过注解配置 Bean(1)
    Spring_通过 FactoryBean 配置 Bean
    Spring_通过工厂方法配置 Bean
    Spring_管理 Bean 的生命周期
  • 原文地址:https://www.cnblogs.com/tele-share/p/9693681.html
Copyright © 2011-2022 走看看