zoukankan      html  css  js  c++  java
  • IngCrawler

    import java.io.IOException;
    import java.security.SecureRandom;
    import java.security.cert.CertificateException;
    import java.security.cert.X509Certificate;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    
    import javax.net.ssl.HostnameVerifier;
    import javax.net.ssl.HttpsURLConnection;
    import javax.net.ssl.SSLContext;
    import javax.net.ssl.SSLSession;
    import javax.net.ssl.X509TrustManager;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    
    /*
    MySQL schema used by this crawler (the JDBC URL below points at database `ing`):
    CREATE TABLE `ing` (
      `id` int(11) unsigned NOT NULL,
      `url` varchar(500) DEFAULT NULL,
      `user` varchar(100) DEFAULT NULL,
      `date` varchar(30) DEFAULT NULL,
      `content` varchar(5000) DEFAULT NULL,
      `lucky` tinyint(4) DEFAULT NULL,
      `userlink` varchar(500) DEFAULT NULL,
      `mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    CREATE TABLE `comment` (
      `id` int(11) unsigned NOT NULL,
      `ingid` int(11) DEFAULT NULL,
      `user` varchar(100) DEFAULT NULL,
      `content` varchar(5000) DEFAULT NULL,
      `date` varchar(30) DEFAULT NULL,
      `userlink` varchar(100) DEFAULT NULL,
      `mtime` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
     * */
    public class IngCrawler {
        // Process-wide HTTPS configuration, applied once when IngCrawler is loaded.
        // WARNING: this accepts every hostname and every certificate chain, i.e. it
        // turns TLS verification off for all HttpsURLConnection traffic in this JVM.
        static {
            final HostnameVerifier acceptAnyHost = new HostnameVerifier() {
                @Override
                public boolean verify(String hostname, SSLSession session) {
                    return true;
                }
            };

            final X509TrustManager trustEverything = new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // never throws: every client chain is accepted
                }

                @Override
                public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
                    // never throws: every server chain is accepted
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    return new X509Certificate[0];
                }
            };

            try {
                // The verifier is installed first, so it stays in place even if the
                // SSLContext setup below fails.
                HttpsURLConnection.setDefaultHostnameVerifier(acceptAnyHost);

                SSLContext tls = SSLContext.getInstance("TLS");
                tls.init(null, new X509TrustManager[] { trustEverything }, new SecureRandom());
                HttpsURLConnection.setDefaultSSLSocketFactory(tls.getSocketFactory());
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        /**
         * Entry point. Schedules one crawl task for every status id from
         * {@code Inserter.getNextId()} up to and including {@code Crawler.getLastestId()},
         * then waits for all crawl and insert work to finish and stops the thread pools.
         *
         * @param args unused
         * @throws IOException declared for compatibility; nothing in this method throws it directly
         */
        public static void main(String[] args) throws IOException {
            int id = Inserter.getNextId();
            int lastestid = Crawler.getLastestId();

            for (; id <= lastestid; id++) {
                Crawler.crawl("https://ing.cnblogs.com/u/1/status/" + id, id);
            }

            // Both pools are created with Executors.newFixedThreadPool, whose threads are
            // non-daemon: without an explicit shutdown the JVM would never exit.
            try {
                // Crawl tasks submit insert tasks, so the crawler pool has to drain
                // completely before the inserter pool may stop accepting work.
                Crawler.crawler.shutdown();
                Crawler.crawler.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);

                Inserter.inserter.shutdown();
                Inserter.inserter.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
            } catch (InterruptedException e) {
                // Interrupted while waiting: abandon the remaining work and keep the interrupt flag set.
                Crawler.crawler.shutdownNow();
                Inserter.inserter.shutdownNow();
                Thread.currentThread().interrupt();
            }
        }
    
        static class Ing {
            int id;
            String url;
            String user;
            String date;
            String content;
            boolean lucky;
            String userlink;
            List<Comment> comments = new ArrayList<Comment>();
    
            @Override
            public String toString() {
                StringBuilder sb = new StringBuilder(id + " - [" + date + "][" + user + "] - " + content);
                for (Comment c : this.comments) {
                    sb.append("
    	" + c);
                }
                return sb.toString();
            }
    
            static Ing parseIng(Document doc, String url, int id) {
                Ing ing = new Ing();
                ing.id = id;
                ing.url = url;
    
                if (doc.select(".ing_detail_title").size() == 0) {
                    return ing;
                }
    
                ing.user = doc.select(".ing_item_author").text().trim();
                ing.userlink = doc.select(".ing_item_author").attr("href");
                ing.date = doc.select(".ing_detail_title").text().trim();
                if (ing.date.indexOf(":") != -1) {
                    ing.date = ing.date.substring(ing.date.indexOf(":") + 1).trim();
                }
                ing.content = doc.select("#ing_detail_body").text().trim();
                ing.lucky = doc.select(".ing_icon_lucky").size() > 0;
                for (Element e : doc.select("#comment_block_" + id).get(0).children()) {
                    ing.comments.add(Comment.parseComment(e, id));
                }
                return ing;
            }
    
            static class Comment {
                int id;
                int ingid;
                String user;
                String content;
                String date;
                String userlink;
    
                static Comment parseComment(Element e, int ingid) {
                    Comment comment = new Comment();
                    comment.id = Integer.parseInt(e.id().substring(8));
                    comment.ingid = ingid;
    
                    comment.user = e.select("#comment_author_" + comment.id).text().trim();
                    comment.userlink = e.select("#comment_author_" + comment.id).attr("href");
                    comment.date = e.select(".text_green").attr("title").trim();
    
                    e.select("#comment_author_" + comment.id).remove();
                    e.select(".text_green").remove();
                    e.select(".gray3").remove();
    
                    comment.content = e.select("div").text().trim();
                    if (comment.content.startsWith(":")) {
                        comment.content = comment.content.substring(1).trim();
                    }
    
                    return comment;
                }
    
                @Override
                public String toString() {
                    return "[" + user + "] - " + content;
                }
            }
    
        }
    
        /**
         * Task that downloads one status page, parses it and hands the result to
         * {@code Inserter}. Tasks are executed on a shared pool of 10 threads.
         */
        static class Crawler implements Runnable {
            static ExecutorService crawler = Executors.newFixedThreadPool(10);

            String url;
            int id;

            /**
             * @param url address of the status page to fetch
             * @param id  status id, passed through to the parser
             */
            public Crawler(String url, int id) {
                this.url = url;
                this.id = id;
            }

            /**
             * @return the highest status id to crawl (a hard-coded value)
             */
            public static int getLastestId() {
                return 1054304;
            }

            /**
             * Queues a crawl of {@code url} on the crawler pool.
             *
             * @param url address of the status page to fetch
             * @param id  status id, passed through to the parser
             */
            public static void crawl(String url, int id) {
                crawler.execute(new Crawler(url, id));
            }

            /**
             * Fetches the page, parses it and submits the result for insertion.
             * Failures are reported on stderr together with the URL; they never
             * propagate out of the task.
             */
            @Override
            public void run() {
                System.out.println("crawl for: " + url);
                try {
                    String cookie = "YOUR COOKIE HERE";
                    String useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36";
                    Document page = Jsoup.connect(url).header("cookie", cookie).userAgent(useragent).get();
                    Inserter.insert(Ing.parseIng(page, url, id));
                } catch (IOException e) {
                    System.err.println("ERROR - fetch failed for " + url);
                    e.printStackTrace();
                } catch (RuntimeException e) {
                    // Parsing can throw unchecked exceptions (e.g. from Integer.parseInt or
                    // substring); catch them here so the worker thread survives and the
                    // failing URL is recorded.
                    System.err.println("ERROR - parse failed for " + url);
                    e.printStackTrace();
                }

            }

        }
    
        /**
         * Task that writes one crawled post and its comments to MySQL. All tasks run
         * on a single-thread executor and share one connection and two prepared
         * statements, which are opened in the static initializer.
         */
        static class Inserter implements Runnable {
            static ExecutorService inserter = Executors.newFixedThreadPool(1);

            static Connection conn;
            static PreparedStatement pstating, pstatcmt;

            // Opens the shared connection and prepares both insert statements. On failure
            // the stack trace is printed and the static fields stay null.
            static {
                try {
                    Class.forName("com.mysql.jdbc.Driver");
                    conn = DriverManager.getConnection(
                            "jdbc:mysql://localhost:3306/ing?useUnicode=true&characterEncoding=utf-8&autoReconnect=true", "root", "");

                    pstating = conn
                            .prepareStatement("insert into ing (id,url,user,date,content,lucky,userlink) values (?,?,?,?,?,?,?)");
                    pstatcmt = conn
                            .prepareStatement("insert into comment (id,ingid,user,content,date,userlink) values (?,?,?,?,?,?)");
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }

            Ing ing;

            /**
             * @param ing the post (with comments) this task will insert
             */
            public Inserter(Ing ing) {
                this.ing = ing;
            }

            /**
             * Determines the id at which crawling should start: one more than the
             * largest id already stored in table {@code ing}.
             *
             * @return {@code max(id) + 1}, or 1 when the query yields no row or fails
             *         (the failure is reported on stderr)
             * @throws IllegalStateException if the database connection could not be
             *         opened when this class was initialised
             */
            public static int getNextId() {
                if (conn == null) {
                    // Fail with a clear message instead of a bare NullPointerException.
                    throw new IllegalStateException("database connection is not available; see the start-up error above");
                }

                Statement stat = null;
                ResultSet rs = null;
                try {
                    stat = conn.createStatement();
                    rs = stat.executeQuery("select max(id) as id from ing");
                    if (rs.next()) {
                        return rs.getInt("id") + 1;
                    }
                } catch (SQLException e) {
                    System.err.println("ERROR - could not read max(id) from ing, starting at 1 - " + e.getMessage());
                } finally {
                    closeQuietly(rs, stat);
                }
                return 1;
            }

            /** Closes the result set and then the statement, ignoring close failures. */
            private static void closeQuietly(ResultSet rs, Statement stat) {
                if (rs != null) {
                    try {
                        rs.close();
                    } catch (SQLException ignored) {
                        // nothing useful can be done if close fails
                    }
                }
                if (stat != null) {
                    try {
                        stat.close();
                    } catch (SQLException ignored) {
                        // nothing useful can be done if close fails
                    }
                }
            }

            // Running count of processed posts; only touched from run().
            static int no = 0;

            /**
             * Queues {@code ing} for insertion on the inserter thread.
             *
             * @param ing the post (with comments) to store
             */
            public static void insert(Ing ing) {
                inserter.execute(new Inserter(ing));
            }

            /**
             * Inserts the post row and then each comment row. A failure on the post
             * row skips its comments; a failure on one comment is reported and the
             * remaining comments are still attempted.
             */
            @Override
            public void run() {
                System.out.println(++no + ". " + ing);
                try {
                    pstating.setInt(1, ing.id);
                    pstating.setString(2, ing.url);
                    pstating.setString(3, ing.user);
                    pstating.setString(4, ing.date);
                    pstating.setString(5, ing.content);
                    pstating.setInt(6, ing.lucky ? 1 : 0);
                    pstating.setString(7, ing.userlink);
                    pstating.executeUpdate();
                } catch (SQLException e) {
                    System.err.println("ERROR - " + e.getMessage() + " - " + ing);
                    return;
                }

                for (Ing.Comment c : ing.comments) {
                    try {
                        pstatcmt.setInt(1, c.id);
                        pstatcmt.setInt(2, c.ingid);
                        pstatcmt.setString(3, c.user);
                        pstatcmt.setString(4, c.content);
                        pstatcmt.setString(5, c.date);
                        pstatcmt.setString(6, c.userlink);
                        pstatcmt.executeUpdate();
                    } catch (SQLException e) {
                        // Keep going: one bad comment must not drop the rest of the post's comments.
                        System.err.println("ERROR - " + e.getMessage() + " - comment " + c.id + " of ing " + ing.id);
                    }
                }
            }

        }
    
    }
  • 相关阅读:
    微软算法100题25 查找连续最长的数字串
    微软算法100题24 就地逆序单链表
    微软算法100题21 数列中所有和为特定值的组合
    微软算法100题20 字符串转整数 atoi
    约瑟夫环
    微软算法100题17 字符串中找到第一个只出现一次的字符
    微软算法100题16 按层遍历二叉树
    微软算法100题15 求二元查找树的镜像
    微软算法100题14 在排序数组中找到和为指定数的任意两个元素
    NLP(十) 主题识别
  • 原文地址:https://www.cnblogs.com/jieyuefeng/p/6292459.html
Copyright © 2011-2022 走看看