  • crawler4j: a lightweight multi-threaded web crawler example

    crawler4j is an open-source web crawler written in Java. It exposes a simple, easy-to-use interface with which you can build a multi-threaded crawler in a few minutes.
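
    To give a feel for how small that interface is, here is a minimal sketch: a crawler is just a subclass of WebCrawler that overrides shouldVisit and visit (the class name MyCrawler is only a placeholder; the full, working example is developed step by step below):

    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;
    import edu.uci.ics.crawler4j.url.WebURL;

    // Minimal sketch: decide which URLs to follow, then handle each downloaded page.
    public class MyCrawler extends WebCrawler {

        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            // Only follow pages under the seed domain.
            return url.getURL().startsWith("http://sh.ziroom.com/z/nl/");
        }

        @Override
        public void visit(Page page) {
            System.out.println("Visited: " + page.getWebURL().getURL());
        }
    }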

    The example below combines jsoup (HTML parsing) and javacsv (CSV output) to crawl rental listings from Ziroom (http://sh.ziroom.com/z/nl/).

    1. Import the dependencies with Maven

        <dependency>
            <groupId>edu.uci.ics</groupId>
            <artifactId>crawler4j</artifactId>
            <version>4.2</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.javacsv</groupId>
            <artifactId>javacsv</artifactId>
            <version>2.0</version>
        </dependency>

    2. Create your own crawler class by extending WebCrawler

    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.util.Set;
    import java.util.regex.Pattern;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import com.csvreader.CsvWriter;

    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;
    import edu.uci.ics.crawler4j.parser.HtmlParseData;
    import edu.uci.ics.crawler4j.url.WebURL;

    public class ZiroomCrawler extends WebCrawler {
        /** Path of the CSV file that stores the crawled listings */
        private final static String CSV_PATH = "data/crawl/ziroom.csv";
        /** URLs matching these file extensions are skipped */
        private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico"
                + "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");
        /** Path of the CSV file that stores the discovered links */
        private final static String LINK_PATH = "data/crawl/link.csv";
        private static final Logger logger = LoggerFactory.getLogger(ZiroomCrawler.class);

        private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl/";

        private final File csv;

        private final File csv2;
        private CsvWriter cw;
        private CsvWriter cw2;

        /** Per-crawler statistics, handed back to the controller as this crawler's local data */
        CrawlStat myCrawlStat;

        public ZiroomCrawler() throws IOException {
            myCrawlStat = new CrawlStat();
            csv = new File(CSV_PATH);
            csv2 = new File(LINK_PATH);
            if (csv.isFile()) {
                csv.delete();
            }
            if (csv2.isFile()) {
                csv2.delete();
            }
            // Write the header row of the link CSV (single column: request URL)
            cw2 = new CsvWriter(new FileWriter(csv2, true), ',');
            cw2.write("请求路径");
            cw2.endRecord();
            cw2.close();
            // Write the header row of the listing CSV: image, price, address, description
            cw = new CsvWriter(new FileWriter(csv, true), ',');
            cw.write("图片");
            cw.write("价格");
            cw.write("地址");
            cw.write("说明");
            cw.endRecord();
            cw.close();
        }

        public void dumpMyData() {
            final int id = getMyId();
            // You can configure the log to output to file
            logger.info("Crawler {} > Processed Pages: {}", id, myCrawlStat.getTotalProcessedPages());
            logger.info("Crawler {} > Total Links Found: {}", id, myCrawlStat.getTotalLinks());
            logger.info("Crawler {} > Total Text Size: {}", id, myCrawlStat.getTotalTextSize());
        }

        @Override
        public Object getMyLocalData() {
            return myCrawlStat;
        }

        @Override
        public void onBeforeExit() {
            dumpMyData();
        }

        /*
         * This method decides which URLs get crawled. In this example only pages under
         * "http://sh.ziroom.com/z/nl/" are allowed; .css, .js and media/binary files
         * are excluded.
         *
         * @see edu.uci.ics.crawler4j.crawler.WebCrawler#shouldVisit(edu.uci.ics.
         * crawler4j.crawler.Page, edu.uci.ics.crawler4j.url.WebURL)
         */
        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            final String href = url.getURL().toLowerCase();

            if (FILTERS.matcher(href).matches() || !href.startsWith(URL_PREFIX)) {
                return false;
            }
            return true;
        }

        /*
         * This method is called once a URL has been downloaded. It gives easy access
         * to the page's URL, text, outgoing links, HTML and unique id.
         *
         * @see
         * edu.uci.ics.crawler4j.crawler.WebCrawler#visit(edu.uci.ics.crawler4j.
         * crawler.Page)
         */
        @Override
        public void visit(Page page) {
            final String url = page.getWebURL().getURL();
            System.out.println("-----------visiting: " + url);
            myCrawlStat.incProcessedPages();
            if (page.getParseData() instanceof HtmlParseData) {
                final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
                final Set<WebURL> links = htmlParseData.getOutgoingUrls();
                try {
                    linkToCsv(links);
                } catch (final IOException e2) {
                    e2.printStackTrace();
                }
                myCrawlStat.incTotalLinks(links.size());
                try {
                    myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("UTF-8").length);
                } catch (final UnsupportedEncodingException e1) {
                    e1.printStackTrace();
                }
                final String html = htmlParseData.getHtml();

                final Document doc = Jsoup.parse(html);

                final Elements contents = doc.select("li[class=clearfix]");

                for (final Element c : contents) {
                    // Image
                    final String img = c.select(".img img").first().attr("src");
                    System.out.println("image: " + img);

                    // Address (title, location and detail joined with commas)
                    final Element txt = c.select("div[class=txt]").first();
                    final String arr1 = txt.select("h3 a").first().text();
                    final String arr2 = txt.select("h4 a").first().text();
                    final String arr3 = txt.select("div[class=detail]").first().text();

                    final String arr = arr1 + "," + arr2 + "," + arr3;
                    System.out.println("address: " + arr);

                    // Description
                    final String rank = txt.select("p").first().text();
                    System.out.println("description: " + rank);

                    // Price
                    final String price = c.select("p[class=price]").first().text();

                    try {
                        cw = new CsvWriter(new FileWriter(csv, true), ',');
                        cw.write(img);
                        cw.write(price);
                        cw.write(arr);
                        cw.write(rank);
                        cw.endRecord();
                        cw.flush();
                        cw.close();
                    } catch (final IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        private void linkToCsv(Set<WebURL> links) throws IOException {
            // Append each outgoing link as its own record
            cw2 = new CsvWriter(new FileWriter(csv2, true), ',');
            for (final WebURL webURL : links) {
                cw2.write(webURL.getURL());
                cw2.endRecord();
            }
            cw2.flush();
            cw2.close();
        }
    }
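    In visit(), the listing fields are extracted with jsoup CSS selectors. The standalone sketch below runs the same selectors against a simplified, assumed snippet of the Ziroom list markup (the real page structure may have changed since); it is only meant to illustrate how each selector maps to an element:

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class SelectorDemo {
        public static void main(String[] args) {
            // Assumed, simplified markup for a single listing item (placeholder values).
            final String html = "<ul><li class='clearfix'>"
                    + "  <div class='img'><img src='http://img.example/room.jpg'></div>"
                    + "  <div class='txt'>"
                    + "    <h3><a>示例房源</a></h3>"
                    + "    <h4><a>[示例商圈 示例地铁站]</a></h4>"
                    + "    <div class='detail'>3室1厅 · 12㎡</div>"
                    + "    <p>离地铁近,拎包入住</p>"
                    + "  </div>"
                    + "  <p class='price'>2000 元/月</p>"
                    + "</li></ul>";

            final Document doc = Jsoup.parse(html);
            final Element item = doc.select("li[class=clearfix]").first();

            System.out.println("Image:       " + item.select(".img img").first().attr("src"));
            System.out.println("Title:       " + item.select("div[class=txt] h3 a").first().text());
            System.out.println("Location:    " + item.select("div[class=txt] h4 a").first().text());
            System.out.println("Detail:      " + item.select("div[class=txt] div[class=detail]").first().text());
            System.out.println("Description: " + item.select("div[class=txt] p").first().text());
            System.out.println("Price:       " + item.select("p[class=price]").first().text());
        }
    }

    The CrawlStat class used above is a plain counter object that each crawler keeps as its local data (getMyLocalData) and that the controller collects once the crawl finishes: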
    public class CrawlStat {
        private long totalLinks;
        private int totalProcessedPages;
        private long totalTextSize;
    
        public long getTotalLinks() {
            return totalLinks;
        }
    
        public int getTotalProcessedPages() {
            return totalProcessedPages;
        }
    
        public long getTotalTextSize() {
            return totalTextSize;
        }
    
        public void incProcessedPages() {
            this.totalProcessedPages++;
        }
    
        public void incTotalLinks(int count) {
            this.totalLinks += count;
        }
    
        public void incTotalTextSize(int count) {
            this.totalTextSize += count;
        }
    
        public void setTotalLinks(long totalLinks) {
            this.totalLinks = totalLinks;
        }
    
        public void setTotalProcessedPages(int totalProcessedPages) {
            this.totalProcessedPages = totalProcessedPages;
        }
    
        public void setTotalTextSize(long totalTextSize) {
            this.totalTextSize = totalTextSize;
        }
    }

    3. Write the controller class that runs the crawl

    import java.sql.Timestamp;
    import java.util.List;

    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.fetcher.PageFetcher;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

    public class ZiroomController {
    
        public static void main(String[] args) {
            System.out.println("-------begin:" + new Timestamp(System.currentTimeMillis()));
            final String crawlStorageFolder = "data/crawl/root";
            final int numberOfCrawlers = 7;
    
            final CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder(crawlStorageFolder);
            config.setPolitenessDelay(1000);
            config.setIncludeBinaryContentInCrawling(false);
            config.setMaxPagesToFetch(50);
            // config.setResumableCrawling(true);
            /*
             * Instantiate the controller for this crawl.
             */
            final PageFetcher pageFetcher = new PageFetcher(config);
            final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller;
            try {
                controller = new CrawlController(config, pageFetcher, robotstxtServer);
                /*
                 * For each crawl, you need to add some seed urls. These are the
                 * first URLs that are fetched and then the crawler starts following
                 * links which are found in these pages
                 */
                controller.addSeed("http://sh.ziroom.com/z/nl/");
                // controller.addSeed("http://www.ziroom.com/z/nl/z3-u2.html/");
                // controller.addSeed("http://www.ics.uci.edu/~welling/");
                // controller.addSeed("http://www.ics.uci.edu/");
    
                /*
                 * Start the crawl. This is a blocking operation, meaning that your
                 * code will reach the line after this only when crawling is
                 * finished.
                 */
                controller.start(ZiroomCrawler.class, numberOfCrawlers);
    
                final List<Object> crawlersLocalData = controller.getCrawlersLocalData();
                long totalLinks = 0;
                long totalTextSize = 0;
                int totalProcessedPages = 0;
                for (final Object localData : crawlersLocalData) {
                    final CrawlStat stat = (CrawlStat) localData;
                    totalLinks += stat.getTotalLinks();
                    totalTextSize += stat.getTotalTextSize();
                    totalProcessedPages += stat.getTotalProcessedPages();
                }
    
                System.out.println("Aggregated Statistics:");
                System.out.println("\tProcessed Pages: " + totalProcessedPages);
                System.out.println("\tTotal Links found: " + totalLinks);
                System.out.println("\tTotal Text Size: " + totalTextSize);
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }
    }
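    After a run, the listings end up in data/crawl/ziroom.csv with the header columns written by the crawler's constructor (图片, 价格, 地址, 说明). As a small usage sketch, the file can be read back with javacsv's CsvReader. The class name ZiroomCsvDump is just a placeholder, and UTF-8 is an assumption (the crawler writes with a plain FileWriter, i.e. the platform default encoding):

    import java.io.IOException;
    import java.nio.charset.Charset;

    import com.csvreader.CsvReader;

    public class ZiroomCsvDump {
        public static void main(String[] args) throws IOException {
            // Read the CSV produced by ZiroomCrawler; columns are 图片, 价格, 地址, 说明.
            final CsvReader reader = new CsvReader("data/crawl/ziroom.csv", ',', Charset.forName("UTF-8"));
            try {
                reader.readHeaders();
                while (reader.readRecord()) {
                    System.out.println(reader.get("价格") + "  " + reader.get("地址"));
                }
            } finally {
                reader.close();
            }
        }
    }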

    I have only just started blogging, so take this as a reference only. Comments and corrections are welcome!

    crawler4j source: https://github.com/yasserg/crawler4j

  • Original post: https://www.cnblogs.com/liu-king/p/5322727.html