zoukankan      html  css  js  c++  java
  • JAVA爬虫实践(实践四:webMagic和phantomjs和淘宝爬虫)

    webMagic虽然方便,但也有不适用的场景:比如只针对某个单页面的定向爬虫,或者页面存在大量ajax请求、跳转链接全都混淆在js里的网站。

    这时可以用webMagic结合phantomjs来真实模拟页面请求,即不仅仅获取数据,而是将整个页面完整渲染出来。虽然这样会使爬虫速度变慢很多,但是不失为一种快捷方便的解决方法。

    PhantomJS是一个基于WebKit、提供JavaScript API的无界面(headless)浏览器。它无需浏览器界面即可完整加载网页,速度快,并原生支持各种Web标准:DOM处理、CSS选择器、JSON、Canvas和SVG。PhantomJS可用于页面自动化、网络监测、网页截屏以及无界面测试等。

    淘宝就是这种难以用普通爬虫方法爬取的网站。直接发送GET请求到淘宝基本获取不到什么有效的内容和链接。

    好在webMagic虽然默认使用HttpClient获取网页,但它把负责下载网页的Downloader接口开放了出来。这样就可以自己实现Downloader,在其中调用phantomjs来获取页面。

    phantomjs使用方法

    1.下载安装phantomjs

    2.编写js脚本

    // PhantomJS script: opens the URL given on the command line and prints the
    // rendered page (page.content) to stdout.
    // Usage: phantomjs <this-script> <url>
    var system = require('system');

    // system.args[0] is the script name, so the first user-supplied argument
    // (the page address) is system.args[1].
    if (system.args.length < 2) {
        console.log('Usage: phantomjs ' + system.args[0] + ' <url>');
        phantom.exit(1);
    } else {
        var url = system.args[1];
        var page = require('webpage').create();

        // Output encoding matters: a wrong choice garbles the captured page.
        // The official example lists "euc-jp", "sjis", "utf8" and "System";
        // only "System" is used here. Adjust to your environment if needed.
        phantom.outputEncoding = 'System';

        page.open(url, function (status) {
            if (status !== 'success') {
                console.log('Unable to post!');
                // Non-zero exit status lets callers detect the failed load.
                phantom.exit(1);
                return;
            }
            // Emit only the rendered markup; the encoding name is no longer
            // prepended, so the output is the page content and nothing else.
            console.log(page.content);
            phantom.exit();
        });
    }
    View Code

    3.测试

    package util;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileWriter;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.PrintWriter;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.selector.PlainText;
    
    public class GetAjaxHtml {
        public static String getAjaxContent(String url) throws Exception {
            Runtime rt = Runtime.getRuntime();
            Process p = rt
                    .exec("D:/phantomjs-2.1.1-windows/bin/phantomjs.exe D:/s.js "
                            + url);
            InputStream is = p.getInputStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(is));
            StringBuffer sbf = new StringBuffer();
            String tmp = "";
            while ((tmp = br.readLine()) != null) {
                sbf.append(tmp + "
    ");
            }
            return sbf.toString();
        }
    
        public static Page download(Request request) {
            Page page = new Page();
            try {
                String url = request.getUrl();
                String html = getAjaxContent(url);
                page.setRawText(html);
                page.setUrl(new PlainText(url));
                page.setRequest(request);
                return page;
            } catch (Exception e) {
                System.out.println("download出错了!");
                return page;
            }
        }
    
        public static void main(String[] args) throws Exception {
            long start = System.currentTimeMillis();
            String result = getAjaxContent("http://www.taobao.com");
            System.out.println(result);
            // 创建新文件
            String path = "D:\testFile\taobao.html";
            PrintWriter printWriter = null;
            printWriter = new PrintWriter(new FileWriter(new File(path)));
            printWriter.write(result);
            printWriter.close();
            long end = System.currentTimeMillis();
            System.out.println("===============耗时:" + (end - start)
                    + "===============");
        }
    }
    View Code

    webMagic结合phantomjs淘宝爬虫

    package taobao;
    
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.downloader.Downloader;
    import us.codecraft.webmagic.processor.PageProcessor;
    import util.GetAjaxHtml;
    import util.UuidUtil;
    import csdnblog.dao.TaobaoDao;
    import csdnblog.model.Taobao;
    
    public class TaobaoPageProcessor implements PageProcessor {
    
        private TaobaoDao taobaoDao = new TaobaoDao();
    
        // 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等
        private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
    
        @Override
        public Site getSite() {
            return site;
        }
    
        @Override
        public void process(Page page) {
            page.addTargetRequests(page.getHtml().links()
                    .regex(".*item\.taobao\.com/item\.htm\?id=.*")
                    .all());
            page.addTargetRequests(page.getHtml().links()
                    .regex("https://s\.taobao\.com/list.*")
                    .all());
            
            //如果是详情页
            if(page.getUrl().regex("https://item\.taobao\.com/item\.htm\?id=.*").match()) {
                
                Taobao taobao = new Taobao();
                taobao.setId(UuidUtil.getId());
                taobao.setUrl(page.getUrl().toString());
                taobao.setMaintitle(page.getHtml().xpath("//h3[@class='tb-main-title']/text()").get());
                taobao.setSubtitle(page.getHtml().xpath("//p[@class='tb-subtitle']/text()").get());
                taobao.setPrice(page.getHtml().xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()").get());
                taobao.setTaobaoprice(page.getHtml().xpath("//em[@id='J_PromoPriceNum']/text()").get());
                taobao.setRatecounter(page.getHtml().xpath("//strong[@id='J_RateCounter']/text()").get());
                taobao.setSellcounter(page.getHtml().xpath("//strong[@id='J_SellCounter']/text()").get());
                // 把对象存入数据库
                taobaoDao.addTaobao(taobao);
                // 把对象输出控制台
                System.out.println(taobao.toString());
            }
        }
    
        public static void main(String[] args) {
            Spider.create(new TaobaoPageProcessor()).setDownloader(new Downloader() {
                
                @Override
                public void setThread(int threadNum) {
                }
                
                @Override
                public Page download(Request request, Task task) {
                    return GetAjaxHtml.download(request);
                }
            }).addUrl("https://s.taobao.com/list?q=%E5%A4%B9%E5%85%8B&cat=50344007&style=grid&seller_type=taobao").thread(5).run();
        }
    }
    View Code

    Model

    package csdnblog.model;
    
    /**
     * Mutable data holder for one Taobao item record. Every value is kept as
     * the string it was set with; unset fields stay {@code null}.
     */
    public class Taobao {

        private String id;
        private String maintitle;
        private String subtitle;

        // Page URL
        private String url;

        // Price
        private String price;

        // Taobao price
        private String taobaoprice;

        // Cumulative review count
        private String ratecounter;

        // Successful transactions
        private String sellcounter;

        /** Creates a record with every field unset. */
        public Taobao() {
        }

        /** Creates a record with every field set to the given value. */
        public Taobao(String id, String maintitle, String subtitle, String url,
                String price, String taobaoprice, String ratecounter,
                String sellcounter) {
            this.id = id;
            this.url = url;
            this.maintitle = maintitle;
            this.subtitle = subtitle;
            this.price = price;
            this.taobaoprice = taobaoprice;
            this.ratecounter = ratecounter;
            this.sellcounter = sellcounter;
        }

        public String getId() {
            return this.id;
        }

        public void setId(String id) {
            this.id = id;
        }

        public String getUrl() {
            return this.url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public String getMaintitle() {
            return this.maintitle;
        }

        public void setMaintitle(String maintitle) {
            this.maintitle = maintitle;
        }

        public String getSubtitle() {
            return this.subtitle;
        }

        public void setSubtitle(String subtitle) {
            this.subtitle = subtitle;
        }

        public String getPrice() {
            return this.price;
        }

        public void setPrice(String price) {
            this.price = price;
        }

        public String getTaobaoprice() {
            return this.taobaoprice;
        }

        public void setTaobaoprice(String taobaoprice) {
            this.taobaoprice = taobaoprice;
        }

        public String getRatecounter() {
            return this.ratecounter;
        }

        public void setRatecounter(String ratecounter) {
            this.ratecounter = ratecounter;
        }

        public String getSellcounter() {
            return this.sellcounter;
        }

        public void setSellcounter(String sellcounter) {
            this.sellcounter = sellcounter;
        }

        /** Renders all fields as {@code Taobao [name=value, ...]}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("Taobao [");
            text.append("id=").append(id);
            text.append(", maintitle=").append(maintitle);
            text.append(", subtitle=").append(subtitle);
            text.append(", url=").append(url);
            text.append(", price=").append(price);
            text.append(", taobaoprice=").append(taobaoprice);
            text.append(", ratecounter=").append(ratecounter);
            text.append(", sellcounter=").append(sellcounter);
            text.append("]");
            return text.toString();
        }

    }
    View Code
  • 相关阅读:
    前后端分离,如何防止接口被其他人调用或恶意重发
    Session,Token相关区别
    【Spring事务的事务属性】
    【Java基础】一些问题
    【Java基础】重写equals需要重写hashcode
    【算法】哈希算法
    MySQL事务隔离级别
    @InitBinder装配自定义编辑器
    自定义转换器
    数据绑定流程
  • 原文地址:https://www.cnblogs.com/huangjian2/p/6757649.html
Copyright © 2011-2022 走看看