zoukankan      html  css  js  c++  java
  • JAVA实现网页抓取(htmlunit)

    准备条件

    加入依赖jar包

    <dependency>
         <groupId>net.sourceforge.htmlunit</groupId>
          <artifactId>htmlunit</artifactId>
          <version>2.15</version>
    </dependency>


    代码示例

    private WebClient initWc() throws IOException {
        WebClient wc = new WebClient(BrowserVersion.CHROME);
        wc.getOptions().setJavaScriptEnabled(false);
        wc.getOptions().setCssEnabled(false);
        wc.getOptions().setTimeout(8000);
        wc.setJavaScriptTimeout(8000);
        wc.setAjaxController(new NicelyResynchronizingAjaxController());
        wc.waitForBackgroundJavaScript(8000);
    //        Cache cache=new Cache();
    //        wc.setCache(cache);
        wc.getOptions().setThrowExceptionOnScriptError(false);
    //        wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
        return wc;
    }

    public void loadData() {
      WebClient wc = null;

        if ( wc == null ) {
            try {
                wc = initWc();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        try {
            //图片中文字解析时使用
            IIORegistry registry = IIORegistry.getDefaultInstance();  
            registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageWriterSpi());  
            registry.registerServiceProvider(new com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi());  

            StringBuffer errPage =new StringBuffer();
            for(int i =1 ; i<=97;i++){
                loadPage(i,errPage,wc);
                riskCompanyDao.flush();
            }
            log.info("errPage:"+errPage);
    //            loadPage(27,errPage,wc);
        } catch (Exception e) {
            log.warn("loadData error! ", e);
        } finally {
            wc.closeAllWindows();
        }
    }

    private void loadPage(int pageNo,StringBuffer errPage, WebClient wc){
       HtmlPage page;
        try {
            String refer="http://www.baidu.com/";
            URL link=new URL("http://www.kstba.org/minglu-79-"+pageNo+".html");
            WebRequest request=new WebRequest(link);
            request.setCharset("UTF-8");
            request.setAdditionalHeader("Referer", refer);//设置请求报文头里的refer字段
            设置请求报文头里的User-Agent字段
            request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36");
            request.setAdditionalHeader("Connection", "keep-alive");
            request.setAdditionalHeader("Cookie", "ad_play_index=47; CNZZDATA1000215585=2014872656-1449554771-%7C1449572770");

            page = wc.getPage(request);

            HtmlPage pageResult = page;
            HtmlTable tableResult = (HtmlTable) pageResult.getElementsByTagName("table").get(0);
            HtmlTableBody body = (HtmlTableBody) tableResult.getChildNodes().get(1);
            int indexRow = 0;
            for ( DomNode node2 : body.getChildNodes() ) {

                if (node2 instanceof  HtmlTableRow ) {
                    HtmlTableRow row = (HtmlTableRow) node2;
                    List<HtmlTableCell> cells = row.getCells();
                    HtmlTableCell cell0=cells.get(0);
                    String companyName = cell0.getElementsByTagName("a").get(0).getTextContent();
                    String industryName = cell0.getElementsByTagName("div").get(0).getTextContent();
                    industryName = industryName.split(":")[1];
                    String addr = cell0.getElementsByTagName("div").get(1).getTextContent();
                    if (addr.split(":").length>1){
                        addr = addr.split(":")[1];
                    }else{
                        addr=null;
                    }
                    String mobile =null;
                    if (cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").size()>0){
                        HtmlImage img =(HtmlImage)cell0.getElementsByTagName("div").get(2).getElementsByTagName("img").get(0);
                        String imgStr =img.getAttribute("src");
                        imgStr =imgStr.substring(0,imgStr.indexOf("&font=")).replace("fontsize=12", "fontsize=22");
                        mobile = ImageRead.getImgStr(imgStr);
                        log.info("mobile:"+mobile);
                    }
                   
                }
                indexRow++;
            }

        } catch (Exception e) {
            errPage.append(pageNo).append(",");
            log.warn("page error :"+pageNo,e);
        }

    }


    注意事项

        普通的httpConnection容易被拦截,需设置请求报文头,模拟浏览器请求
        WebClient在请求发起前初始化一次即可
        不同浏览器版本返回的HTML代码有一定差异,需单独调试

  • 相关阅读:
    WebStorm2019
    微信公众号互阅平台-真实提高阅读量-「作者加鸡腿」
    macos 致命错误: 在类路径或引导类路径中找不到程序包 java.lang
    IDEA2020激活码 / IDEA 2020.1.2激活破解教程
    Linux命令大全
    2019年终总结-2020展望「定版」
    SpringBoot如何切换Redis默认库
    uniapp增加百度统计代码(h5)
    修改MyEclipse/Eclipse左侧文字大小(MacOS/Windows)
    Invalid connection string format, a valid format is: "host:port:sid"
  • 原文地址:https://www.cnblogs.com/cuihongyu3503319/p/15047161.html
Copyright © 2011-2022 走看看