zoukankan      html  css  js  c++  java
  • a code snip

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import lombok.AllArgsConstructor;
    import lombok.Data;
    import lombok.NoArgsConstructor;
    import lombok.extern.slf4j.Slf4j;
    
    import org.apache.commons.lang3.StringEscapeUtils;
    import org.apache.commons.lang3.StringUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.creditcloud.brick.task.CrawlTask;
    import com.creditcloud.brick.task.Extractor;
    import com.creditcloud.brick.task.Field;
    
    @NoArgsConstructor
    @AllArgsConstructor
    @Data
    @Slf4j
    public class DataCrawler {
        String                 url;
        String                 space;
        
        public HashMap<String, Object> doCrawl(CrawlTask task ) {
            HashMap<String, Object> result = new HashMap<String, Object>();
            this.url = task.getUrl();
            this.space = task.getId();
            //
            String content=this.doGet(url);
            if( StringUtils.isNotEmpty(content)) {
                Extractor actor=task.getExtractor();
                if( actor != null )
                    this.parse(actor, content, result);
            }
            return result;
        }
        
        public String doGet( String url ) {
            String data=null;
            //return Jsoup.connect(url).userAgent("Mozilla").get();
            CloseableHttpClient httpclient = HttpClients.createDefault();
            HttpGet hGet = new HttpGet(url);
            log.info(url);
            CloseableHttpResponse response = null;
            try {
                response = httpclient.execute(hGet);
                HttpEntity entity = response.getEntity();    
                System.out.println(response.getStatusLine());
                log.info( response.getStatusLine().toString() );
                //
                if (entity != null) {  
                    System.out.println("Response content length: " + entity.getContentLength());
                    data = EntityUtils.toString(entity);
                    System.out.println(data);  
                    EntityUtils.consume(entity);
                    
                }
            }
            catch(Exception e){
                log.error(e.getMessage());
            }
            //response
            try 
            {  
                if( response != null )
                    response.close();  
            }
            catch(Exception e) {
                log.error(e.getMessage());
            }
            //httpclient
            try {
                if( httpclient != null )
                    httpclient.close();
            } catch (Exception e) {
                log.error(e.getMessage());
            }
    
            return data;
        }
        
        public String removeHtmlLabel( String input ) {
            return input.replaceAll("<[^>]+>", "").replaceAll("&nbsp;"," ").trim();
        }
        
        //
        public ArrayList<String> match( Extractor extractor, String input ) {
            ArrayList<String> result = new ArrayList<String>();
            switch (extractor.getType()) {
            case css:    //call css
            {
                Document doc = Jsoup.parse(input);
                Elements elems = doc.select(extractor.getPattern());
                for( Element elem:elems ) {
                    result.add( elem.toString() );
                }
            }
            break;
            case regex: //call regex
            {
                Pattern p = Pattern.compile( extractor.getPattern());
                Matcher m = p.matcher( input );
                String matchValue = null;
                while(m.find()) {
                    matchValue = StringEscapeUtils.unescapeHtml4( m.group());
                    result.add(matchValue);
                }
            }
            break;
            case empty:
                result.add(input);
                break;
            }
            return result;
        }
        
        public void parse( Extractor extractor, String input, HashMap<String, Object> result ) {
            //1. match by css or regex
            ArrayList<String> strlist = this.match(extractor, input);
            if( strlist.isEmpty() ) {
                //result.put( extractor.getId(), null);
                return;
            }
            //2. call children extractors
            switch(extractor.getData()) {
            case array:{
                //result.setType(ResultDataType.array);
                ArrayList<HashMap<String, Object>> list = new ArrayList<HashMap<String, Object>>();
                for( String str:strlist ) {
                    HashMap<String, Object> childResult = new HashMap<String, Object>();
                    for( Extractor one:extractor.getChildren()) {
                        this.parse(one, str, childResult);
                    }
                    if( childResult.isEmpty() == false )
                        list.add(childResult);
                }
                if(list.isEmpty() == false )
                    result.put( extractor.getId(), list );
            }
            break;
            case field:{
                for(Field fd:extractor.getFields()) {
                    String val=strlist.get( fd.getIndex() );
                    result.put( fd.getName(), this.removeHtmlLabel(val) );
                }
            }
            break;
            case none: {
                for( String str:strlist ) {
                    for( Extractor one:extractor.getChildren()) {
                        this.parse(one, str, result);
                    }
                }
            }
            break;
            }
        }
    }
  • 相关阅读:
    jQuery live事件说明及移除live事件方法
    Jquery的html方法里包含特殊字符的处理
    mysql创建定时任务
    MySQL内置函数获取几天前的日期
    实战mysql分区
    TCP的TIME_WAIT状态
    openssl生成SSL证书的流程
    mysql备份的三种方式详解
    mysql创建唯一索引
    MYSQL双机热备份的配置实施(问题总结)
  • 原文地址:https://www.cnblogs.com/feika/p/4505434.html
Copyright © 2011-2022 走看看