zoukankan      html  css  js  c++  java
  • httpclient

    package com.test.crawler.service;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.protocol.BasicHttpContext;
    import org.apache.http.protocol.HttpContext;
    import org.apache.http.util.EntityUtils;
    import java.io.IOException;
    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    import com.test.db.po.Tb_test_company_info;
    import com.test.crawler.htmlHandler.CompanyDetailHtmlHandler;
    
    /**
     * Downloads the detail page of every company in a list and passes each page to
     * {@link CompanyDetailHtmlHandler} for parsing and saving.
     *
     * <p>The list is processed in consecutive batches of at most {@link #MAX_THREAD_NUM}
     * companies. Each batch runs on its own fixed thread pool with one thread per company, and
     * the calling thread pauses after every batch before starting the next one.
     */
    public class ViewCompanyDetailService {

        /** Largest number of pages fetched at the same time; also the connection pool size. */
        private static final int MAX_THREAD_NUM = 100;

        /** Pause after each finished batch, in milliseconds. */
        private static final long BATCH_PAUSE_MILLIS = 15000L;

        /**
         * Fetches and processes the detail page of every company in {@code companyList}.
         *
         * <p>Returns immediately when the list is {@code null} or empty. Failures of individual
         * page downloads are reported on standard output by the worker and do not stop the run.
         *
         * @param companyList companies whose {@code getCompanyUrl()} page should be fetched
         * @throws InterruptedException if the calling thread is interrupted while waiting for a
         *         batch to finish or while pausing between batches
         */
        public void ViewCompanyDetail(List<Tb_test_company_info> companyList) throws InterruptedException {
            if (companyList == null || companyList.isEmpty()) {
                return;
            }
            PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager();
            connManager.setMaxTotal(MAX_THREAD_NUM);
            // The per-route limit is separate from the total and defaults to a very small value;
            // without raising it, workers that target the same host queue up for a connection.
            connManager.setDefaultMaxPerRoute(MAX_THREAD_NUM);
            CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(connManager).build();
            try {
                // Total number of company records.
                int totalCompanies = companyList.size();
                for (int batchStart = 0; batchStart < totalCompanies; batchStart += MAX_THREAD_NUM) {
                    // Number of worker threads for this batch (the last batch may be smaller).
                    int batchSize = Math.min(MAX_THREAD_NUM, totalCompanies - batchStart);
                    runBatch(httpClient, companyList, batchStart, batchSize);
                    System.out.println(batchSize + " Over !!");
                    Thread.sleep(BATCH_PAUSE_MILLIS);
                }
            } finally {
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        /**
         * Runs one batch: submits one worker per company and blocks until all of them finish.
         *
         * @param httpClient shared client used by every worker
         * @param companyList full company list
         * @param batchStart index of the first company of this batch
         * @param batchSize number of companies in this batch
         * @throws InterruptedException if interrupted while waiting for the workers
         */
        private static void runBatch(CloseableHttpClient httpClient, List<Tb_test_company_info> companyList,
                int batchStart, int batchSize) throws InterruptedException {
            ExecutorService exe = Executors.newFixedThreadPool(batchSize);
            try {
                for (int i = 0; i < batchSize; i++) {
                    Tb_test_company_info company = companyList.get(batchStart + i);
                    HttpGet httpget = new HttpGet(company.getCompanyUrl());
                    exe.execute(new ViewCompanyDetailThread(httpClient, httpget, company.getId()));
                }
                exe.shutdown();
                // Block until every submitted worker has completed instead of polling.
                exe.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
            } finally {
                // Has no effect after a normal finish. If submission failed or the wait was
                // interrupted, this stops the pool so its threads do not outlive the call.
                exe.shutdownNow();
            }
        }

        /**
         * Worker that downloads one company page and hands the body to
         * {@link CompanyDetailHtmlHandler#CompanyInfoParseAndSave(int, String)}.
         *
         * <p>Although it extends {@link Thread}, this file only ever passes it to an executor,
         * which calls {@link #run()} on a pool thread.
         */
        static class ViewCompanyDetailThread extends Thread {
            private final CloseableHttpClient httpClient;
            private final HttpContext context;
            private final HttpGet httpget;
            private final int shopId;

            /**
             * @param httpClient shared client used to execute the request
             * @param httpget request for the company's detail page
             * @param shopId identifier passed through to the HTML handler
             */
            public ViewCompanyDetailThread(CloseableHttpClient httpClient, HttpGet httpget, int shopId) {
                this.httpClient = httpClient;
                // Each worker gets its own execution context.
                this.context = new BasicHttpContext();
                this.httpget = httpget;
                this.shopId = shopId;
            }

            /**
             * Executes the request, decodes the body as UTF-8 and passes it to the handler.
             * Any exception is caught and reported on standard output together with the id.
             */
            @Override
            public void run() {
                try {
                    CloseableHttpResponse response = httpClient.execute(httpget, context);
                    try {
                        HttpEntity entity = response.getEntity();
                        if (entity != null) {
                            String pageContent = EntityUtils.toString(entity, "UTF-8");
                            CompanyDetailHtmlHandler companyDetailHtmlHandler = new CompanyDetailHtmlHandler();
                            if (!companyDetailHtmlHandler.CompanyInfoParseAndSave(shopId, pageContent)) {
                                System.out.println(shopId + " - CompanyInfoParseAndSave Failure");
                            }
                        }
                    } finally {
                        // Always release the connection back to the pool.
                        response.close();
                    }
                } catch (Exception e) {
                    System.out.println(shopId + " - error: " + e);
                }
            }
        }
    }
    package com.test.crawler.htmlHandler;
    
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.select.Elements;
    
    import com.test.db.dao.CompanyInfoDao;
    
    public class CompanyDetailHtmlHandler {
        
        /**
         * 公司信息解析并且保存
         * @param shopId
         * @param pageContent
         * @return
         */
        public synchronized boolean CompanyInfoParseAndSave(int shopId,String pageContent){
            if(shopId<=0 || pageContent == null){return false;}
            Document doc = Jsoup.parse(pageContent);
            String CompanyPhone = "";
            String CompanyBoss = "";
            String CompanyMobil = "";
            String CompanyAddr = "";
            String QQ = "";
            String Jyms = "";
            String createDatetime = "";
            Elements eleContents = doc.select("省略...");
            if(eleContents!=null && eleContents.size() >0 ){
                CompanyBoss = eleContents.first().select("省略...").first().text();
                try{
                    String qqHref = eleContents.first().select("省略...").first().attr("href");
                    Pattern p = Pattern.compile("http://wpa.qq.com/msgrd\?v=3\&uin=(\d*?)\&site=qq\&menu=yes");
                    Matcher m = p.matcher(qqHref);
                    if(m.find()) {
                        QQ = m.group(1);
                    }
                }catch(Exception e){}
                try{
                    Jyms = eleContents.first().select("省略...").get(0).text();
                    CompanyAddr = eleContents.first().select("省略...").get(1).text();
                    createDatetime = eleContents.first().select("省略...").get(2).text();
                }catch(Exception e){}
            }
            Elements eleContents2 = doc.select("div.wp-colsub div.wp-mdl div.wp-contact ul.contact-lst");
            if(eleContents2!=null && eleContents2.size() >0 ){
                try{
                    String regEx="[^0-9]";   
                    Pattern p = Pattern.compile(regEx);   
                    Matcher m = p.matcher(eleContents2.select("li").get(1).text());   
                    CompanyMobil = m.replaceAll("").trim();
                    String regEx2="[^0-9\-]";   
                    Pattern p2 = Pattern.compile(regEx2);   
                    Matcher m2 = p2.matcher(eleContents2.select("li").get(2).text());   
                    CompanyPhone = m2.replaceAll("").trim();
                }catch(Exception e){}
            }
            CompanyInfoDao dao = new CompanyInfoDao();
            if(CompanyBoss==null||"".equals(CompanyBoss.trim())){CompanyBoss="-";}
            return dao.Update(shopId, CompanyPhone, CompanyBoss, CompanyMobil, CompanyAddr, QQ, Jyms, createDatetime);
        }
    }
    package com.test.crawler.main;
    
    import com.test.crawler.service.ViewCompanyDetailService;
    import com.test.db.dao.CompanyInfoDao;
    
    /**
     * Command-line entry point: loads the companies that still need their detail page fetched and
     * runs {@link ViewCompanyDetailService} over them.
     */
    public class TestMain {

        /**
         * Configures logging, then fetches the detail pages for the list returned by
         * {@code CompanyInfoDao.ListForViewDetail()}. Any failure is printed as a stack trace.
         *
         * @param args unused
         */
        public static void main(String[] args) {

            // Turn off HttpClient's excess logging. The HttpClient classes used by the service
            // live under org.apache.http, so that is the logger category that must be limited,
            // and the value has to be a SimpleLog level name.
            System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.SimpleLog");
            System.setProperty("org.apache.commons.logging.simplelog.showdatetime", "true");
            System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.http", "warn");

            try {
                ViewCompanyDetailService ss = new ViewCompanyDetailService();
                CompanyInfoDao dao = new CompanyInfoDao();
                ss.ViewCompanyDetail(dao.ListForViewDetail());
            } catch (InterruptedException ex) {
                // Restore the interrupt flag so the interruption is not lost.
                Thread.currentThread().interrupt();
                ex.printStackTrace();
            } catch (Exception ex) {
                ex.printStackTrace();
            }
        }

    }

    Over

  • 相关阅读:
    Post提交和Get提交的区别
    Servlet 生命周期
    MVC
    HDU 5033 Building (维护单调栈)
    2014 ACM/ICPC Asia Regional Xi'an Online(HDU 5007 ~ HDU 5017)
    HDU 1026 Ignatius and the Princess I (BFS)
    URAL 1183 Brackets Sequence(DP)
    POJ 3384 Feng Shui(半平面交向内推进求最远点对)
    POJ 3525 Most Distant Point from the Sea (半平面交向内推进+二分半径)
    POJ 1279 Art Gallery(半平面交求多边形核的面积)
  • 原文地址:https://www.cnblogs.com/abinxm/p/4929629.html
Copyright © 2011-2022 走看看