zoukankan      html  css  js  c++  java
  • jeecms 强大的采集功能优化 转载 https://blog.csdn.net/jeff06143132/article/details/7099003

    ========================================================= 
     
    由于附件无法上传,这里直接贴出 AcquisitionSvcImpl.java 类的完整代码: 
    //---------------------------------------------------------------------------- 
    package com.jeecms.cms.service; 
     
    import java.io.IOException; 
    import java.net.URI; 
    import java.util.ArrayList; 
    import java.util.List; 
    import java.util.regex.Matcher; 
    import java.util.regex.Pattern; 
    import org.apache.commons.lang.StringUtils; 
    import org.apache.http.HttpEntity; 
    import org.apache.http.HttpResponse; 
    import org.apache.http.StatusLine; 
    import org.apache.http.client.ClientProtocolException; 
    import org.apache.http.client.HttpClient; 
    import org.apache.http.client.HttpResponseException; 
    import org.apache.http.client.ResponseHandler; 
    import org.apache.http.client.methods.HttpGet; 
    import org.apache.http.impl.client.DefaultHttpClient; 
    import org.apache.http.util.EntityUtils; 
    import org.slf4j.Logger; 
    import org.slf4j.LoggerFactory; 
    import org.springframework.beans.factory.annotation.Autowired; 
    import org.springframework.stereotype.Service; 
    import com.jeecms.cms.entity.assist.CmsAcquisition; 
    import com.jeecms.cms.entity.main.Content; 
    import com.jeecms.cms.manager.assist.CmsAcquisitionMng; 
     
    @Service 
    public class AcquisitionSvcImpl implements AcquisitionSvc { 
    private Logger log = LoggerFactory.getLogger(AcquisitionSvcImpl.class); 
     
    public boolean start(Integer id) {  
    CmsAcquisition acqu = cmsAcquisitionMng.findById(id); 
    if (acqu == null || acqu.getStatus() == CmsAcquisition.START) { 
    return false; 
    } 
    Thread thread = new AcquisitionThread(acqu); 
    thread.start(); 
    return true; 
    } 
     
    private CmsAcquisitionMng cmsAcquisitionMng; 
     
    @Autowired 
    public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) { 
    this.cmsAcquisitionMng = cmsAcquisitionMng; 
    } 
     
    private class AcquisitionThread extends Thread { 
    private CmsAcquisition acqu; 
     
    public AcquisitionThread(CmsAcquisition acqu) {  
    super(acqu.getClass().getName() + "#" + acqu.getId()); 
    this.acqu = acqu; 
    }  
     
    @Override 
    public void run() { 
    if (acqu == null) { 
    return; 
    } 
    acqu = cmsAcquisitionMng.start(acqu.getId()); 
    String[] plans = acqu.getAllPlans(); 
    HttpClient client = new DefaultHttpClient(); 
    CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding()); 
    List<String> contentList; 
    String url; 
    int currNum = acqu.getCurrNum(); 
    int currItem = acqu.getCurrItem(); 
    Integer acquId = acqu.getId(); 
     
    for (int i = plans.length - currNum; i >= 0; i--)  
    { 
    url = plans[i]; 
     
    contentList = getContentList(client, handler, url, acqu.getLinksetStart(), acqu.getLinksetEnd(), acqu.getLinkStart(), acqu.getLinkEnd()); 
     
    String link; 
     
    if(contentList!=null) 
    { 
    for (int j = contentList.size() - currItem; j >= 0; j--)  
    { 
    if (cmsAcquisitionMng.isNeedBreak(acqu.getId(), plans.length - i, contentList.size() - j, contentList.size()))  
    { 
    client.getConnectionManager().shutdown(); 
    log.info("Acquisition#{} breaked", acqu.getId()); 
    return; 
    } 
    if (acqu.getPauseTime() > 0)  
    { 
    try  
    { 
    Thread.sleep(acqu.getPauseTime()); 
    }  
    catch (InterruptedException e)  
    { 
    log.warn("", e); 
    } 
    } 
    link = contentList.get(j); 
    saveContent(client, handler, acquId, link, acqu.getTitleStart(), acqu.getTitleEnd(), acqu.getContentStart(), acqu.getContentEnd()); 
    } 
    } 
    currItem = 1; 
    } 
    client.getConnectionManager().shutdown(); 
    cmsAcquisitionMng.end(acqu.getId()); 
    log.info("Acquisition#{} complete", acqu.getId()); 
    } 
     
     
    private List<String> getContentList(HttpClient client, 
    CharsetHandler handler, String url, String linksetStart, 
    String linksetEnd, String linkStart, String linkEnd) { 
     
    List<String> list = new ArrayList<String>(); 
     
    try  
    { 
    HttpGet httpget = new HttpGet(new URI(url)); 
    String html = client.execute(httpget, handler); 
     
    Pattern pt = Pattern.compile(linksetStart.trim()); 
        Matcher m = pt.matcher(html); 
         
        if(m.find()) 
        { 
         html = m.group(); 
        } 
         
        if(html!=null)  
        { 
         list = getUrlsList(html,linkStart); 
        } 
         
    }  
    catch (Exception e)   
    { 
    log.warn(null, e); 
    } 
    return list; 
    } 
     
    /** 
     * 得到地址集 
     *  
     * @param html 
     * @param linkStart 
     * @return 
     */ 
    private List<String> getUrlsList(String html,String linkStart) 
    { 
    List<String> list = new ArrayList<String>(); 
     
         Pattern pt = Pattern.compile(linkStart); 
         
         Matcher m = pt.matcher(html); 
         
         while(m.find()) 
         { 
         String link = m.group(1); 
         
         if(null!=link && !"".equals(link)) 
         { 
         //System.out.println("url : " + link); 
         list.add(link); 
         } 
         } 
         return list; 
    } 
     
    private Content saveContent(HttpClient client, CharsetHandler handler, 
    Integer acquId, String url, String titleStart, String titleEnd, 
    String contentStart, String contentEnd) { 
     
    try { 
     
    HttpGet httpget = new HttpGet(new URI(url)); 
    String html = client.execute(httpget, handler); 
     
    String title = ""; 
    Pattern pt = Pattern.compile(titleStart.trim()); 
    Matcher mt = pt.matcher(html); 
     
    if (mt.find())  
    { 
    title = mt.group(1); 
    //System.out.println("title : " + title); 
    } 
     
    String txt = ""; 
    pt = Pattern.compile(contentStart.trim()); 
    mt = pt.matcher(html); 
    if(mt.find()){ 
    txt = mt.group(); 
    //System.out.println("txt : " + txt); 
    } 
     
    return cmsAcquisitionMng.saveContent(title, txt, acquId); 
     
    }  
    catch (Exception e)  
    { 
    log.warn(null, e);  
    e.printStackTrace(); 
    return null; 
    } 
    } 
    } 
     
    private class CharsetHandler implements ResponseHandler<String> { 
    private String charset; 
     
    public CharsetHandler(String charset) { 
    this.charset = charset; 
    } 
     
    public String handleResponse(HttpResponse response) 
    throws ClientProtocolException, IOException { 
    StatusLine statusLine = response.getStatusLine(); 
    if (statusLine.getStatusCode() >= 300) { 
    throw new HttpResponseException(statusLine.getStatusCode(), 
    statusLine.getReasonPhrase()); 
    } 
    HttpEntity entity = response.getEntity(); 
    if (entity != null) { 
    if (!StringUtils.isBlank(charset)) { 
    return EntityUtils.toString(entity, charset); 
    } else { 
    return EntityUtils.toString(entity); 
    } 
    } else { 
    return null; 
    } 
    } 
    } 
    } 
    //-------------------------------------------------------------------------------- 

    1:将AcquisitionSvcImpl.java 替换原工程项目com.jeecms.cms.service包下的对应文件。 

    2:编译工程即可 

    3:登录后台配置相关规则,参数示例如下: 

    ==================================== 
    *采集名称: 韩寒博客 

    *页面编码: UTF-8 

      动态地址: http://blog.sina.com.cn/s/articlelist_1191258123_0_[page].html 

                            页码 从   1  到:  2 

    内容地址集:   <!-- 列表 START -->.*?<!-- 列表END --> 

    内容地址: target="_blank" href="(.*?)">(.*?)</a></span> 

    标题:         <title>(.*?)_韩寒_新浪博客</title> 

    内容:         <!-- 正文开始 -->(.*?)<!-- 正文结束 -->

  • 相关阅读:
    【POJ 3525】Most Distant Point from the Sea(直线平移、半平面交)
    【HDU 4940】Destroy Transportation system(无源无汇带上下界可行流)
    codevs 5962 [SDOI2017]数字表格
    【NOIP2016】天天爱跑步
    [2011WorldFinal]Chips Challenge[流量平衡]
    [Ahoi2014]支线剧情[无源汇有下界最小费用可行流]
    [NOI2008] 志愿者招募[流量平衡]
    [Wc2007]剪刀石头布[补集转化+拆边]
    poj3281 Dining[最大流]
    1458: 士兵占领[最大流]
  • 原文地址:https://www.cnblogs.com/Jeely/p/11196092.html
Copyright © 2011-2022 走看看