========================================================= 没办法附件上传不了,AcquisitionSvcImpl.java类:
//----------------------------------------------------------------------------
package com.jeecms.cms.service;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.cms.entity.main.Content;
import com.jeecms.cms.manager.assist.CmsAcquisitionMng;

/**
 * Acquisition (page scraping) service.
 *
 * <p>
 * For every configured list page ("plan") it downloads the page, narrows the
 * HTML with the link-set rule, extracts article links with the link rule,
 * downloads each article and stores the extracted title and body through
 * {@link CmsAcquisitionMng}. In this implementation the link-set, link, title
 * and content rules are compiled with {@link Pattern#compile(String)}, i.e.
 * they are regular expressions; the corresponding "end" rules are accepted
 * but not used.
 */
@Service
public class AcquisitionSvcImpl implements AcquisitionSvc {
	private static final Logger log = LoggerFactory
			.getLogger(AcquisitionSvcImpl.class);

	/**
	 * Starts the acquisition with the given id on a new background thread.
	 *
	 * @param id
	 *            acquisition id
	 * @return {@code false} when no acquisition with that id exists or its
	 *         status is already {@code CmsAcquisition.START}; {@code true}
	 *         once the worker thread has been started
	 */
	public boolean start(Integer id) {
		CmsAcquisition acqu = cmsAcquisitionMng.findById(id);
		if (acqu == null || acqu.getStatus() == CmsAcquisition.START) {
			return false;
		}
		Thread thread = new AcquisitionThread(acqu);
		thread.start();
		return true;
	}

	private CmsAcquisitionMng cmsAcquisitionMng;

	@Autowired
	public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
		this.cmsAcquisitionMng = cmsAcquisitionMng;
	}

	/**
	 * Worker thread that executes one acquisition from its current resume
	 * position down to the first plan / first link.
	 */
	private class AcquisitionThread extends Thread {
		private CmsAcquisition acqu;

		public AcquisitionThread(CmsAcquisition acqu) {
			super(acqu.getClass().getName() + "#" + acqu.getId());
			this.acqu = acqu;
		}

		@Override
		public void run() {
			if (acqu == null) {
				return;
			}
			Integer requestedId = acqu.getId();
			acqu = cmsAcquisitionMng.start(requestedId);
			// NOTE(review): start(...) is declared elsewhere; guard in case it
			// can return null (e.g. the record no longer exists) - confirm.
			if (acqu == null) {
				log.warn("Acquisition#{} could not be started", requestedId);
				return;
			}
			String[] plans = acqu.getAllPlans();
			HttpClient client = new DefaultHttpClient();
			// The crawl itself is not cancellable; remember an interrupt that
			// arrives during a pause and restore it before the thread exits.
			boolean interrupted = false;
			try {
				CharsetHandler handler = new CharsetHandler(acqu
						.getPageEncoding());
				int currNum = acqu.getCurrNum();
				int currItem = acqu.getCurrItem();
				Integer acquId = acqu.getId();
				// Progress is reported below as (length - index), so the resume
				// counters are used as 1-based positions. Clamp the start index
				// so a counter below 1 cannot index past the end.
				int firstPlan = Math.min(plans.length - currNum,
						plans.length - 1);
				for (int i = firstPlan; i >= 0; i--) {
					String url = plans[i];
					List<String> contentList = getContentList(client, handler,
							url, acqu.getLinksetStart(), acqu.getLinksetEnd(),
							acqu.getLinkStart(), acqu.getLinkEnd());
					if (contentList != null) {
						int total = contentList.size();
						int firstItem = Math.min(total - currItem, total - 1);
						for (int j = firstItem; j >= 0; j--) {
							// Let the manager decide whether the run has to
							// stop at this position.
							if (cmsAcquisitionMng.isNeedBreak(acqu.getId(),
									plans.length - i, total - j, total)) {
								log.info("Acquisition#{} breaked", acqu
										.getId());
								return;
							}
							if (acqu.getPauseTime() > 0) {
								try {
									Thread.sleep(acqu.getPauseTime());
								} catch (InterruptedException e) {
									interrupted = true;
									log.warn("Acquisition#" + acquId
											+ " pause interrupted", e);
								}
							}
							String link = contentList.get(j);
							saveContent(client, handler, acquId, link, acqu
									.getTitleStart(), acqu.getTitleEnd(), acqu
									.getContentStart(), acqu.getContentEnd());
						}
					}
					// Only the first processed plan resumes mid-list; every
					// following plan starts from its last link.
					currItem = 1;
				}
			} finally {
				// Always release the connections, also when the crawl aborts
				// with an unexpected exception.
				client.getConnectionManager().shutdown();
				if (interrupted) {
					Thread.currentThread().interrupt();
				}
			}
			cmsAcquisitionMng.end(acqu.getId());
			log.info("Acquisition#{} complete", acqu.getId());
		}

		/**
		 * Downloads a list page and returns the article links found in it.
		 *
		 * @param client
		 *            HTTP client used for the request
		 * @param handler
		 *            response handler that decodes the body
		 * @param url
		 *            list page address
		 * @param linksetStart
		 *            regular expression selecting the part of the page that
		 *            holds the links; when it does not match, the whole page
		 *            is searched
		 * @param linksetEnd
		 *            not used by this implementation
		 * @param linkStart
		 *            regular expression whose first capturing group is the
		 *            link
		 * @param linkEnd
		 *            not used by this implementation
		 * @return the links in page order; empty when the page could not be
		 *         fetched or parsed
		 */
		private List<String> getContentList(HttpClient client,
				CharsetHandler handler, String url, String linksetStart,
				String linksetEnd, String linkStart, String linkEnd) {
			List<String> list = new ArrayList<String>();
			try {
				HttpGet httpget = new HttpGet(new URI(url));
				String html = client.execute(httpget, handler);
				if (html == null) {
					// The handler returns null for a response without entity.
					log.warn("Empty response for list page {}", url);
					return list;
				}
				Matcher m = Pattern.compile(linksetStart.trim()).matcher(html);
				if (m.find()) {
					html = m.group();
				}
				list = getUrlsList(html, linkStart);
			} catch (Exception e) {
				log.warn("Failed to collect links from " + url, e);
			}
			return list;
		}

		/**
		 * Extracts the link addresses from a piece of HTML.
		 *
		 * @param html
		 *            HTML to search
		 * @param linkStart
		 *            regular expression; capturing group 1 of every match is
		 *            taken as a link
		 * @return all non-empty links in order of appearance
		 */
		private List<String> getUrlsList(String html, String linkStart) {
			List<String> list = new ArrayList<String>();
			Matcher m = Pattern.compile(linkStart).matcher(html);
			while (m.find()) {
				String link = m.group(1);
				if (StringUtils.isNotEmpty(link)) {
					list.add(link);
				}
			}
			return list;
		}

		/**
		 * Downloads one article page, extracts title and body and stores them.
		 *
		 * @param client
		 *            HTTP client used for the request
		 * @param handler
		 *            response handler that decodes the body
		 * @param acquId
		 *            id of the acquisition the content belongs to
		 * @param url
		 *            article address
		 * @param titleStart
		 *            regular expression whose first capturing group is the
		 *            title
		 * @param titleEnd
		 *            not used by this implementation
		 * @param contentStart
		 *            regular expression whose complete match is the body
		 * @param contentEnd
		 *            not used by this implementation
		 * @return the value returned by the manager's {@code saveContent}, or
		 *         {@code null} when the page could not be fetched or processed
		 */
		private Content saveContent(HttpClient client, CharsetHandler handler,
				Integer acquId, String url, String titleStart,
				String titleEnd, String contentStart, String contentEnd) {
			try {
				HttpGet httpget = new HttpGet(new URI(url));
				String html = client.execute(httpget, handler);
				if (html == null) {
					// The handler returns null for a response without entity.
					log.warn("Empty response for content page {}", url);
					return null;
				}
				String title = "";
				Matcher mt = Pattern.compile(titleStart.trim()).matcher(html);
				if (mt.find()) {
					title = mt.group(1);
				}
				String txt = "";
				mt = Pattern.compile(contentStart.trim()).matcher(html);
				if (mt.find()) {
					txt = mt.group();
				}
				return cmsAcquisitionMng.saveContent(title, txt, acquId);
			} catch (Exception e) {
				log.warn("Failed to acquire content from " + url, e);
				return null;
			}
		}
	}

	/**
	 * Response handler that returns the response body as a string, decoded
	 * with the configured charset when one is given.
	 */
	private static class CharsetHandler implements ResponseHandler<String> {
		private final String charset;

		public CharsetHandler(String charset) {
			this.charset = charset;
		}

		/**
		 * @return the decoded body, or {@code null} when the response has no
		 *         entity
		 * @throws HttpResponseException
		 *             when the status code is 300 or greater
		 */
		public String handleResponse(HttpResponse response)
				throws ClientProtocolException, IOException {
			StatusLine statusLine = response.getStatusLine();
			if (statusLine.getStatusCode() >= 300) {
				throw new HttpResponseException(statusLine.getStatusCode(),
						statusLine.getReasonPhrase());
			}
			HttpEntity entity = response.getEntity();
			if (entity == null) {
				return null;
			}
			if (!StringUtils.isBlank(charset)) {
				return EntityUtils.toString(entity, charset);
			}
			return EntityUtils.toString(entity);
		}
	}
}
//--------------------------------------------------------------------------------
1:用 AcquisitionSvcImpl.java 替换原工程项目 com.jeecms.cms.service 包下的对应文件。
2:编译工程即可
3:登录后台配置相关规则,参数如下所示:
====================================
*采集名称: 韩寒博客
*页面编码: UTF-8
动态地址: http://blog.sina.com.cn/s/articlelist_1191258123_0_[page].html
页码 从 1 到: 2
内容地址集: <!-- 列表 START -->.*?<!-- 列表END -->
内容地址: target="_blank" href="(.*?)">(.*?)</a></span>
标题: <title>(.*?)_韩寒_新浪博客</title>
内容: <!-- 正文开始 -->(.*?)<!-- 正文结束 -->