zoukankan      html  css  js  c++  java
  • 定时爬虫系统(以爬取[百度7日关注]为例)

    1、web.xml加载servlet

     <?xml version="1.0" encoding="UTF-8"?>
     <!-- Servlet 2.5 deployment descriptor for the TaskSchedule web application. -->
     <web-app xmlns="http://java.sun.com/xml/ns/javaee"
              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
              xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
              id="WebApp_ID"
              version="2.5">
         <display-name>TaskSchedule</display-name>

         <!-- Candidate welcome pages, tried in the order listed. -->
         <welcome-file-list>
             <welcome-file>index.html</welcome-file>
             <welcome-file>index.htm</welcome-file>
             <welcome-file>index.jsp</welcome-file>
             <welcome-file>default.html</welcome-file>
             <welcome-file>default.htm</welcome-file>
             <welcome-file>default.jsp</welcome-file>
         </welcome-file-list>

         <!--
             No servlet-mapping is declared for this servlet: it is not meant to serve
             requests. load-on-startup makes the container instantiate it and call
             init() when the application is deployed.
         -->
         <servlet>
             <servlet-name>hotword</servlet-name>
             <servlet-class>com.richinfo.asynctask.servlet.TaskScheduleServlet</servlet-class>
             <load-on-startup>1</load-on-startup>
         </servlet>
     </web-app>

    2、TaskScheduleServlet初始化init

     1 public class TaskScheduleServlet extends HttpServlet{
     2     private static final Log logger = LogFactory.getLog(TaskScheduleServlet.class);
     3     private static final long serialVersionUID = 9089148097823231232L;
     4     
     5     @Override
     6     public void init() throws ServletException {
     7         super.init();
     8         logger.info("TaskScheduleServlet init!");
     9         System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
    10         System.setProperty("sun.net.client.defaultReadTimeout", "30000");
    11         TaskRegister.getInstance().start();
    12         printLocalHost();
    13         logger.info("TaskScheduleServlet start!");
    14     }
    15     private void printLocalHost() {
    16         try {
    17             InetAddress local = InetAddress.getLocalHost();
    18             String ip = local.getHostAddress();
    19             String host = local.getHostName();
    20             logger.info("服务器信息:ip=" + ip + ", host=" + host);
    21         } catch (UnknownHostException e) {
    22             logger.error(e.getMessage());
    23         }
    24     }
    25 
    26     @Override
    27     public void destroy() {
    28         super.destroy();
    29         TaskRegister.getInstance().shutdown();
    30         logger.info("TaskScheduleServlet destroy!");
    31     }
    32 }

    3、TaskRegister.getInstance().start()

    4、定时任务的设置,这里不再赘述(有兴趣可以参考我的另一篇有关定时任务配置的文章:http://www.cnblogs.com/zhuziyu/p/7704661.html )

    5、爬虫具体实现

    (1)调用任务

     1 public class BaiduDayHotFocusGrabJob implements Job{
     2     private final static Log logger = LogFactory.getLog(BaiduDayHotFocusGrabJob.class);
     3     private UnifiedPosMgrPlatService service = new UnifiedPosMgrPlatSeviceImpl();
     4     @Override
     5     public void execute(JobExecutionContext context)
     6             throws JobExecutionException {
     7         JobKey key = context.getJobDetail().getKey();
     8         logger.info("开始任务["+key.getGroup()+"."+key.getName()+"]");
     9         
    10         try {
    11             resolve();
    12         } catch (Exception e) {
    13             logger.error(Tools.getStackInfo(e));
    14         }
    15     }
    16     
    17     public void resolve() throws IOException {
    18         Document doc = Jsoup.connect("http://top.baidu.com/").timeout(10000).get();
    19         Element attentionDiv = doc.select("#main div.tab-bd div.tab-box:eq(1)").first();
    20         Elements attentionEles = attentionDiv.select("ul li a.list-title");
    21         // 七日关注
    22         
    23         List<HotSpot> attentionList = new ArrayList<HotSpot>();
    24         int len = attentionEles.size();
    25         for (int i = 0; i < len; i++) {
    26             Element ele = attentionEles.get(i);
    27             String name = ele.text().trim();
    28             String uri = ele.attr("href").trim();
    29             attentionList.add(new HotSpot(Source.BAIDU_TOP, uri, name, i));
    30         }
    31         HotFocus focus = new HotFocus(Type.DAILY, Source.BAIDU_TOP, attentionList);
    32         int ret = service.addHotFocus(focus);
    33         if (ret != 1) {
    34             logger.warn("更新七日关注失败!");
    35         }
    36         else {
    37             logger.info("更新七日关注成功!");
    38         }
    39     }
    40 }

    (2)具体实现

     1 public class UnifiedPosMgrPlatSeviceImpl implements UnifiedPosMgrPlatService {
     2     private static final Logger logger = Logger.getLogger(UnifiedPosMgrPlatSeviceImpl.class);
     3     
     4     @Override
     5     public int addHotFocus(HotFocus focus) throws IOException {
     6         if (focus == null) return -1;
     7         List<HotSpot> spots = focus.getHots();
     8         if (spots == null || spots.isEmpty()) return -1;
     9         String uri = getAddHotWordInvokeUrl();
    10         String requestXml = requestAddHotWord(focus);
    11         String responseXml = invokeAddHotWords(uri, requestXml);
    12         return responseAddHotWords(responseXml);
    13     }
    14 }

    上述实现最终调用对应的接口系统,本人环境中的接口地址形如:http://127.0.0.1:8080/接口系统/方法

    6、对应系统的接口实现这里不赘述,最终执行数据库操作,将爬取内容写入数据库表

  • 相关阅读:
    Neo4j的查询语法笔记(二)
    NEO4J -模糊查询
    mybatis常用jdbcType数据类型
    Codrops 优秀教程:CSS 3D Transforms 实现书本效果
    案例分享:20佳应用大图片背景的优秀网站作品
    分享一套精美的现代 UI PSD 工具包【免费下载】
    经验分享:10个简单实用的 jQuery 代码片段
    Popline:帅气的浮动 HTML5 文本编辑器工具栏
    《分享》学习单页网站制作的20个优秀案例
    Minimit Anima – 硬件加速的 CSS3 动画插件
  • 原文地址:https://www.cnblogs.com/zhuziyu/p/8920349.html
Copyright © 2011-2022 走看看