zoukankan      html  css  js  c++  java
  • jsoup获取文章内容

    jsoup爬取文章内容

    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        //response.getWriter().append("Served at: ").append(request.getContextPath());
        String agent1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36";
        
        int pageNum=1;
        int pageSize=899;
        //for(pageNum=1;pageNum<101;pageNum++)
        for(pageNum=1;pageNum<2;pageNum++)
        {
            try {
                int page1= 277;
                Map<Integer,String> map1 = ManageMySQL.getNewsLinkInTable(page1,pageSize,"data_szyjglj");
                for(Integer key : map1.keySet())
                {
                    System.out.println(key+"  "+map1.get(key));
                    String news_link = map1.get(key);
                    String context1="";
                    String source1="";
                    //String context1 = getContentByURL(news_link).replace(" ", "");
                    
                    Document documentRoot = Jsoup.connect(news_link).userAgent(agent1).get();
                    Elements elements1 = documentRoot.select("div.source span");
                    if(elements1.size()==2)
                    {
                        Element span_ele = elements1.get(0);
                        source1 = span_ele.text();
                    }
                    
                    Elements elements2 = documentRoot.select("div.view_box");
                    if(elements2.size()==1)
                    {
                        Element div_ele = elements2.get(0);
                        context1 = div_ele.text();
                    }
                    
                    ManageMySQL.updateContextAndPublishDate(key, context1.replace("'", "").replace(""", ""),source1,"data_szyjglj");
                }
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            
        }
    }
  • 相关阅读:
    asp.net 对母版页的控件事件
    treeview操作集合
    使用GAppProxy时安全证书无效的解决办法
    向Excel模板中添加数据
    C# 重写 winform 关闭按钮
    完整ASP.Net Excel导入程序(支持2007)
    随笔二则
    标记枚举(flags)的使用
    System.Reflection.Missing.Value与Type.Missing
    Windows下Android源码下载方法
  • 原文地址:https://www.cnblogs.com/herd/p/11722013.html
Copyright © 2011-2022 走看看