zoukankan      html  css  js  c++  java
  • jsoup爬虫--博客园首页爬取和图片爬取

    jsoup爬虫

     

    1、导入pom依赖

    <!-- Maven build descriptor for the T226_jsoup crawler example (packaged as a jar). -->
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.javaxl</groupId>
        <artifactId>T226_jsoup</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>
    
        <name>T226_jsoup</name>
        <url>http://maven.apache.org</url>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        </properties>
    
        <dependencies>
            <!-- MySQL JDBC driver -->
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <version>5.1.44</version>
            </dependency>
    
            <!-- Apache HttpClient: issues the HTTP requests of the crawler -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.2</version>
            </dependency>
    
            <!-- jsoup: HTML parsing and CSS-selector queries -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.10.1</version>
            </dependency>
    
    
            <!-- log4j 1.x: logging -->
            <dependency>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
                <version>1.2.16</version>
            </dependency>
    
            <!-- Ehcache: cache of the links that have already been crawled -->
            <dependency>
                <groupId>net.sf.ehcache</groupId>
                <artifactId>ehcache</artifactId>
                <version>2.10.3</version>
            </dependency>
    
            <!-- Commons IO: FileUtils, used to write downloaded images to disk -->
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.5</version>
            </dependency>
    
            <!-- fastjson: JSON library.
                 NOTE(review): not imported by the crawler classes shown in this article;
                 confirm it is still needed before keeping it. -->
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.47</version>
            </dependency>
        </dependencies>
    </project>

    2、网站爬取--BlogCrawlerStarter

      1 package com.javaxl.crawler;
      2 
      3 import java.io.File;
      4 import java.io.IOException;
      5 import java.sql.Connection;
      6 import java.sql.PreparedStatement;
      7 import java.sql.SQLException;
      8 import java.util.HashMap;
      9 import java.util.List;
     10 import java.util.Map;
     11 import java.util.UUID;
     12 
     13 import org.apache.commons.io.FileUtils;
     14 import org.apache.http.HttpEntity;
     15 import org.apache.http.client.ClientProtocolException;
     16 import org.apache.http.client.config.RequestConfig;
     17 import org.apache.http.client.methods.CloseableHttpResponse;
     18 import org.apache.http.client.methods.HttpGet;
     19 import org.apache.http.impl.client.CloseableHttpClient;
     20 import org.apache.http.impl.client.HttpClients;
     21 import org.apache.http.util.EntityUtils;
     22 import org.apache.log4j.Logger;
     23 import org.jsoup.Jsoup;
     24 import org.jsoup.nodes.Document;
     25 import org.jsoup.nodes.Element;
     26 import org.jsoup.select.Elements;
     27 
     28 import com.javaxl.util.DateUtil;
     29 import com.javaxl.util.DbUtil;
     30 import com.javaxl.util.PropertiesUtil;
     31 
     32 import net.sf.ehcache.Cache;
     33 import net.sf.ehcache.CacheManager;
     34 import net.sf.ehcache.Status;
     35 
     36 /**
     37  * @author Administrator
     38  *
     39  */
     40 public class BlogCrawlerStarter {
     41 
     42     private static Logger logger = Logger.getLogger(BlogCrawlerStarter.class);
     43 //    https://www.csdn.net/nav/newarticles
     44     private static String HOMEURL = "https://www.cnblogs.com/";
     45     private static CloseableHttpClient httpClient;
     46     private static Connection con;
     47     private static CacheManager cacheManager;
     48     private static Cache cache;
     49 
     50     /**
     51      * httpclient解析首页,获取首页内容
     52      */
     53     public static void parseHomePage() {
     54         logger.info("开始爬取首页:" + HOMEURL);
     55         
     56         cacheManager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
     57         cache = cacheManager.getCache("cnblog");
     58         
     59         httpClient = HttpClients.createDefault();
     60         HttpGet httpGet = new HttpGet(HOMEURL);
     61         RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
     62         httpGet.setConfig(config);
     63         CloseableHttpResponse response = null;
     64         try {
     65             response = httpClient.execute(httpGet);
     66             if (response == null) {
     67                 logger.info(HOMEURL + ":爬取无响应");
     68                 return;
     69             }
     70 
     71             if (response.getStatusLine().getStatusCode() == 200) {
     72                 HttpEntity entity = response.getEntity();
     73                 String homePageContent = EntityUtils.toString(entity, "utf-8");
     74                 // System.out.println(homePageContent);
     75                 parseHomePageContent(homePageContent);
     76             }
     77 
     78         } catch (ClientProtocolException e) {
     79             logger.error(HOMEURL + "-ClientProtocolException", e);
     80         } catch (IOException e) {
     81             logger.error(HOMEURL + "-IOException", e);
     82         } finally {
     83             try {
     84                 if (response != null) {
     85                     response.close();
     86                 }
     87 
     88                 if (httpClient != null) {
     89                     httpClient.close();
     90                 }
     91             } catch (IOException e) {
     92                 logger.error(HOMEURL + "-IOException", e);
     93             }
     94         }
     95 
     96         if(cache.getStatus() ==  Status.STATUS_ALIVE) {
     97             cache.flush();
     98         }
     99         cacheManager.shutdown();
    100         logger.info("结束爬取首页:" + HOMEURL);
    101 
    102     }
    103 
    104     /**
    105      * 通过网络爬虫框架jsoup,解析网页类容,获取想要数据(博客的连接)
    106      * 
    107      * @param homePageContent
    108      */
    109     private static void parseHomePageContent(String homePageContent) {
    110         Document doc = Jsoup.parse(homePageContent);
    111         //#feedlist_id .list_con .title h2 a
    112         Elements aEles = doc.select("#post_list .post_item .post_item_body h3 a");
    113         for (Element aEle : aEles) {
    114 //            这个是首页中的博客列表中的单个链接URL
    115             String blogUrl = aEle.attr("href");
    116             if (null == blogUrl || "".equals(blogUrl)) {
    117                 logger.info("该博客未内容,不再爬取插入数据库!");
    118                 continue;
    119             }
    120             if(cache.get(blogUrl) != null) {
    121                 logger.info("该数据已经被爬取到数据库中,数据库不再收录!");
    122                 continue;
    123             }
    124 //            System.out.println("************************"+blogUrl+"****************************");
    125             
    126             parseBlogUrl(blogUrl);
    127         }
    128     }
    129 
    130     /**
    131      * 通过博客地址获取博客的标题,以及博客的类容
    132      * 
    133      * @param blogUrl
    134      */
    135     private static void parseBlogUrl(String blogUrl) {
    136 
    137         logger.info("开始爬取博客网页:" + blogUrl);
    138         httpClient = HttpClients.createDefault();
    139         HttpGet httpGet = new HttpGet(blogUrl);
    140         RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
    141         httpGet.setConfig(config);
    142         CloseableHttpResponse response = null;
    143         try {
    144             response = httpClient.execute(httpGet);
    145             if (response == null) {
    146                 logger.info(blogUrl + ":爬取无响应");
    147                 return;
    148             }
    149 
    150             if (response.getStatusLine().getStatusCode() == 200) {
    151                 HttpEntity entity = response.getEntity();
    152                 String blogContent = EntityUtils.toString(entity, "utf-8");
    153                 parseBlogContent(blogContent, blogUrl);
    154             }
    155 
    156         } catch (ClientProtocolException e) {
    157             logger.error(blogUrl + "-ClientProtocolException", e);
    158         } catch (IOException e) {
    159             logger.error(blogUrl + "-IOException", e);
    160         } finally {
    161             try {
    162                 if (response != null) {
    163                     response.close();
    164                 }
    165             } catch (IOException e) {
    166                 logger.error(blogUrl + "-IOException", e);
    167             }
    168         }
    169 
    170         logger.info("结束爬取博客网页:" + HOMEURL);
    171 
    172     }
    173 
    174     /**
    175      * 解析博客类容,获取博客中标题以及所有内容
    176      * 
    177      * @param blogContent
    178      */
    179     private static void parseBlogContent(String blogContent, String link) {
    180         Document doc = Jsoup.parse(blogContent);
    181         if(!link.contains("ansion2014")) {
    182             System.out.println(blogContent);
    183         }
    184         Elements titleEles = doc
    185                 //#mainBox main .blog-content-box .article-header-box .article-header .article-title-box h1
    186                 .select("#topics .post h1 a");
    187         System.out.println("123");
    188         System.out.println(titleEles.toString());
    189         System.out.println("123");
    190         if (titleEles.size() == 0) {
    191             logger.info("博客标题为空,不插入数据库!");
    192             return;
    193         }
    194         String title = titleEles.get(0).html();
    195 
    196         Elements blogContentEles = doc.select("#cnblogs_post_body ");
    197         if (blogContentEles.size() == 0) {
    198             logger.info("博客内容为空,不插入数据库!");
    199             return;
    200         }
    201         String blogContentBody = blogContentEles.get(0).html();
    202         
    203 //        Elements imgEles = doc.select("img");
    204 //        List<String> imgUrlList = new LinkedList<String>();
    205 //        if(imgEles.size() > 0) {
    206 //            for (Element imgEle : imgEles) {
    207 //                imgUrlList.add(imgEle.attr("src"));
    208 //            }
    209 //        }
    210 //        
    211 //        if(imgUrlList.size() > 0) {
    212 //            Map<String, String> replaceUrlMap = downloadImgList(imgUrlList);
    213 //            blogContent = replaceContent(blogContent,replaceUrlMap);
    214 //        }
    215 
    216         String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)";
    217         try {
    218             PreparedStatement pst = con.prepareStatement(sql);
    219             pst.setObject(1, title);
    220             pst.setObject(2, blogContentBody);
    221             pst.setObject(3, link);
    222             if(pst.executeUpdate() == 0) {
    223                 logger.info("爬取博客信息插入数据库失败");
    224             }else {
    225                 cache.put(new net.sf.ehcache.Element(link, link));
    226                 logger.info("爬取博客信息插入数据库成功");
    227             }
    228         } catch (SQLException e) {
    229             logger.error("数据异常-SQLException:",e);
    230         }
    231     }
    232 
    233     /**
    234      * 将别人博客内容进行加工,将原有图片地址换成本地的图片地址
    235      * @param blogContent
    236      * @param replaceUrlMap
    237      * @return
    238      */
    239     private static String replaceContent(String blogContent, Map<String, String> replaceUrlMap) {
    240         for(Map.Entry<String, String> entry: replaceUrlMap.entrySet()) {
    241             blogContent = blogContent.replace(entry.getKey(), entry.getValue());
    242         }
    243         return blogContent;
    244     }
    245 
    246     /**
    247      * 别人服务器图片本地化
    248      * @param imgUrlList
    249      * @return
    250      */
    251     private static Map<String, String> downloadImgList(List<String> imgUrlList) {
    252         Map<String, String> replaceMap = new HashMap<String, String>();
    253         for (String imgUrl : imgUrlList) {
    254             CloseableHttpClient httpClient = HttpClients.createDefault();
    255             HttpGet httpGet = new HttpGet(imgUrl);
    256             RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
    257             httpGet.setConfig(config);
    258             CloseableHttpResponse response = null;
    259             try {
    260                 response = httpClient.execute(httpGet);
    261                 if (response == null) {
    262                     logger.info(HOMEURL + ":爬取无响应");
    263                 }else {
    264                     if (response.getStatusLine().getStatusCode() == 200) {
    265                         HttpEntity entity = response.getEntity();
    266                         String blogImagesPath = PropertiesUtil.getValue("blogImages");
    267                         String dateDir = DateUtil.getCurrentDatePath();
    268                         String uuid = UUID.randomUUID().toString();
    269                         String subfix = entity.getContentType().getValue().split("/")[1];
    270                         String fileName = blogImagesPath + dateDir + "/" + uuid + "." + subfix;
    271                         
    272                         FileUtils.copyInputStreamToFile(entity.getContent(), new File(fileName));
    273                         replaceMap.put(imgUrl, fileName);
    274                     }
    275                 }
    276             } catch (ClientProtocolException e) {
    277                 logger.error(imgUrl + "-ClientProtocolException", e);
    278             } catch (IOException e) {
    279                 logger.error(imgUrl + "-IOException", e);
    280             } catch (Exception e) {
    281                 logger.error(imgUrl + "-Exception", e);
    282             } finally {
    283                 try {
    284                     if (response != null) {
    285                         response.close();
    286                     }
    287                 } catch (IOException e) {
    288                     logger.error(imgUrl + "-IOException", e);
    289                 }
    290             }
    291         
    292         }
    293         return replaceMap;
    294     }
    295 
    296     public static void start() {
    297         while(true) {
    298             DbUtil dbUtil = new DbUtil();
    299             try {
    300                 con = dbUtil.getCon();
    301                 parseHomePage();
    302             } catch (Exception e) {
    303                 logger.error("数据库连接势失败!");
    304             } finally {
    305                 try {
    306                     if (con != null) {
    307                         con.close();
    308                     }
    309                 } catch (SQLException e) {
    310                     logger.error("数据关闭异常-SQLException:",e);
    311                 }
    312             }
    313             try {
    314                 Thread.sleep(1000*60);
    315             } catch (InterruptedException e) {
    316                 logger.error("主线程休眠异常-InterruptedException:",e);
    317             }
    318         }
    319     }
    320 
    321     public static void main(String[] args) {
    322         start();
    323     }
    324 }

    博客园首页信息图片

    爬取到的数据

    3、简单图片爬取--DownloadImg

    package com.javaxl.crawler;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.UUID;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.log4j.Logger;
    
    import com.javaxl.util.DateUtil;
    import com.javaxl.util.PropertiesUtil;
    
    /**
     * Minimal example that downloads the single image at {@code URL}.
     *
     * <p>The file is written below the directory configured as "blogImages", followed by the value
     * of {@code DateUtil.getCurrentDatePath()}, and is named with a random UUID plus an extension
     * taken from the response Content-Type.
     */
    public class DownloadImg {
        private static final Logger logger = Logger.getLogger(DownloadImg.class);

        /** Image to download. */
        private static final String URL = "http://photocdn.sohu.com/20120625/Img346436473.jpg";

        /**
         * Downloads the image and logs the outcome; every failure is logged rather than thrown.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            logger.info("开始爬取图片:" + URL);
            HttpGet httpGet = new HttpGet(URL);
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
            httpGet.setConfig(config);
            // Resources are closed in reverse order: the response first, then the client.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                    CloseableHttpResponse response = httpClient.execute(httpGet)) {
                int status = response.getStatusLine().getStatusCode();
                HttpEntity entity = response.getEntity();
                String subfix = (status == 200 && entity != null) ? imageExtension(entity) : null;
                if (status != 200 || entity == null) {
                    // Without this check an error page would be saved as if it were the image.
                    logger.info(URL + ":爬取失败,响应状态码=" + status);
                } else if (subfix == null) {
                    logger.info(URL + ":无法从Content-Type确定图片类型");
                } else {
                    String imgPath = PropertiesUtil.getValue("blogImages");
                    String dateDir = DateUtil.getCurrentDatePath();
                    String uuid = UUID.randomUUID().toString();
                    String localFile = imgPath + dateDir + "/" + uuid + "." + subfix;
                    FileUtils.copyInputStreamToFile(entity.getContent(), new File(localFile));
                }
            } catch (ClientProtocolException e) {
                logger.error(URL + "-ClientProtocolException", e);
            } catch (IOException e) {
                logger.error(URL + "-IOException", e);
            } catch (Exception e) {
                logger.error(URL + "-Exception", e);
            }

            logger.info("结束图片爬取:" + URL);
        }

        /**
         * Derives a file extension from the entity's Content-Type, e.g. "image/jpeg; charset=x"
         * gives "jpeg".
         *
         * @param entity response entity
         * @return the media subtype, or {@code null} when the header is missing or the subtype
         *         contains characters other than letters, digits, '.', '+' and '-'
         */
        private static String imageExtension(HttpEntity entity) {
            if (entity.getContentType() == null || entity.getContentType().getValue() == null) {
                return null;
            }
            String mime = entity.getContentType().getValue();
            int paramStart = mime.indexOf(';');
            if (paramStart >= 0) {
                mime = mime.substring(0, paramStart);
            }
            int slash = mime.indexOf('/');
            if (slash < 0) {
                return null;
            }
            String subtype = mime.substring(slash + 1).trim();
            // The value is supplied by the remote server and becomes part of a file name, so
            // anything that could act as a path separator is rejected.
            return subtype.matches("[A-Za-z0-9][A-Za-z0-9.+-]*") ? subtype : null;
        }
    }

    爬取图片样式

    爬取结果

     

    谢谢观看!!!

  • 相关阅读:
    安装虚拟机及学习linux系统 20155222卢梓杰
    技能获取与编程学习 卢梓杰20155222
    人生第一篇博客
    20155228 2016-2017-2 《Java程序设计》第1周学习总结
    20155228 基于VirtualBox安装Ubuntu和学习linux命令的学习经历和心得
    20155228 获取技能的成功经验和关于C语言学习的调查
    20155228 你期望的师生关系是什么?
    预备作业03:安装虚拟机
    足球运动训练心得及经验分析-c语言学习调查
    我期望的师生关系
  • 原文地址:https://www.cnblogs.com/ly-0919/p/11639134.html
Copyright © 2011-2022 走看看