Introduction to Web Crawlers
What is a crawler?
A web crawler (bot) is a program that automatically roams the internet and scrapes data from it.
What crawlers are used for:
- Search engines
- Product price comparison
- Data analysis platforms (e.g., ones built on Zhihu data)
How do you write a crawler?
Three defining characteristics of web pages:
- Every web page is located by a unique URL (Uniform Resource Locator)
- Web pages present their content as HTML (HyperText Markup Language) text
- All web pages are transferred over the HTTP (HyperText Transfer Protocol) or HTTPS protocol
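A crawler leans on all three characteristics: the URL tells it where to ask, HTTP(S) is how the page is delivered, and the HTML text is what it parses. As a minimal illustration, here is a sketch using jsoup (which this post relies on later; the URL is just an example):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class FetchDemo {
    public static void main(String[] args) throws Exception {
        // The URL locates the page, and jsoup fetches it over HTTP(S)...
        Document doc = Jsoup.connect("https://www.cnblogs.com/").get();
        // ...and what comes back is HTML text, parsed here into a DOM
        System.out.println(doc.title());
    }
}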
The crawling workflow:
1. Analyze the target site and collect the starting URLs
2. For each URL, send a request and fetch the page's HTML source
3. Extract data from the page:
   a. Extract the target data, then filter it and persist it
   b. Extract new URLs from the page and repeat from step 2
4. The crawl ends once every target URL has been processed, the data has been collected, and no request tasks remain
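Taken together, steps 2 through 4 form a loop over a queue of URLs. Here is a minimal sketch of that loop with jsoup; the seed URL, the a[href] selector, and the extractData stub are placeholders for illustration:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class CrawlLoop {

    public static void main(String[] args) throws Exception {
        Deque<String> taskQueue = new ArrayDeque<>();
        Set<String> visited = new HashSet<>();        // never fetch the same URL twice
        taskQueue.add("https://www.cnblogs.com/");    // step 1: seed URL from analyzing the site

        while (!taskQueue.isEmpty()) {                // step 4: done when no tasks remain
            String url = taskQueue.poll();
            if (!visited.add(url)) {
                continue;
            }
            Document doc = Jsoup.connect(url).get();  // step 2: request the page, get its HTML
            extractData(doc);                         // step 3a: filter and persist target data
            for (Element a : doc.select("a[href]")) { // step 3b: new URLs go back to step 2
                String next = a.absUrl("href");
                if (next.startsWith("https://www.cnblogs.com/")) { // stay on the target site
                    taskQueue.add(next);
                }
            }
        }
    }

    private static void extractData(Document doc) {
        // placeholder: this is where screening and persistence would happen
        System.out.println(doc.title());
    }
}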
pom.xml dependencies
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.javaxl</groupId>
  <artifactId>T226_jsoup</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>T226_jsoup</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- JDBC driver -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.44</version>
    </dependency>

    <!-- HttpClient support -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>

    <!-- jsoup support -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.10.1</version>
    </dependency>

    <!-- logging support -->
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.16</version>
    </dependency>

    <!-- ehcache support -->
    <dependency>
      <groupId>net.sf.ehcache</groupId>
      <artifactId>ehcache</artifactId>
      <version>2.10.3</version>
    </dependency>

    <!-- commons-io support -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.5</version>
    </dependency>

    <!-- JSON support -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
    </dependency>
  </dependencies>
</project>
First, let's crawl a single image.
DownloadImg.java
package com.javaxl.crawler;

import java.io.File;
import java.io.IOException;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.log4j.Logger;

import com.javaxl.util.DateUtil;
import com.javaxl.util.PropertiesUtil;

public class DownloadImg {
    private static Logger logger = Logger.getLogger(DownloadImg.class);
    private static String URL = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1570570095818&di=f6f36a34fb29850cce69942cce9e8259&imgtype=0&src=http%3A%2F%2Fpic1.win4000.com%2Fpic%2F6%2F56%2F85ea1629713.jpg";

    public static void main(String[] args) {
        logger.info("Start crawling image: " + URL);
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(URL);
        // Fail fast instead of hanging: 5s to connect, 8s per socket read
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response == null) {
                logger.info("Connection timed out!");
            } else {
                HttpEntity entity = response.getEntity();
                String imgPath = PropertiesUtil.getValue("blogImages");
                String dateDir = DateUtil.getCurrentDatePath();
                String uuid = UUID.randomUUID().toString();
                // Derive the file extension from the Content-Type header, e.g. "image/jpeg" -> "jpeg"
                String suffix = entity.getContentType().getValue().split("/")[1];
                String localFile = imgPath + dateDir + "/" + uuid + "." + suffix;
                // Stream the response body straight into the local file
                FileUtils.copyInputStreamToFile(entity.getContent(), new File(localFile));
            }
        } catch (ClientProtocolException e) {
            logger.error(URL + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(URL + "-IOException", e);
        } catch (Exception e) {
            logger.error(URL + "-Exception", e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
                if (httpClient != null) {
                    httpClient.close();
                }
            } catch (IOException e) {
                logger.error(URL + "-IOException", e);
            }
        }
        logger.info("Finished crawling image: " + URL);
    }
}
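DownloadImg depends on two helper classes from the author's com.javaxl.util package that this post doesn't show: PropertiesUtil (reads the blogImages save directory from a config file) and DateUtil (builds a date-based subdirectory name). Their real implementations aren't included here; the following is only a plausible sketch, and the crawler.properties file name is an assumption:

// PropertiesUtil.java (sketch)
package com.javaxl.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class PropertiesUtil {
    // Assumption: keys such as blogImages and ehcacheXmlPath live in crawler.properties on the classpath
    public static String getValue(String key) {
        Properties props = new Properties();
        try (InputStream in = PropertiesUtil.class.getClassLoader()
                .getResourceAsStream("crawler.properties")) {
            if (in == null) {
                throw new IOException("crawler.properties not found on the classpath");
            }
            props.load(in);
        } catch (IOException e) {
            throw new RuntimeException("Failed to load crawler.properties", e);
        }
        return props.getProperty(key);
    }
}

// DateUtil.java (sketch)
package com.javaxl.util;

import java.text.SimpleDateFormat;
import java.util.Date;

public class DateUtil {
    // e.g. "2019/10/08" -- one subdirectory per day for downloaded images;
    // assumes the blogImages value ends with a path separator
    public static String getCurrentDatePath() {
        return new SimpleDateFormat("yyyy/MM/dd").format(new Date());
    }
}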
Crawling the cnblogs homepage
BlogCrawlerStarter.java
package com.javaxl.crawler;

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.javaxl.util.DateUtil;
import com.javaxl.util.DbUtil;
import com.javaxl.util.PropertiesUtil;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;

public class BlogCrawlerStarter {

    private static Logger logger = Logger.getLogger(BlogCrawlerStarter.class);
    // Alternative target: https://www.csdn.net/nav/newarticles
    private static String HOMEURL = "https://www.cnblogs.com/";
    private static CloseableHttpClient httpClient;
    private static Connection con;
    private static CacheManager cacheManager;
    private static Cache cache;

    /**
     * Fetch the homepage with HttpClient and hand its HTML off for parsing.
     */
    public static void parseHomePage() {
        logger.info("Start crawling homepage: " + HOMEURL);

        // ehcache remembers which blog URLs have already been stored,
        // so repeated runs don't insert duplicates
        cacheManager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
        cache = cacheManager.getCache("cnblog");

        httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(HOMEURL);
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response == null) {
                logger.info(HOMEURL + ": no response");
                return;
            }
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String homePageContent = EntityUtils.toString(entity, "utf-8");
                parseHomePageContent(homePageContent);
            }
        } catch (ClientProtocolException e) {
            logger.error(HOMEURL + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(HOMEURL + "-IOException", e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
                if (httpClient != null) {
                    httpClient.close();
                }
            } catch (IOException e) {
                logger.error(HOMEURL + "-IOException", e);
            }
        }

        // Persist the dedup cache to disk before shutting down
        if (cache.getStatus() == Status.STATUS_ALIVE) {
            cache.flush();
        }
        cacheManager.shutdown();
        logger.info("Finished crawling homepage: " + HOMEURL);
    }

    /**
     * Parse the homepage HTML with jsoup and extract the blog post links.
     *
     * @param homePageContent
     */
    private static void parseHomePageContent(String homePageContent) {
        Document doc = Jsoup.parse(homePageContent);
        // CSDN equivalent selector: #feedlist_id .list_con .title h2 a
        Elements aEles = doc.select("#post_list .post_item .post_item_body h3 a");
        for (Element aEle : aEles) {
            // A single post link from the homepage list
            String blogUrl = aEle.attr("href");
            if (null == blogUrl || "".equals(blogUrl)) {
                logger.info("This post has no link; skipping it.");
                continue;
            }
            if (cache.get(blogUrl) != null) {
                logger.info("This post is already in the database; skipping it.");
                continue;
            }
            parseBlogUrl(blogUrl);
        }
    }

    /**
     * Fetch a single blog post page by its URL to get its title and body.
     *
     * @param blogUrl
     */
    private static void parseBlogUrl(String blogUrl) {
        logger.info("Start crawling blog page: " + blogUrl);
        httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(blogUrl);
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response == null) {
                logger.info(blogUrl + ": no response");
                return;
            }
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String blogContent = EntityUtils.toString(entity, "utf-8");
                parseBlogContent(blogContent, blogUrl);
            }
        } catch (ClientProtocolException e) {
            logger.error(blogUrl + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(blogUrl + "-IOException", e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                logger.error(blogUrl + "-IOException", e);
            }
        }
        logger.info("Finished crawling blog page: " + blogUrl);
    }

    /**
     * Parse a blog post page: extract the title and full body, then insert them into the database.
     *
     * @param blogContent
     */
    private static void parseBlogContent(String blogContent, String link) {
        Document doc = Jsoup.parse(blogContent);
        // CSDN equivalent selector:
        // #mainBox main .blog-content-box .article-header-box .article-header .article-title-box h1
        Elements titleEles = doc.select("#topics .post h1 a");
        if (titleEles.size() == 0) {
            logger.info("Post title is empty; not inserting into the database!");
            return;
        }
        String title = titleEles.get(0).html();

        Elements blogContentEles = doc.select("#cnblogs_post_body");
        if (blogContentEles.size() == 0) {
            logger.info("Post body is empty; not inserting into the database!");
            return;
        }
        String blogContentBody = blogContentEles.get(0).html();

        // Optional: localize the images referenced by the post before saving
        // Elements imgEles = doc.select("img");
        // List<String> imgUrlList = new LinkedList<String>();
        // if (imgEles.size() > 0) {
        //     for (Element imgEle : imgEles) {
        //         imgUrlList.add(imgEle.attr("src"));
        //     }
        // }
        // if (imgUrlList.size() > 0) {
        //     Map<String, String> replaceUrlMap = downloadImgList(imgUrlList);
        //     blogContent = replaceContent(blogContent, replaceUrlMap);
        // }

        String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)";
        try {
            PreparedStatement pst = con.prepareStatement(sql);
            pst.setObject(1, title);
            pst.setObject(2, blogContentBody);
            pst.setObject(3, link);
            if (pst.executeUpdate() == 0) {
                logger.info("Failed to insert the crawled post into the database");
            } else {
                // Record the link in ehcache so it is never inserted twice
                cache.put(new net.sf.ehcache.Element(link, link));
                logger.info("Inserted the crawled post into the database");
            }
        } catch (SQLException e) {
            logger.error("Database error-SQLException:", e);
        }
    }

    /**
     * Rewrite the post body so that remote image URLs point to the local copies.
     *
     * @param blogContent
     * @param replaceUrlMap
     * @return
     */
    private static String replaceContent(String blogContent, Map<String, String> replaceUrlMap) {
        for (Map.Entry<String, String> entry : replaceUrlMap.entrySet()) {
            blogContent = blogContent.replace(entry.getKey(), entry.getValue());
        }
        return blogContent;
    }

    /**
     * Download remote images to local storage.
     *
     * @param imgUrlList
     * @return a map from remote image URL to local file path
     */
    private static Map<String, String> downloadImgList(List<String> imgUrlList) {
        Map<String, String> replaceMap = new HashMap<String, String>();
        for (String imgUrl : imgUrlList) {
            CloseableHttpClient httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(imgUrl);
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
            httpGet.setConfig(config);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response == null) {
                    logger.info(HOMEURL + ": no response");
                } else {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        HttpEntity entity = response.getEntity();
                        String blogImagesPath = PropertiesUtil.getValue("blogImages");
                        String dateDir = DateUtil.getCurrentDatePath();
                        String uuid = UUID.randomUUID().toString();
                        String suffix = entity.getContentType().getValue().split("/")[1];
                        String fileName = blogImagesPath + dateDir + "/" + uuid + "." + suffix;
                        FileUtils.copyInputStreamToFile(entity.getContent(), new File(fileName));
                        replaceMap.put(imgUrl, fileName);
                    }
                }
            } catch (ClientProtocolException e) {
                logger.error(imgUrl + "-ClientProtocolException", e);
            } catch (IOException e) {
                logger.error(imgUrl + "-IOException", e);
            } catch (Exception e) {
                logger.error(imgUrl + "-Exception", e);
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    logger.error(imgUrl + "-IOException", e);
                }
            }
        }
        return replaceMap;
    }

    public static void start() {
        // Re-crawl the homepage once a minute, with a fresh DB connection per pass
        while (true) {
            DbUtil dbUtil = new DbUtil();
            try {
                con = dbUtil.getCon();
                parseHomePage();
            } catch (Exception e) {
                logger.error("Failed to connect to the database!");
            } finally {
                try {
                    if (con != null) {
                        con.close();
                    }
                } catch (SQLException e) {
                    logger.error("Error closing the database connection-SQLException:", e);
                }
            }
            try {
                Thread.sleep(1000 * 60);
            } catch (InterruptedException e) {
                logger.error("Main thread sleep interrupted-InterruptedException:", e);
            }
        }
    }

    public static void main(String[] args) {
        start();
    }
}
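BlogCrawlerStarter additionally depends on com.javaxl.util.DbUtil for the JDBC connection, and on an ehcache configuration file (pointed to by the ehcacheXmlPath property) defining a cache named cnblog. Neither is shown in the post; here is a minimal DbUtil sketch, with the JDBC URL, user, and password as placeholders you would replace with your own:

// DbUtil.java (sketch)
package com.javaxl.util;

import java.sql.Connection;
import java.sql.DriverManager;

public class DbUtil {
    // Placeholders: point these at your own MySQL instance and schema
    private static final String URL = "jdbc:mysql://127.0.0.1:3306/crawler?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    public Connection getCon() throws Exception {
        Class.forName("com.mysql.jdbc.Driver"); // driver class for mysql-connector-java 5.x
        return DriverManager.getConnection(URL, USER, PASSWORD);
    }
}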
In the database:
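The post doesn't include the table definition, but the INSERT statement above binds 11 columns, so a table along these lines would match it. All column names except id, title, content, and url are assumptions made for illustration:

CREATE TABLE `t_jsoup_article` (
  `id`       bigint NOT NULL AUTO_INCREMENT, -- column 1: passed as null, so auto-increment
  `title`    varchar(255),                   -- column 2: the post title
  `content`  longtext,                       -- column 3: the post body HTML
  `summary`  varchar(512),                   -- column 4: assumption (passed as null)
  `crdate`   datetime,                       -- column 5: filled by now()
  `hits`     int DEFAULT 0,                  -- column 6: assumption (passed as 0)
  `comments` int DEFAULT 0,                  -- column 7: assumption (passed as 0)
  `img`      varchar(255),                   -- column 8: assumption (passed as null)
  `url`      varchar(512),                   -- column 9: the post link
  `state`    int DEFAULT 0,                  -- column 10: assumption (passed as 0)
  `remark`   varchar(255),                   -- column 11: assumption (passed as null)
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;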