java简单web爬虫(网页图片)
效果:执行 main() 方法后,图片会下载到 C 盘的 res 文件夹中。如果该文件夹不存在,请先创建。
代码里的常量根据自己的需求修改,代码附到下面。
package com.sinitek.sirm.common.utils;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Simple web crawler that downloads the images referenced by one web page.
 *
 * <p>{@link #main(String[])} fetches {@link #PAGE_URL}, extracts every {@code <img>} tag that
 * carries a {@code src} attribute, turns each {@code src} into a download URL and stores the
 * images in {@link #DOWNLOAD_DIR}. Adjust the constants below to your own needs.
 *
 * <p>Tag extraction is regex based, so a {@code >} inside an attribute value ends the tag early.
 */
public class Main {

    /** Address of the page that is scanned for images. */
    private static final String PAGE_URL = "http://www.xxx";

    /** Prefix put in front of every {@code src} that is not already an absolute http(s) URL. */
    private static final String IMAGE_BASE_URL = "http://xxx/";

    /** Directory the images are written to; created on demand. */
    private static final String DOWNLOAD_DIR = "C:/res/";

    /** Connect and read timeout for every HTTP request, in milliseconds. */
    private static final int TIMEOUT_MS = 15000;

    /** One {@code <img ...>} tag that has a {@code src} attribute; never spans past its own {@code >}. */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*?\\ssrc\\s*=[^>]*>", Pattern.CASE_INSENSITIVE);

    /** The {@code src} value: group 1 = double-quoted, group 2 = single-quoted, group 3 = unquoted. */
    private static final Pattern IMG_SRC = Pattern.compile(
            "\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))", Pattern.CASE_INSENSITIVE);

    /** The {@code charset} parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Runs the crawl: fetch the page, collect the image URLs, download them.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            Main crawler = new Main();
            String html = crawler.getHtml(PAGE_URL);
            List<String> imgTags = crawler.getImageUrl(html);
            List<String> imgSrc = crawler.getImageSrc(imgTags);
            crawler.download(imgSrc);
        } catch (Exception e) {
            // Last-resort boundary: report the cause instead of hiding it.
            System.out.println("发生错误:" + e);
        }
    }

    /**
     * Fetches a page and returns its text with the lines joined by single spaces.
     *
     * @param url address of the page
     * @return the page content
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    private String getHtml(String url) throws IOException {
        URLConnection connection = openConnection(url);
        StringBuilder html = new StringBuilder();
        try (InputStream in = connection.getInputStream();
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(in, charsetOf(connection.getContentType())))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append(' ');
            }
        }
        return html.toString();
    }

    /**
     * Collects every {@code <img>} tag of the page that has a {@code src} attribute.
     *
     * @param html page content
     * @return the matched tags in document order; empty if there are none
     */
    private List<String> getImageUrl(String html) {
        List<String> tags = new ArrayList<>();
        Matcher matcher = IMG_TAG.matcher(html);
        while (matcher.find()) {
            tags.add(matcher.group());
        }
        return tags;
    }

    /**
     * Extracts the download URL of each {@code <img>} tag.
     *
     * <p>Tags without a usable {@code src} (missing, empty, or an inline {@code data:} URI) are
     * skipped. Absolute http(s) sources are kept as they are; everything else is prefixed with
     * {@link #IMAGE_BASE_URL}.
     *
     * @param listimageurl the {@code <img>} tags
     * @return one download URL per usable tag, in the same order
     */
    private List<String> getImageSrc(List<String> listimageurl) {
        List<String> sources = new ArrayList<>();
        for (String tag : listimageurl) {
            Matcher m = IMG_SRC.matcher(tag);
            if (!m.find()) {
                continue;
            }
            String src = m.group(1) != null ? m.group(1)
                    : m.group(2) != null ? m.group(2)
                    : m.group(3);
            src = src.trim();
            if (src.isEmpty() || startsWithIgnoreCase(src, "data:")) {
                continue;
            }
            sources.add(toAbsoluteUrl(src));
        }
        return sources;
    }

    /**
     * Downloads every image into {@link #DOWNLOAD_DIR} and prints the time taken per image and
     * in total. A failing image is reported and skipped; the remaining ones are still downloaded.
     *
     * @param listImgSrc the image URLs
     */
    private void download(List<String> listImgSrc) {
        File targetDir = new File(DOWNLOAD_DIR);
        if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
            System.out.println("下载失败:无法创建目录 " + targetDir);
            return;
        }
        long batchStart = System.currentTimeMillis();
        for (String url : listImgSrc) {
            long start = System.currentTimeMillis();
            System.out.println("开始下载:" + url);
            try {
                String imageName = saveImage(url, targetDir);
                System.out.println(imageName + "下载完成");
                System.out.println("耗时:" + (System.currentTimeMillis() - start) / 1000.0 + "s");
            } catch (IOException e) {
                System.out.println("下载失败:" + url + " (" + e + ")");
            }
        }
        System.out.println("总耗时:" + (System.currentTimeMillis() - batchStart) / 1000.0 + "s");
    }

    /** Copies one image to {@code targetDir} and returns the file name it was stored under. */
    private static String saveImage(String url, File targetDir) throws IOException {
        String imageName = fileNameOf(url);
        if (imageName.isEmpty()) {
            throw new IOException("URL 中没有文件名");
        }
        URLConnection connection = openConnection(url);
        try (InputStream in = connection.getInputStream();
             OutputStream out = new FileOutputStream(new File(targetDir, imageName))) {
            byte[] buf = new byte[8192];
            int length;
            while ((length = in.read(buf)) != -1) {
                out.write(buf, 0, length);
            }
        }
        return imageName;
    }

    /** Opens a connection with connect and read timeouts so a stalled server cannot hang the run. */
    private static URLConnection openConnection(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        connection.setConnectTimeout(TIMEOUT_MS);
        connection.setReadTimeout(TIMEOUT_MS);
        return connection;
    }

    /** Returns the charset named by a Content-Type value, or UTF-8 if it is absent or unknown. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall through to the default.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Derives a local file name from a URL: the last path segment, without query string or
     * fragment, with characters that are not allowed in Windows file names replaced by '_'.
     * Returns an empty string when the path ends with '/'.
     */
    private static String fileNameOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        String name = path.substring(path.lastIndexOf('/') + 1);
        return name.replaceAll("[\\\\:*\"<>|]", "_");
    }

    /** Keeps absolute http(s) URLs untouched and prefixes everything else with the base URL. */
    private static String toAbsoluteUrl(String src) {
        if (startsWithIgnoreCase(src, "http://") || startsWithIgnoreCase(src, "https://")) {
            return src;
        }
        return IMAGE_BASE_URL + src;
    }

    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }
}