zoukankan      html  css  js  c++  java
  • 2020年寒假假期总结0116

      Jsoup实战爬取(京东手机数据)

      相关依赖和配置说明已经在上一篇随笔中交代:https://www.cnblogs.com/heiyang/p/12199113.html

      新建关于手机商品的POJO(JavaBean的一种):Item.class

    /**
     * JPA entity describing one crawled phone product; rows live in the {@code jd_item} table.
     */
    @Entity
    @Table(name = "jd_item")
    public class Item {

        /** Primary key, produced with the {@code IDENTITY} generation strategy. */
        @Id
        @GeneratedValue(strategy = GenerationType.IDENTITY)
        private Long id;

        /** Standard product unit (the product family this item belongs to). */
        private Long spu;

        /** Stock keeping unit (the smallest sellable variant). */
        private Long sku;

        /** Product title. */
        private String title;

        /** Product price. */
        private Double price;

        /** Product picture (file name of the stored image). */
        private String pic;

        /** Address of the product detail page. */
        private String url;

        /** Creation time of the record. */
        private Date created;

        /** Last update time of the record. */
        private Date updated;

        // ---- id ----

        public Long getId() {
            return this.id;
        }

        public void setId(Long id) {
            this.id = id;
        }

        // ---- spu / sku ----

        public Long getSpu() {
            return this.spu;
        }

        public void setSpu(Long spu) {
            this.spu = spu;
        }

        public Long getSku() {
            return this.sku;
        }

        public void setSku(Long sku) {
            this.sku = sku;
        }

        // ---- descriptive attributes ----

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public Double getPrice() {
            return this.price;
        }

        public void setPrice(Double price) {
            this.price = price;
        }

        public String getPic() {
            return this.pic;
        }

        public void setPic(String pic) {
            this.pic = pic;
        }

        public String getUrl() {
            return this.url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        // ---- timestamps ----

        public Date getCreated() {
            return this.created;
        }

        public void setCreated(Date created) {
            this.created = created;
        }

        public Date getUpdated() {
            return this.updated;
        }

        public void setUpdated(Date updated) {
            this.updated = updated;
        }
    }

      保存数据到数据库的接口:ItemServiceImpl

    /**
     * Storage operations for crawled {@link Item} records.
     */
    public interface ItemServiceImpl {

        /**
         * Stores the given crawled item.
         *
         * @param item the item to store
         */
        void save(Item item);

        /**
         * Finds stored items that correspond to the given item; intended for checking
         * whether an item has already been saved.
         *
         * @param item the item to look for
         * @return the stored items found for {@code item}
         */
        List<Item> findAll(Item item);
    }

      实现接口的类:ItemService

    /**
     * {@link ItemServiceImpl} implementation that delegates to {@link ItemDao}.
     */
    @Service
    public class ItemService implements ItemServiceImpl {

        @Autowired
        private ItemDao itemDao;

        /**
         * Hands the item to the DAO for saving.
         *
         * @param item the item to save
         */
        @Override
        public void save(Item item) {
            itemDao.save(item);
        }

        /**
         * Uses the given item as the query example and returns whatever the DAO finds for it.
         *
         * @param item the item wrapped into the query example
         * @return the items returned by the DAO for that example
         */
        @Override
        public List<Item> findAll(Item item) {
            return itemDao.findAll(Example.of(item));
        }
    }

      操作数据库的接口,继承JpaRepository:ItemDao

    /**
     * Repository for {@link Item} rows keyed by {@code Long}.
     *
     * <p>Declares no methods of its own; everything used comes from {@code JpaRepository}.
     */
    public interface ItemDao extends JpaRepository<Item, Long> {
    }

      HttpUtils工具类:HttpUtils

    /**
     * HTTP helper that downloads page text and images through a shared pooled
     * connection manager.
     *
     * <p>Failures are reported through the return value (empty string or {@code null},
     * see each method); exceptions raised while executing a request are printed and swallowed.
     */
    @Component
    public class HttpUtils {
        /** Target directory used by {@link #doGetImage(String)}. */
        private static final String DO_GET_IMAGE_DIR = "E:/images/";
        /** Target directory used by {@link #getImage(String)}. */
        private static final String GET_IMAGE_DIR = "D:/images/";

        private final PoolingHttpClientConnectionManager cm;

        /**
         * Creates the shared connection pool: at most 200 connections in total and
         * at most 20 per route (host).
         */
        public HttpUtils() {
            this.cm = new PoolingHttpClientConnectionManager();
            cm.setMaxTotal(200);
            cm.setDefaultMaxPerRoute(20);
        }

        /**
         * Downloads the page at {@code url}, sending browser-like request headers.
         *
         * @param url address to request
         * @return the response body decoded as UTF-8 when the status is 200 (empty string
         *         if the response has no entity); an empty string for any other status or
         *         when the request fails
         */
        public String doGetHtml(String url) {
            HttpGet httpGet = new HttpGet(url);
            // Browser-like headers; the target site rejects requests without them.
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
            httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpGet.setHeader("Accept-Language", "en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3");
            httpGet.setHeader("Referer", "https://www.jd.com/");
            httpGet.setHeader("DNT","1");
            httpGet.setHeader("Connection","keep-alive");
            httpGet.setHeader("Upgrade-Insecure-Requests", "1");
            httpGet.setHeader("TE", "Trailers");
            return this.fetchText(httpGet, "");
        }

        /**
         * Downloads the image at {@code url} into {@code E:/images/} under a random UUID name.
         *
         * @param url image address; the text from its last {@code '.'} onwards becomes the
         *            file extension
         * @return the generated file name, or {@code null} if the status is not 200, the
         *         response has no body, or the download fails
         */
        public String doGetImage(String url) {
            return this.downloadImage(url, DO_GET_IMAGE_DIR);
        }

        /**
         * Downloads the page at {@code url} without any extra request headers.
         *
         * @param url address to request
         * @return the response body decoded as UTF-8 when the status is 200 (empty string
         *         if the response has no entity); {@code null} for any other status or
         *         when the request fails
         */
        public String getHtml(String url) {
            return this.fetchText(new HttpGet(url), null);
        }

        /**
         * Downloads the image at {@code url} into {@code D:/images/} under a random UUID name.
         *
         * @param url image address; the text from its last {@code '.'} onwards becomes the
         *            file extension
         * @return the generated file name, or {@code null} if the status is not 200, the
         *         response has no body, or the download fails
         */
        public String getImage(String url) {
            return this.downloadImage(url, GET_IMAGE_DIR);
        }

        /** Executes the prepared request and returns its UTF-8 body, or {@code fallback} on failure. */
        private String fetchText(HttpGet httpGet, String fallback) {
            CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
            httpGet.setConfig(this.getConfig());

            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() == 200) {
                    // EntityUtils.toString fails on a null entity, so treat it as an empty body.
                    if (response.getEntity() == null) {
                        return "";
                    }
                    return EntityUtils.toString(response.getEntity(), "UTF-8");
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                this.closeQuietly(response);
            }
            return fallback;
        }

        /** Streams the image at {@code url} into {@code targetDir}; returns the new file name or {@code null}. */
        private String downloadImage(String url, String targetDir) {
            CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(this.getConfig());

            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                // Check the entity before creating the file so a body-less response
                // does not leave an empty file behind.
                if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                    String extName = url.substring(url.lastIndexOf("."));
                    String imageName = UUID.randomUUID().toString() + extName;

                    // try-with-resources guarantees the file handle is flushed and released.
                    try (OutputStream outstream = new FileOutputStream(new File(targetDir + imageName))) {
                        response.getEntity().writeTo(outstream);
                    }
                    return imageName;
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                this.closeQuietly(response);
            }
            return null;
        }

        /**
         * Closes the response if present, printing (not propagating) any failure.
         * The HttpClient itself is deliberately left open because its connections
         * belong to the shared connection manager.
         */
        private void closeQuietly(CloseableHttpResponse response) {
            if (response == null) {
                return;
            }
            try {
                response.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /** Builds the per-request timeouts applied to every request. */
        private RequestConfig getConfig() {
            RequestConfig config = RequestConfig.custom().setConnectTimeout(1000) // establishing the connection
                    .setConnectionRequestTimeout(500) // obtaining a connection from the manager
                    .setSocketTimeout(10000) // waiting for data on the socket
                    .build();

            return config;
        }

    }

      工作任务类(爬取任务):ItemTask

    /**
     * Scheduled crawl job: walks JD phone search result pages, and for every SKU not yet
     * stored downloads its picture, price and title and saves it as an {@link Item}.
     */
    @Component
    public class ItemTask {

        @Autowired
        private HttpUtils httpUtils;
        @Autowired
        private ItemService itemService;

        private static final ObjectMapper MAPPER =  new ObjectMapper();


        /**
         * Crawls the search result pages 1, 3, 5, 7 and 9.
         * The next run starts 100 seconds after the previous run has finished.
         *
         * @throws Exception if parsing a result page fails
         */
        @Scheduled(fixedDelay = 100 * 1000)
        public void itemTask() throws Exception {
            // Base search address; the page number is appended per request.
            String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq" +
                    "=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&s=113&click=0&page=";

            for (int i = 1; i < 10; i = i + 2) {
                String html = httpUtils.doGetHtml(url + i);
                this.parse(html);
            }

            System.out.println("手机数据抓取完成!");
        }

        /**
         * Parses one search result page and stores every SKU that is not in the database yet.
         *
         * <p>Entries with a missing or non-numeric SKU are skipped instead of aborting the
         * run. A missing SPU, picture or price is stored as {@code null}.
         *
         * @param html search result page markup
         * @throws Exception if a lookup or save fails
         */
        private void parse(String html) throws Exception {
            Document doc = Jsoup.parse(html);

            // One <li> per SPU in the result list.
            Elements spuEles = doc.select("div#J_goodsList > ul > li");
            System.out.println(html);
            System.out.println("-----------------数量为:"+spuEles.size());
            for (Element spuEle : spuEles) {
                // May be null: some list entries carry an empty data-spu attribute.
                Long spu = parseId(spuEle.attr("data-spu"));

                Elements skuEles = spuEle.select("li.ps-item");

                for (Element skuEle : skuEles) {
                    Long sku = parseId(skuEle.select("[data-sku]").attr("data-sku"));
                    if (sku == null) {
                        // Without a SKU there is nothing to look up or store.
                        continue;
                    }

                    System.out.println("商品的Sku值为"+sku);
                    // Query by SKU only; skip items that are already stored.
                    Item item = new Item();
                    item.setSku(sku);
                    List<Item> list = this.itemService.findAll(item);
                    if (!list.isEmpty()) {
                        continue;
                    }

                    item.setSpu(spu);

                    String itemUrl = "https://item.jd.com/" + sku + ".html";
                    item.setUrl(itemUrl);

                    String picName = this.downloadPicture(skuEle);
                    item.setPic(picName);
                    System.out.println("商品的图片地址:"+picName);

                    Double price = this.fetchPrice(sku);
                    item.setPrice(price);
                    System.out.println("商品的价格:"+price);

                    String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
                    String title = Jsoup.parse(itemInfo).select("div.sku-name").text();
                    item.setTitle(title);
                    System.out.println("商品的标题:"+title);

                    item.setCreated(new Date());
                    item.setUpdated(item.getCreated());

                    this.itemService.save(item);
                }
            }
        }

        /** Parses a numeric id attribute; returns {@code null} when it is empty or not a number. */
        private static Long parseId(String raw) {
            if (raw == null || raw.isEmpty()) {
                return null;
            }
            try {
                return Long.valueOf(raw);
            } catch (NumberFormatException e) {
                return null;
            }
        }

        /** Downloads the SKU's picture; returns the stored file name or {@code null} if unavailable. */
        private String downloadPicture(Element skuEle) {
            Element img = skuEle.select("img[data-sku]").first();
            if (img == null) {
                return null;
            }
            String lazySrc = img.attr("data-lazy-img");
            if (lazySrc.isEmpty()) {
                // "https:" alone is not a valid request address.
                return null;
            }
            // The attribute is protocol-relative; swap the /n9/ path segment for /n1/.
            String picUrl = ("https:" + lazySrc).replace("/n9/","/n1/");
            return this.httpUtils.doGetImage(picUrl);
        }

        /** Fetches the SKU's price from the price endpoint; returns {@code null} if it cannot be read. */
        private Double fetchPrice(Long sku) {
            String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku);
            if (priceJson == null || priceJson.isEmpty()) {
                // doGetHtml reports a failed request as an empty string.
                return null;
            }
            try {
                // path(...) yields a "missing" node instead of null for absent elements.
                String raw = MAPPER.readTree(priceJson).path(0).path("p").asText();
                return raw.isEmpty() ? null : Double.valueOf(raw);
            } catch (Exception e) {
                // Malformed JSON or a non-numeric price must not abort the whole crawl.
                e.printStackTrace();
                return null;
            }
        }

    }

      最后一步添加引导类:Application

    /**
     * Application entry point.
     *
     * <p>Scheduling is switched on here because scheduled tasks must be enabled before they run.
     */
    @EnableScheduling
    @SpringBootApplication
    public class Application {

        /**
         * Starts the Spring application.
         *
         * @param args command-line arguments, passed through to Spring
         */
        public static void main(String[] args) {
            SpringApplication.run(Application.class, args);
        }
    }

      资源文件夹图:

      爬取结果:

       注意点:我参考的教学视频发布时间较早,当时京东还没有反爬措施;现在需要在请求中加上header(如User-Agent、Referer等请求头)才能正常抓取。

  • 相关阅读:
    史上最强验证
    Yii2 return redirect()
    一次线上问题引发的思考
    一次前端体验优化
    RSA For PHP
    判断是否字符串是否是JSON
    过滤Xss
    Yii2 中日志的记录
    Yii2 中禁用csrf校验
    开始。
  • 原文地址:https://www.cnblogs.com/heiyang/p/12208353.html
Copyright © 2011-2022 走看看