zoukankan      html  css  js  c++  java
  • 电商站新蛋价格字段爬取(价格字段图片显示)

    分析

    新蛋详情页的价格字段是用图片显示的。虽然其它电商都已经认识到这样做没什么卵用还浪费资源,但新蛋貌似不这样认为,所以尝试爬取一下。

    价格字段大概是这个样子:

    image

    这个图片很纯净,没有任何干扰,识别率可以做到百分之百。

    代码实现

    还是上之前写的一个小小工具库:https://github.com/CC11001100/commons-simple-character-ocr

    首先需要抓取一些图片来生成标注数据,这里选择了智能手机下的前十页,将前十页商品的价格字段图片爬下来生成标注数据,代码如下:

    package org.cc11001100.t1;
    
    import cc11001100.ocr.OcrUtil;
    import org.apache.http.client.fluent.Request;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import javax.imageio.ImageIO;
    import java.awt.image.BufferedImage;
    import java.io.ByteArrayInputStream;
    import java.io.File;
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.util.UUID;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    
    /**
     * Crawler for Newegg China (http://www.newegg.cn/).
     *
     * <p>The product detail pages render the price as an image. This version of the
     * class only collects those price images so they can be used as OCR training data.
     *
     * @author CC11001100
     */
    public class NeweggCrawler {
    
    	/** Charset name used to decode every downloaded HTML page. */
    	private static final String PAGE_CHARSET = "gb2312";
    
    	/** How many times {@link #download(String)} tries a URL before giving up. */
    	private static final int MAX_DOWNLOAD_ATTEMPTS = 3;
    
    	/** CSS query locating the price image; "godds" is the site's own spelling of the class name. */
    	private static final String PRICE_IMAGE_QUERY = ".godds_info_data img[src~=PriceImage]";
    
    	private static OcrUtil ocrUtil;
    
    	static {
    		ocrUtil = new OcrUtil();
    	}
    
    	/**
    	 * Collects price images from the first ten listing pages of the smartphone category.
    	 *
    	 * <p>Every product link found on a listing page is handed to a pool of ten worker
    	 * threads; each worker downloads the detail page, fetches its price image and stores
    	 * it as a PNG with a random UUID file name. The method blocks until all workers finish.
    	 *
    	 * @param saveBasePath prefix prepended to the generated file name; pass a directory
    	 *                     path ending with a separator to store the images in that directory
    	 */
    	public static void grabTrainImage(String saveBasePath) {
    		ExecutorService executorService = Executors.newFixedThreadPool(10);
    		String url = "http://www.newegg.cn/SubCategory/1043-%d.htm";
    		try {
    			for (int i = 1; i <= 10; i++) {
    				Document doc = getDocument(String.format(url, i));
    				doc.select(".catepro li p.title a").forEach(detailPageLinkElt -> {
    					String detailPageUrl = detailPageLinkElt.attr("href");
    					executorService.execute(() -> savePriceImage(detailPageUrl, saveBasePath));
    				});
    			}
    		} finally {
    			// Always stop accepting work, otherwise the non-daemon pool threads
    			// would keep the JVM alive if a listing page fails unexpectedly.
    			executorService.shutdown();
    		}
    		try {
    			executorService.awaitTermination(10, TimeUnit.DAYS);
    		} catch (InterruptedException e) {
    			// Cancel outstanding downloads and restore the interrupt flag for the caller.
    			executorService.shutdownNow();
    			Thread.currentThread().interrupt();
    		}
    	}
    
    	/**
    	 * Downloads the price image of one product and writes it below {@code saveBasePath}.
    	 * Products without a usable price image are reported on stderr and skipped.
    	 */
    	private static void savePriceImage(String detailPageUrl, String saveBasePath) {
    		BufferedImage img = fetchPriceImage(getDocument(detailPageUrl));
    		if (img == null) {
    			System.err.println("No usable price image for " + detailPageUrl);
    			return;
    		}
    		File target = new File(saveBasePath + UUID.randomUUID().toString() + ".png");
    		File parentDir = target.getParentFile();
    		if (parentDir != null) {
    			// Result intentionally ignored: if the directory cannot be created,
    			// the write below fails with an IOException that is reported.
    			parentDir.mkdirs();
    		}
    		try {
    			ImageIO.write(img, "png", target);
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    	}
    
    	/**
    	 * Locates and downloads the price image referenced by a detail page.
    	 *
    	 * @return the decoded image, or {@code null} when the page has no price image,
    	 *         the download failed, or the bytes could not be decoded as an image
    	 */
    	private static BufferedImage fetchPriceImage(Document detailPage) {
    		String imgLink = detailPage.select(PRICE_IMAGE_QUERY).attr("src");
    		if (imgLink.isEmpty()) {
    			return null;
    		}
    		byte[] imgBytes = download(imgLink);
    		if (imgBytes.length == 0) {
    			return null;
    		}
    		try {
    			// ImageIO.read itself returns null when no reader understands the bytes.
    			return ImageIO.read(new ByteArrayInputStream(imgBytes));
    		} catch (IOException e) {
    			e.printStackTrace();
    			return null;
    		}
    	}
    
    	/**
    	 * Downloads {@code url} and parses the response as HTML.
    	 * A failed download yields an empty document rather than an exception.
    	 */
    	private static Document getDocument(String url) {
    		byte[] responseBytes = download(url);
    		try {
    			return Jsoup.parse(new String(responseBytes, PAGE_CHARSET));
    		} catch (UnsupportedEncodingException e) {
    			// Without the charset no page can be decoded, so fail loudly
    			// instead of handing a null string to the parser.
    			throw new IllegalStateException("Charset not supported: " + PAGE_CHARSET, e);
    		}
    	}
    
    	/**
    	 * Fetches the raw response body of {@code url}, retrying on I/O errors.
    	 *
    	 * @return the response bytes, or an empty array once all attempts have failed
    	 */
    	private static byte[] download(String url) {
    		for (int attempt = 0; attempt < MAX_DOWNLOAD_ATTEMPTS; attempt++) {
    			try {
    				return Request.Get(url).execute().returnContent().asBytes();
    			} catch (IOException e) {
    				e.printStackTrace();
    			}
    		}
    		return new byte[0];
    	}
    
    	/**
    	 * Grabs the raw price images, then runs {@code OcrUtil.init} on the raw and char
    	 * directories (presumably to produce the per-character images that are labelled
    	 * by hand afterwards - confirm against the commons-simple-character-ocr docs).
    	 */
    	public static void main(String[] args) {
    		grabTrainImage("E:/test/crawler/newegg/raw/");
    		ocrUtil.init("E:/test/crawler/newegg/raw/", "E:/test/crawler/newegg/char/");
    	}
    
    }

    所有的价格图片都是由下面这些字符组成的:

    image

    手动将每张字符图片的文件名修改为该图片所表示的字符:

    image

    时间有限,只增加一个测试方法用来测试结果是否正确,完整代码如下:

    package org.cc11001100.t1;
    
    import cc11001100.ocr.OcrUtil;
    import com.alibaba.fastjson.JSON;
    import com.alibaba.fastjson.JSONObject;
    import org.apache.http.client.fluent.Request;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import javax.imageio.ImageIO;
    import java.awt.image.BufferedImage;
    import java.io.ByteArrayInputStream;
    import java.io.File;
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.util.UUID;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    
    /**
     * Crawler for Newegg China (http://www.newegg.cn/).
     *
     * <p>The product detail pages render the price as an image. This class can collect
     * such images as OCR training data and, once the character dictionary has been
     * prepared, read the price of a product by running OCR on its price image.
     *
     * @author CC11001100
     */
    public class NeweggCrawler {
    
    	/** Charset name used to decode every downloaded HTML page. */
    	private static final String PAGE_CHARSET = "gb2312";
    
    	/** How many times {@link #download(String)} tries a URL before giving up. */
    	private static final int MAX_DOWNLOAD_ATTEMPTS = 3;
    
    	/** CSS query locating the price image; "godds" is the site's own spelling of the class name. */
    	private static final String PRICE_IMAGE_QUERY = ".godds_info_data img[src~=PriceImage]";
    
    	private static OcrUtil ocrUtil;
    
    	static {
    		ocrUtil = new OcrUtil();
    		// Load the hand-labelled character images used for recognition.
    		ocrUtil.loadDictionaryMap("E:/test/crawler/newegg/char/");
    	}
    
    	/**
    	 * Collects price images from the first ten listing pages of the smartphone category.
    	 *
    	 * <p>Every product link found on a listing page is handed to a pool of ten worker
    	 * threads; each worker downloads the detail page, fetches its price image and stores
    	 * it as a PNG with a random UUID file name. The method blocks until all workers finish.
    	 *
    	 * @param saveBasePath prefix prepended to the generated file name; pass a directory
    	 *                     path ending with a separator to store the images in that directory
    	 */
    	public static void grabTrainImage(String saveBasePath) {
    		ExecutorService executorService = Executors.newFixedThreadPool(10);
    		String url = "http://www.newegg.cn/SubCategory/1043-%d.htm";
    		try {
    			for (int i = 1; i <= 10; i++) {
    				Document doc = getDocument(String.format(url, i));
    				doc.select(".catepro li p.title a").forEach(detailPageLinkElt -> {
    					String detailPageUrl = detailPageLinkElt.attr("href");
    					executorService.execute(() -> savePriceImage(detailPageUrl, saveBasePath));
    				});
    			}
    		} finally {
    			// Always stop accepting work, otherwise the non-daemon pool threads
    			// would keep the JVM alive if a listing page fails unexpectedly.
    			executorService.shutdown();
    		}
    		try {
    			executorService.awaitTermination(10, TimeUnit.DAYS);
    		} catch (InterruptedException e) {
    			// Cancel outstanding downloads and restore the interrupt flag for the caller.
    			executorService.shutdownNow();
    			Thread.currentThread().interrupt();
    		}
    	}
    
    	/**
    	 * Fetches one product detail page and extracts its title and OCR-recognised price,
    	 * as a quick check that the recognition works.
    	 *
    	 * @param detailPageUrl URL of the product detail page
    	 * @return a JSON object with {@code title} and, when the price image could be
    	 *         downloaded and recognised as a number, {@code price}
    	 */
    	public static JSONObject parse(String detailPageUrl) {
    		JSONObject product = new JSONObject();
    		Document doc = getDocument(detailPageUrl);
    
    		BufferedImage img = fetchPriceImage(doc);
    		if (img == null) {
    			System.err.println("No usable price image for " + detailPageUrl);
    		} else {
    			String priceText = ocrUtil.ocr(img);
    			try {
    				product.put("price", Double.parseDouble(priceText));
    			} catch (NumberFormatException e) {
    				// Leave the price out, as for a failed image download,
    				// instead of losing the rest of the product data.
    				System.err.println("OCR result is not a number: " + priceText);
    			}
    		}
    
    		String productTitle = doc.select("#productTitle").text();
    		product.put("title", productTitle);
    
    		return product;
    	}
    
    	/**
    	 * Downloads the price image of one product and writes it below {@code saveBasePath}.
    	 * Products without a usable price image are reported on stderr and skipped.
    	 */
    	private static void savePriceImage(String detailPageUrl, String saveBasePath) {
    		BufferedImage img = fetchPriceImage(getDocument(detailPageUrl));
    		if (img == null) {
    			System.err.println("No usable price image for " + detailPageUrl);
    			return;
    		}
    		File target = new File(saveBasePath + UUID.randomUUID().toString() + ".png");
    		File parentDir = target.getParentFile();
    		if (parentDir != null) {
    			// Result intentionally ignored: if the directory cannot be created,
    			// the write below fails with an IOException that is reported.
    			parentDir.mkdirs();
    		}
    		try {
    			ImageIO.write(img, "png", target);
    		} catch (IOException e) {
    			e.printStackTrace();
    		}
    	}
    
    	/**
    	 * Locates and downloads the price image referenced by a detail page.
    	 *
    	 * @return the decoded image, or {@code null} when the page has no price image,
    	 *         the download failed, or the bytes could not be decoded as an image
    	 */
    	private static BufferedImage fetchPriceImage(Document detailPage) {
    		String imgLink = detailPage.select(PRICE_IMAGE_QUERY).attr("src");
    		if (imgLink.isEmpty()) {
    			return null;
    		}
    		byte[] imgBytes = download(imgLink);
    		if (imgBytes.length == 0) {
    			return null;
    		}
    		try {
    			// ImageIO.read itself returns null when no reader understands the bytes.
    			return ImageIO.read(new ByteArrayInputStream(imgBytes));
    		} catch (IOException e) {
    			e.printStackTrace();
    			return null;
    		}
    	}
    
    	/**
    	 * Downloads {@code url} and parses the response as HTML.
    	 * A failed download yields an empty document rather than an exception.
    	 */
    	private static Document getDocument(String url) {
    		byte[] responseBytes = download(url);
    		try {
    			return Jsoup.parse(new String(responseBytes, PAGE_CHARSET));
    		} catch (UnsupportedEncodingException e) {
    			// Without the charset no page can be decoded, so fail loudly
    			// instead of handing a null string to the parser.
    			throw new IllegalStateException("Charset not supported: " + PAGE_CHARSET, e);
    		}
    	}
    
    	/**
    	 * Fetches the raw response body of {@code url}, retrying on I/O errors.
    	 *
    	 * @return the response bytes, or an empty array once all attempts have failed
    	 */
    	private static byte[] download(String url) {
    		for (int attempt = 0; attempt < MAX_DOWNLOAD_ATTEMPTS; attempt++) {
    			try {
    				return Request.Get(url).execute().returnContent().asBytes();
    			} catch (IOException e) {
    				e.printStackTrace();
    			}
    		}
    		return new byte[0];
    	}
    
    	/**
    	 * Runs the recognition against one sample product and prints the result as JSON.
    	 */
    	public static void main(String[] args) {
    		// One-off preparation steps, kept for reference; run them before the
    		// character images are labelled by hand.
    //		grabTrainImage("E:/test/crawler/newegg/raw/");
    //		new OcrUtil().init("E:/test/crawler/newegg/raw/", "E:/test/crawler/newegg/char/");
    
    		String url = "http://www.newegg.cn/Product/A28-032-7Q5.htm";
    		System.out.println(JSON.toJSONString(parse(url), true));
    	}
    
    }
    
  • 相关阅读:
    SpringBoot多数据库连接(mysql+oracle)
    SOAP与REST API的区别
    我与OAuth 2.0那点荒唐的小秘密
    我与Git的那些破事(下)--分支模型
    我与Git的那些破事(上)--代码管理
    Salesforce学习之路(十三)Aura案例实战分析
    Salesforce学习之路(十二)Aura组件表达式
    react 脚手架初次npm start时候运行报错
    兼容ie 提示用户升级浏览器 <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    一行代码实现网站一键变灰功能
  • 原文地址:https://www.cnblogs.com/cc11001100/p/8648320.html
Copyright © 2011-2022 走看看