管理系统中部分用户头像显示为“未识别图像”:CDN 路径下的图片文件本身可以正常访问,但其中较多图片是大小约 5 KB 的小图片。现需找出这些图片对应的用户。
注:本博只是为避免将来重复造轮子,不做额外赘述
思路
下载 id - img 一一对应的图片到本地
通过 OpenCV 将图片文件读取为 Mat 矩阵,转为灰度图后计算 dhash;发现未识别图片的 dhash 与基准 dhash 的汉明距离不超过 1(代码中的判定条件为 < 2)
故,基于dhash完成图片对应用户id的筛选即可
图片爬取
页面爬取
#! /usr/bin/env python
# -*- coding:utf-8 -*-
# __author__ = "NYA"
import urllib
import urllib.request
from bs4 import BeautifulSoup
import sys
import chardet


def download(url, path):
    """Download every image listed in the HTML table found at *url*.

    The page is fetched, its encoding is guessed with chardet, and the rows
    matched by the CSS selector ``table > tr`` are walked. The first row is
    skipped (treated as the header). For every other row the text of the
    first cell is used as the local file name and the text of the fifth cell
    as the image URL.

    Args:
        url: Address of the page that contains the table.
        path: Directory prefix (including the trailing separator) that the
            file name is appended to.

    Raises:
        urllib.error.URLError: If the page or one of the images cannot be
            fetched.
    """
    req = urllib.request.Request(url)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as response:
        content = response.read()

    # chardet reports {'encoding': None} when it cannot guess, so a
    # dict.get() default is never used; fall back explicitly.
    encoding = chardet.detect(content).get('encoding') or 'utf-8'
    # BeautifulSoup accepts str directly; re-encoding to the filesystem
    # encoding only risks losing characters.
    html = content.decode(encoding, 'ignore')

    soup = BeautifulSoup(html, 'html.parser')
    table_tr = soup.select('table > tr')
    print(len(table_tr))

    # Skip the first row, which the original loop ignored as the header.
    for row in table_tr[1:]:
        cells = row.find_all('td')
        if len(cells) < 5:
            # No URL column in this row: nothing to download.
            continue
        name = cells[0].get_text().strip()
        image_url = cells[4].get_text().strip()
        if not name or not image_url:
            continue
        print(image_url)
        urllib.request.urlretrieve(image_url, path + name)


if __name__ == '__main__':
    # 'aaa' looks like a placeholder for the real page address - TODO confirm.
    url = 'aaa'
    # Backslashes must be escaped: a trailing single backslash would escape
    # the closing quote, and '\u' would start a unicode escape.
    path = 'G:\\images\\used\\'
    download(url, path)
csv爬取
#!/usr/bin/python3
# -*- coding:utf8 -*-
import csv
import imghdr
import urllib.request

# Backslashes are escaped on purpose: a bare '\r' is a carriage return, '\u'
# starts a unicode escape and a trailing '\' would escape the closing quote.
path = 'G:\\images\\used\\'
csv_path = 'G:\\images\\robot1.csv'
error_log_path = 'G:\\images\\check_error.txt'


def main():
    """Download the image of every CSV row and log the ones that are not images.

    The first CSV line is skipped as a header. For each remaining row, column
    0 is used as the file name and column 2 as the image URL. The file is
    saved as ``path + name + '.jpg'`` and then checked with ``imghdr.what``;
    when no image type is recognised, ``name,url,local_path`` is appended to
    the error log and the URL is collected in the list printed at the end.

    A failure on one row is reported and does not stop the run.
    """
    error_images = []
    counter = 0
    # The CSV and the error log need distinct handles: sharing one name made
    # every log write hit the read-only CSV handle and fail silently.
    with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file, \
            open(error_log_path, 'w', encoding='utf-8') as error_log:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for fields in reader:
            try:
                name = fields[0]
                # Strip so a trailing space or line ending never reaches the request.
                url = fields[2].strip()
                local_path = path + name + '.jpg'
                print(url)
                print(local_path)
                urllib.request.urlretrieve(url, local_path)
                # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
                if imghdr.what(local_path) is None:
                    error_log.write(','.join((name, url, local_path)) + '\n')
                    error_images.append(url)
            except Exception as exc:
                # Best effort: report the bad row and carry on with the next.
                print(fields, exc)
            print(counter)
            counter += 1
    print(counter)
    print(error_images)


if __name__ == '__main__':
    main()
dhash筛选
public class ImageCheck { static { System.loadLibrary(Core.NATIVE_LIBRARY_NAME); } public static void main(String[] args) throws IOException { // Mat mat = Imgcodecs.imread("G:\images\used\4.jpg"); // Mat mat1 = Imgcodecs.imread("G:\images\used\7.jpg"); // Integer dhashGRAY28 = DhashDetector.getDhashGRAY28(mat); // Integer dhashGRAY281 = DhashDetector.getDhashGRAY28(mat1); // System.out.println(dhashGRAY28); // System.out.println(dhashGRAY281); File file = new File("G:\images\used"); File[] tempFiles = file.listFiles(); List<String> params = new ArrayList<>(); Integer dhash = 268423200; for (int i = 0 ; i < tempFiles.length;i++) { if (tempFiles[i].isFile()){ params.add(tempFiles[i].getPath()); } } BufferedWriter bwScdPca = new BufferedWriter(new FileWriter("G:\images\error1.txt")); List<Integer> res = new ArrayList<>(); for (String path:params) { String[] split = path.split("\\"); String now = split[split.length - 1].replace(".jpg", ""); System.out.println(now); Mat mat = Imgcodecs.imread(path); try { Integer dhashGRAY28 = DhashDetector.getDhashGRAY28(mat); if (DhashDetector.calcHammingDistance(dhash,dhashGRAY28) < 2) { res.add(Integer.parseInt(now)); } } catch (Exception e) { System.out.println(e); } mat.release(); } res = res.stream().sorted().collect(Collectors.toList()); System.out.println(res); for (Integer re : res) { bwScdPca.write(re+""); bwScdPca.newLine(); bwScdPca.flush(); } bwScdPca.close(); } }