zoukankan      html  css  js  c++  java
  • 爬取图片(搜狗)

    package com.cxqy.officialserver.dto.personalsub;
    import org.apache.http.Header;
    import org.apache.http.HttpEntity;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.methods.HttpUriRequest;
    import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
    import org.apache.http.conn.ssl.TrustStrategy;
    import org.apache.http.entity.StringEntity;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.ssl.SSLContextBuilder;
    import org.apache.http.util.EntityUtils;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;

    import javax.net.ssl.HostnameVerifier;
    import javax.net.ssl.SSLContext;
    import javax.net.ssl.SSLSession;
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.security.GeneralSecurityException;
    import java.security.cert.CertificateException;
    import java.security.cert.X509Certificate;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    /**
    * @Author yjl
    * @Date 2021/12/31 14:56
    * @Version 1.0
    */



    public class HttpClientUtils {

    public static Map<String, List<String>> convertHeaders(Header[] headers) {
    Map<String, List<String>> results = new HashMap<String, List<String>>();
    for (Header header : headers) {
    List<String> list = results.get(header.getName());
    if (list == null) {
    list = new ArrayList<String>();
    results.put(header.getName(), list);
    }
    list.add(header.getValue());
    }
    return results;
    }

    /**
    * httpget请求
    * @param url
    */
    public static String get(String url) {
    return get(url, "UTF-8");
    }

    public static Logger logger = LoggerFactory.getLogger(HttpClientUtils.class);

    /**
    * httpget请求
    * @param url
    */
    public static String get(String url, String charset) {
    HttpGet httpGet = new HttpGet(url);
    return executeRequest(httpGet, charset);
    }

    /**
    * httpget请求,增加异步请求头参数
    * @param url
    */
    public static String ajaxGet(String url) {
    return ajaxGet(url, "UTF-8");
    }

    /**
    * httpget请求,增加异步请求头参数
    *
    * @param url
    */
    public static String ajaxGet(String url, String charset) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
    return executeRequest(httpGet, charset);
    }

    /**
    * @param url
    * @return
    */
    public static String ajaxGet(CloseableHttpClient httpclient, String url) {
    HttpGet httpGet = new HttpGet(url);
    httpGet.setHeader("X-Requested-With", "XMLHttpRequest");
    return executeRequest(httpclient, httpGet, "UTF-8");
    }

    /**
    * httppost请求,传递map格式参数
    */
    public static String post(String url, Map<String, String> dataMap) {
    return post(url, dataMap, "UTF-8");
    }

    /**
    * httppost请求,传递map格式参数
    */
    public static String post(String url, Map<String, String> dataMap, String charset) {
    HttpPost httpPost = new HttpPost(url);
    try {
    if (dataMap != null) {
    List<NameValuePair> nvps = new ArrayList<NameValuePair>();
    for (Map.Entry<String, String> entry : dataMap.entrySet()) {
    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
    }
    UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);
    formEntity.setContentEncoding(charset);
    httpPost.setEntity(formEntity);
    }
    } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
    }
    return executeRequest(httpPost, charset);
    }

    /**
    * httppost请求,增加异步请求头参数,传递map格式参数
    */
    public static String ajaxPost(String url, Map<String, String> dataMap) {
    return ajaxPost(url, dataMap, "UTF-8");
    }

    /**
    * httppost请求,增加异步请求头参数,传递map格式参数
    */
    public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("X-Requested-With", "XMLHttpRequest");
    try {
    if (dataMap != null) {
    List<NameValuePair> nvps = new ArrayList<NameValuePair>();
    for (Map.Entry<String, String> entry : dataMap.entrySet()) {
    nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
    }
    UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nvps, charset);
    formEntity.setContentEncoding(charset);
    httpPost.setEntity(formEntity);
    }
    } catch (UnsupportedEncodingException e) {
    e.printStackTrace();
    }
    return executeRequest(httpPost, charset);
    }

    /**
    * httppost请求,增加异步请求头参数,传递json格式参数
    */
    public static String ajaxPostJson(String url, String jsonString) {
    return ajaxPostJson(url, jsonString, "UTF-8");
    }

    /**
    * httppost请求,增加异步请求头参数,传递json格式参数
    */
    public static String ajaxPostJson(String url, String jsonString, String charset) {
    HttpPost httpPost = new HttpPost(url);
    httpPost.setHeader("X-Requested-With", "XMLHttpRequest");

    StringEntity stringEntity = new StringEntity(jsonString, charset);// 解决中文乱码问题
    stringEntity.setContentEncoding(charset);
    stringEntity.setContentType("application/json");
    httpPost.setEntity(stringEntity);
    return executeRequest(httpPost, charset);
    }

    /**
    * 执行一个http请求,传递HttpGetHttpPost参数
    */
    public static String executeRequest(HttpUriRequest httpRequest) {
    return executeRequest(httpRequest, "UTF-8");
    }

    /**
    * 执行一个http请求,传递HttpGetHttpPost参数
    */
    public static String executeRequest(HttpUriRequest httpRequest, String charset) {
    CloseableHttpClient httpclient;
    if ("https".equals(httpRequest.getURI().getScheme())) {
    httpclient = createSSLInsecureClient();
    } else {
    httpclient = HttpClients.createDefault();
    }
    String result = "";
    try {
    try {
    CloseableHttpResponse response = httpclient.execute(httpRequest); HttpEntity entity =
    null;
    try {
    entity = response.getEntity(); result = EntityUtils.
    toString(entity, charset);
    } finally {
    EntityUtils.consume(entity); response.close
    ();
    }
    } finally {
    httpclient.close();
    }
    } catch (IOException ex) {
    ex.printStackTrace();
    }
    return result;
    }

    public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {
    String result = "";
    try {
    try {
    CloseableHttpResponse response = httpclient.execute(httpRequest); HttpEntity entity =
    null;
    try {
    entity = response.getEntity(); result = EntityUtils.
    toString(entity, charset);
    } finally {
    EntityUtils.consume(entity); response.close
    ();
    }
    } finally {
    httpclient.close();
    }
    } catch (IOException ex) {
    ex.printStackTrace();
    }
    return result;
    }

    /**
    * 创建 SSL连接
    */
    public static CloseableHttpClient createSSLInsecureClient() {
    try {
    SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(new TrustStrategy() {
    @Override
    public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
    return true;
    }
    }).build(); SSLConnectionSocketFactory sslsf =
    new SSLConnectionSocketFactory(sslContext, new HostnameVerifier() {
    @Override
    public boolean verify(String hostname, SSLSession session) {
    return true;
    }
    });
    return HttpClients.custom().setSSLSocketFactory(sslsf).build();
    } catch (GeneralSecurityException ex) {
    throw new RuntimeException(ex);
    }
    }
    }



    ----------------------------------

    package com.cxqy.officialserver.dto.personalsub;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.List;
    import java.util.Objects;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicInteger;
    /**
     * Downloads pictures from a list of URLs into {@code <path>/<category>/}.
     *
     * <p>Success and failure counters are shared by every call made on the same instance.
     *
     * @Author yjl
     * @Date 2021/12/31 14:54
     * @Version 1.0
     */
    public class SougouImgPipeline {

        private static final int CONNECT_TIMEOUT_MILLIS = 5000;
        private static final int READ_TIMEOUT_MILLIS = 5000;

        /** Upper bound for the sanitized base name, to stay well below common file-name limits. */
        private static final int MAX_BASE_NAME_LENGTH = 120;

        /** Extension (with leading dot) used when the URL does not end in a recognizable one. */
        private String extension = ".jpg";
        private String path;

        private final AtomicInteger suc = new AtomicInteger();
        private final AtomicInteger fails = new AtomicInteger();

        /** Creates a pipeline that stores pictures under {@code E:/pipeline/sougous}. */
        public SougouImgPipeline() {
            this("E:/pipeline/sougous");
        }

        /**
         * Creates a pipeline that stores pictures under {@code path}.
         *
         * @param path root directory for downloads
         */
        public SougouImgPipeline(String path) {
            setPath(path);
        }

        /**
         * Creates a pipeline with a custom fallback extension.
         *
         * @param path      root directory for downloads
         * @param extension fallback extension such as {@code ".png"}; a missing leading dot is
         *                  added, and {@code null} or an empty value keeps the default {@code ".jpg"}
         */
        public SougouImgPipeline(String path, String extension) {
            this(path);
            if (extension != null && !extension.isEmpty()) {
                this.extension = extension.startsWith(".") ? extension : "." + extension;
            }
        }

        /**
         * Sets the root directory for downloads.
         *
         * @param path root directory
         */
        public void setPath(String path) {
            this.path = path;
        }

        /**
         * Downloads one picture to {@code <path>/<cate>/<name><ext>}.
         *
         * <p>The download is skipped when the target file already exists. When it fails midway
         * the partial file is removed, so a later run does not mistake it for a finished picture.
         *
         * @param url  picture URL
         * @param cate category, used as the sub-directory name
         * @param name base file name; characters unsuitable for a file name are replaced
         * @throws Exception if the directory cannot be created or the transfer fails
         */
        private void downloadImg(String url, String cate, String name) throws Exception {
            File dir = new File(this.path + "/" + cate + "/");
            // mkdirs() also returns false when another thread created the directory first.
            if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
                throw new IllegalStateException("cannot create directory " + dir);
            }
            String fileName = toSafeFileName(name) + resolveExtension(url);
            File img = new File(dir, fileName);
            if (img.exists()) { // already downloaded earlier: skip
                System.out.println(String.format("文件%s已存在本地目录", fileName));
                return;
            }

            URLConnection con = new URL(url).openConnection();
            con.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            con.setReadTimeout(READ_TIMEOUT_MILLIS);
            try (InputStream inputStream = con.getInputStream();
                 FileOutputStream os = new FileOutputStream(img)) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = inputStream.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            } catch (Exception e) {
                // Streams are already closed here; drop the incomplete file (if any was created).
                img.delete();
                throw e;
            }
            System.out.println("picUrl: " + url);
            System.out.println(String.format("正在下载第%s张图片", suc.incrementAndGet()));
        }

        /**
         * Returns the extension (with leading dot) of the last path segment of {@code url},
         * ignoring any query string or fragment, or the configured fallback extension when the
         * segment has no short alphanumeric extension.
         */
        private String resolveExtension(String url) {
            String trimmed = url;
            int query = trimmed.indexOf('?');
            if (query >= 0) {
                trimmed = trimmed.substring(0, query);
            }
            int fragment = trimmed.indexOf('#');
            if (fragment >= 0) {
                trimmed = trimmed.substring(0, fragment);
            }
            String lastSegment = trimmed.substring(trimmed.lastIndexOf('/') + 1);
            int dot = lastSegment.lastIndexOf('.');
            if (dot >= 0) {
                String candidate = lastSegment.substring(dot);
                if (candidate.matches("\\.[A-Za-z0-9]{1,8}")) {
                    return candidate;
                }
            }
            return extension;
        }

        /**
         * Turns an arbitrary string (possibly a whole URL) into a file-name-safe base name:
         * {@code '-'} is dropped, every other character outside {@code [A-Za-z0-9._]} becomes
         * {@code '_'}, and overly long names are shortened to a hash prefix plus their tail.
         */
        private static String toSafeFileName(String name) {
            String safe = name.replace("-", "").replaceAll("[^A-Za-z0-9._]", "_");
            if (safe.length() > MAX_BASE_NAME_LENGTH) {
                safe = Integer.toHexString(name.hashCode()) + "_"
                        + safe.substring(safe.length() - MAX_BASE_NAME_LENGTH);
            }
            return safe;
        }

        /** Formats a list index as a decimal number left-padded with zeros to at least 4 digits. */
        private static String indexName(int index) {
            String digits = Integer.toString(index);
            StringBuilder padded = new StringBuilder();
            for (int i = digits.length(); i < 4; i++) {
                padded.append('0');
            }
            return padded.append(digits).toString();
        }

        /** Counts a failed download and reports it on standard error. */
        private void recordFailure(String url, Exception cause) {
            fails.incrementAndGet();
            System.err.println("download failed: " + url + " (" + cause + ")");
        }

        /**
         * Downloads every URL on the calling thread; each file is named after its URL.
         *
         * @param data picture URLs; {@code null} entries are skipped
         * @param word category, used as the sub-directory name
         */
        public void process(List<String> data, String word) {
            long start = System.currentTimeMillis();
            for (String picUrl : data) {
                if (picUrl == null) {
                    continue;
                }
                try {
                    downloadImg(picUrl, word, picUrl);
                } catch (Exception e) {
                    recordFailure(picUrl, e);
                }
            }
            System.out.println("下载成功: " + suc.get());
            System.out.println("下载失败: " + fails.get());
            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "");
        }


        /**
         * Downloads every URL on a cached thread pool, one task per URL, and waits up to 60
         * seconds for the tasks to finish. Files are named after the zero-padded list index.
         *
         * @param data picture URLs; {@code null} entries are skipped
         * @param word category, used as the sub-directory name
         */
        public void processSync(List<String> data, String word) {
            Objects.requireNonNull(data, "data");
            long start = System.currentTimeMillis();
            ExecutorService executorService = Executors.newCachedThreadPool();
            for (int i = 0; i < data.size(); i++) {
                final String picUrl = data.get(i);
                if (picUrl == null) {
                    continue;
                }
                final String name = indexName(i);
                executorService.execute(() -> {
                    try {
                        downloadImg(picUrl, word, name);
                    } catch (Exception e) {
                        recordFailure(picUrl, e);
                    }
                });
            }
            executorService.shutdown();
            try {
                if (executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                    System.out.println("AwaitTermination Finished");
                } else {
                    // Running downloads are deliberately not interrupted (no shutdownNow()).
                    System.out.println("AwaitTermination timed out, downloads still running");
                }
                System.out.println("共有URL: " + data.size());
                System.out.println("下载成功: " + suc);
                System.out.println("下载失败: " + fails);

                // list() returns null when the directory was never created (e.g. nothing to download).
                String[] files = new File(this.path + "/" + word + "/").list();
                int len = files == null ? 0 : files.length;
                System.out.println("当前共有文件: " + len);

                long end = System.currentTimeMillis();
                System.out.println("耗时:" + (end - start) / 1000.0 + "");
            } catch (InterruptedException e) {
                // Restore the flag so callers can see that this thread was interrupted.
                Thread.currentThread().interrupt();
                System.err.println("interrupted while waiting for downloads: " + e);
            }

        }


        /**
         * Splits the URLs into {@code threadNum} contiguous segments and runs
         * {@link #process(List, String)} for each segment on its own pool thread. This method
         * returns right after submitting the segments; it does not wait for them.
         *
         * <p>When there are fewer URLs than threads, or {@code threadNum} is not positive, the
         * whole list is processed on the calling thread instead.
         *
         * @param data      picture URLs
         * @param word      category, used as the sub-directory name
         * @param threadNum number of segments / worker threads
         */
        public void processSync2(List<String> data, final String word, int threadNum) {
            if (threadNum <= 0 || data.size() < threadNum) {
                process(data, word);
                return;
            }
            ExecutorService executorService = Executors.newCachedThreadPool();
            int num = data.size() / threadNum; // URLs per segment; the last segment takes the rest
            for (int i = 0; i < threadNum; i++) {
                int start = i * num;
                int end = (i == threadNum - 1) ? data.size() : (i + 1) * num;
                final List<String> cutList = data.subList(start, end);
                executorService.execute(() -> process(cutList, word));
            }
            executorService.shutdown();
        }
    }



    --------------------------



    package com.cxqy.officialserver.dto.personalsub;
    import com.alibaba.fastjson.JSONObject;


    import java.util.ArrayList;
    import java.util.List;
    /**
     * Fetches pages of picture search results as JSON, collects the {@code picUrl} of every
     * returned item and hands the collected URLs to a {@link SougouImgPipeline} for download.
     *
     * @Author yjl
     * @Date 2021/12/31 14:53
     * @Version 1.0
     */
    public class SougouImgProcessor {

        /** Format string with three {@code %s} slots: start index, page size, query word. */
        private final String url;
        private final SougouImgPipeline pipeline;
        /** Every result item received so far. */
        private final List<JSONObject> dataList;
        /** The {@code picUrl} value of every item in {@link #dataList}, in the same order. */
        private final List<String> urlList;
        private final String word;

        /**
         * @param url  request URL format taking start index, page size and query word, in that order
         * @param word search word; also passed to the pipeline as the download category
         */
        public SougouImgProcessor(String url, String word) {
            this.url = url;
            this.word = word;
            this.pipeline = new SougouImgPipeline();
            this.dataList = new ArrayList<>();
            this.urlList = new ArrayList<>();
        }

        /**
         * Requests one page of results and appends its items to the collected data.
         *
         * <p>A page with an empty body, or without a {@code data.items} array, is reported on
         * standard error and skipped. Malformed JSON still surfaces as the parser's exception.
         *
         * @param idx  start index of the page
         * @param size number of items requested
         */
        public void process(int idx, int size) {
            String res = HttpClientUtils.get(String.format(this.url, idx, size, encodeQuery(this.word)));
            if (res == null || res.isEmpty()) {
                // HttpClientUtils reports I/O failures as an empty body.
                System.err.println("empty response for start=" + idx + ", size=" + size);
                return;
            }
            JSONObject object = JSONObject.parseObject(res);
            JSONObject data = object == null ? null : object.getJSONObject("data");
            List<Object> items = data == null ? null : data.getJSONArray("items");
            if (items == null) {
                System.err.println("no data.items in response for start=" + idx + ", size=" + size);
                return;
            }
            for (Object element : items) {
                if (!(element instanceof JSONObject)) {
                    continue;
                }
                JSONObject item = (JSONObject) element;
                this.urlList.add(item.getString("picUrl"));
                this.dataList.add(item);
            }
        }

        /** Downloads every collected URL using the multi-threaded pipeline. */
        public void pipelineData() {
            pipeline.processSync(this.urlList, this.word);
        }

        /**
         * Percent-encodes a query-string value as UTF-8 so that spaces or characters such as
         * {@code &} in the search word cannot corrupt the request URL.
         */
        private static String encodeQuery(String value) {
            if (value == null) {
                return null;
            }
            try {
                return java.net.URLEncoder.encode(value, "UTF-8");
            } catch (java.io.UnsupportedEncodingException e) {
                throw new IllegalStateException("UTF-8 is not supported", e);
            }
        }


        public static void main(String[] args) {
            String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
            SougouImgProcessor processor = new SougouImgProcessor(url, "明星");

            // Start index, page size, and total number of results to request.
            int start = 0, size = 50, limit = 1000;

            for (int i = start; i < start + limit; i += size) {
                processor.process(i, size);
            }

            processor.pipelineData();

        }
    }



    =====================

    爬取图片(搜狗)
  • 相关阅读:
    fork和Vfork的区别
    exer4.13.c(undone)
    Exer4.6.c(undone)
    好习惯
    c语言中的register修饰符
    请教如何在页面之间传递dataSet?不用session
    ultraEdite编辑shell或perl程序时注意
    PowerBuilder程序中取数据库中值,值异常(正数变成负数或异常)
    pb程序的编译发布
    关于sql server2000 的1068 与 1069 问题
  • 原文地址:https://www.cnblogs.com/yangsanluo/p/15753201.html
Copyright © 2011-2022 走看看