zoukankan      html  css  js  c++  java
  • java爬虫 案例

    package com.zjazn;
    
    import com.sun.org.apache.bcel.internal.generic.RETURN;
    import com.sun.xml.internal.ws.api.server.InstanceResolver;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClientBuilder;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import sun.net.www.http.HttpClient;
    
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.net.URLConnection;
    import java.util.ArrayList;
    import java.util.List;
    
    public class Data {
    
       //2、解析数据
        /**
         * Entry point: fetches the course-path listing page, extracts every course
         * card whose trailing label contains a course count ("N门"), downloads the
         * card's cover image into {@code E://myimg}, and prints the collected records.
         *
         * <p>Cards that lack the expected markup or whose count cannot be parsed are
         * skipped instead of aborting the whole run; a card without an image URL is
         * still recorded but nothing is downloaded for it.
         *
         * @param args command-line arguments (unused)
         */
        public static void main(String[] args) {
            String html = getData();
            if (html == null) {
                // getData() signals a failed or non-200 request with null.
                System.err.println("Failed to fetch the course page; nothing to parse.");
                return;
            }

            Document document = Jsoup.parse(html);
            List<MyData> myData = new ArrayList<MyData>();
            Elements courses = document.select(".learn-path-container>div");
            for (Element course : courses) {
                Elements labels = course.select("a>div");
                if (labels.isEmpty()) {
                    // first()/last() return null on an empty selection.
                    continue;
                }
                String courseName = labels.first().text();
                String courseNum = labels.last().text();

                // Only cards whose last label looks like "<number>门" are courses.
                int unitIndex = courseNum.indexOf("门");
                if (unitIndex < 0) {
                    continue;
                }
                int num;
                try {
                    num = Integer.parseInt(courseNum.substring(0, unitIndex).trim());
                } catch (NumberFormatException e) {
                    System.err.println("Skipping course with unparsable count: " + courseNum);
                    continue;
                }

                String imgPath = course.select("a>img").attr("src");

                MyData record = new MyData();
                record.setName(courseName);
                record.setImgPath(imgPath);
                record.setNum(num);
                myData.add(record);

                if (imgPath.isEmpty()) {
                    System.err.println("No image URL for course: " + courseName);
                    continue;
                }
                downloadFile(imgPath, "E://myimg", toSafeFileName(courseName) + extensionOf(imgPath));
            }
            System.out.println(myData.toString());
        }

        /**
         * Returns the extension (including the leading dot) of the last path segment
         * of {@code url}, ignoring any query string or fragment, or {@code ""} when
         * that segment has no dot.
         */
        private static String extensionOf(String url) {
            int end = url.length();
            int query = url.indexOf('?');
            if (query >= 0) {
                end = query;
            }
            int fragment = url.indexOf('#');
            if (fragment >= 0 && fragment < end) {
                end = fragment;
            }
            String path = url.substring(0, end);
            int dot = path.lastIndexOf('.');
            // A dot before the last '/' belongs to the host or a directory, not the file.
            return dot > path.lastIndexOf('/') ? path.substring(dot) : "";
        }

        /**
         * Replaces characters that are path separators or are not allowed in Windows
         * file names, so scraped text cannot escape the target directory.
         */
        private static String toSafeFileName(String name) {
            return name.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
        }
      //1、获取数据(html)
    public static String getData(){ CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet("https://www.lanqiao.cn/paths/"); CloseableHttpResponse response=null; HttpEntity entity=null; String html=null; try { response = httpClient.execute(httpGet);//发送请求 if(response.getStatusLine().getStatusCode() ==200){ entity = response.getEntity();//获取html html= EntityUtils.toString(entity,"UTF-8");//用指定编码解析html } return html; } catch (IOException e) { e.printStackTrace(); } return null; }
    //传入资源链接,下载资源的方法,比如下载图片   
    public static void downloadFile(String urlStr,String directory,String fileName){//#有些页面抓取不了,网站设置了反爬,拒绝反爬取,请看最下面
            FileOutputStream out =null;
            InputStream in=null;
            try {
                URL url=new URL(urlStr);
                URLConnection urlConnection = url.openConnection();
                in=urlConnection.getInputStream();
                byte[] buf=new byte[1000];
                File dir = new File(directory);
                if(!dir.exists() ){
                    dir.mkdir();
                }
                out=new FileOutputStream(directory+"\"+fileName);
                int len=-1;
                while ((len=in.read(buf))!=-1){
                    out.write(buf,0,len);
    
                }
    
            } catch (MalformedURLException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }finally {
                try {
                    if(in != null){
                        in.close();
                    }
                    if (out !=null){
                        out.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    
    }
    package com.zjazn;
    
    import lombok.Data;
    
    /**
     * One scraped course entry.
     *
     * <p>Lombok's {@code @Data} generates the getters, setters,
     * {@code equals}/{@code hashCode} and {@code toString} used by the crawler.
     */
    @Data
    public class MyData {
        // Course title, taken from the first "a>div" label of a course card.
        private String name;
        // Value of the card image's "src" attribute.
        private String imgPath;
        // Number parsed from the text preceding "门" in the card's last label.
        private Integer num;
    
    }

    ## 应对反爬取(模拟浏览器请求)

    /**
     * Downloads the page at {@code TargetUrl} while presenting a desktop-browser
     * {@code User-Agent}, intended for sites that reject requests which do not
     * look like they come from a browser.
     *
     * @param TargetUrl absolute HTTP(S) URL of the page to fetch
     * @return the response body decoded as UTF-8 (an empty string for an empty body)
     * @throws IOException if the URL is malformed, the connection cannot be
     *                     established, or the body cannot be read
     */
    public static String getData(String TargetUrl) throws IOException {
        URL url = new URL(TargetUrl);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            // Mimic a browser so the request is not refused as a crawler.
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");

            // Accumulate into a builder: starting from a null String and using +=
            // would prefix the page with the literal text "null".
            StringBuilder html = new StringBuilder();
            try (BufferedReader reader =
                         new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                char[] chars = new char[1024];
                int len;
                while ((len = reader.read(chars)) != -1) {
                    html.append(chars, 0, len);
                }
            }
            return html.toString();
        } finally {
            conn.disconnect();
        }
    }
  • 相关阅读:
    財智V6.0(完美破解序列号特别版)
    垂死挣扎还是涅槃重生 -- Delphi XE5 公布会归来感想
    HDU1006
    HDU 1385 Minimum Transport Cost 最短路径题解
    fast-json.jar的用法
    curl命令具体解释
    mysql很全的和完整的总结
    MongoDB入门简单介绍
    Tuxedo入门学习
    BP神经网络基本原理
  • 原文地址:https://www.cnblogs.com/zjazn/p/14188395.html
Copyright © 2011-2022 走看看