zoukankan      html  css  js  c++  java
  • 爬取自己的csdn目录

    csdn目录的分页:
    https://blog.csdn.net/qq_38366063/article/list/1
    URL 末尾的数字就是页码(从 1 开始),简单起见,直接让页码递增、逐页遍历就好了。

    导包就不细说了:

    	<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
    		<dependency>
    			<groupId>net.sourceforge.htmlunit</groupId>
    			<artifactId>htmlunit</artifactId>
    			<version>2.35.0</version>
    		</dependency>
    		<!-- HTML parsing (Jsoup is used by the crawler code to parse the list pages) -->
    		<dependency>
    			<groupId>org.jsoup</groupId>
    			<artifactId>jsoup</artifactId>
    			<version>1.11.3</version>
    		</dependency>
    	<!-- NOTE(review): not referenced by the crawler code shown in this article; confirm it is needed -->
    	<dependency>
    			<groupId>fr.opensagres.xdocreport</groupId>
    			<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
    			<version>2.0.1</version>
    		</dependency>
    		
    		<!-- Alibaba JSON parser -->
    		<dependency>
    			<groupId>com.alibaba</groupId>
    			<artifactId>fastjson</artifactId>
    			<version>1.2.31</version>
    		</dependency>
    
    		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
    		<dependency>
    			<groupId>org.apache.commons</groupId>
    			<artifactId>commons-text</artifactId>
    			<version>1.4</version>
    		</dependency>
    
    
     /**
      * Entry point: walks the paginated article list of one CSDN user and prints
      * every article found as a Markdown link (printing is done by
      * {@code getCSDNArticleUrlList2}).
      *
      * <p>List pages are addressed as {@code .../article/list/<page>} with pages
      * numbered from 1, so the loop runs from 1 to {@code pageCount} inclusive.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         String name = "qq_38366063";
         String url = "https://blog.csdn.net/" + name + "/article/list/";

         // Number of list pages to crawl.
         int pageCount = 14;
         // One list shared by all pages so the collected URLs accumulate
         // instead of being thrown away after each page.
         List<String> urlList = new ArrayList<String>();

         for (int page = 1; page <= pageCount; page++) {
             String oneUrl = url + page;
             try {
                 getCSDNArticleUrlList2(name, oneUrl, urlList);
             } catch (IOException e) {
                 // A failure on one page should not abort the remaining pages.
                 e.printStackTrace();
             }
         }
     }
    
        /**
         * Downloads one article-list page, prints each article that belongs to
         * {@code name} as a Markdown link ({@code [title](href)}) and appends its
         * URL to {@code urlList}.
         *
         * <p>Returns silently when the page has no {@code div.article-list} element.
         *
         * @param name    blog user name; only links whose href contains it are kept
         * @param oneUrl  URL of the list page to fetch
         * @param urlList receives the href of every accepted article
         * @throws IOException if the page cannot be fetched or read
         */
        public static void getCSDNArticleUrlList2(String name, String oneUrl, List<String> urlList)
                throws FailingHttpStatusCodeException, MalformedURLException, IOException {
            String content;
            // try-with-resources: the response stream was previously never closed.
            try (InputStream inputStream = HttpUtil.doGet(oneUrl)) {
                content = StreamUtil.inputStreamToString(inputStream, "UTF-8");
            }

            Document doc = Jsoup.parse(content);
            Element articleList = doc.select("div.article-list").first();
            if (articleList == null) {
                return;
            }

            for (Element item : articleList.select("div.article-item-box")) {
                Element linkNode = item.select("h4 a").first();
                if (linkNode == null) {
                    // Item without a title link: nothing to report.
                    continue;
                }
                String href = linkNode.attr("href");
                // For unknown reasons the first entry of every list page is a foreign
                // article (https://blog.csdn.net/yoyo_liyy/article/details/82762601),
                // so keep only links that contain the user's name.
                if (!href.contains(name)) {
                    continue;
                }
                // The title is taken from the anchor's second text node
                // (presumably the first one precedes a badge element -- confirm against the page markup).
                // Fall back to the anchor's full text when that node is absent
                // instead of failing with IndexOutOfBoundsException.
                List<TextNode> titleParts = linkNode.textNodes();
                String title = titleParts.size() > 1
                        ? String.valueOf(titleParts.get(1))
                        : linkNode.text();
                System.out.println("[" + title + "](" + href + ")");
                urlList.add(href);
            }
        }
    

    用到的两个工具方法:HttpUtil 里的 doGet,以及把输入流转成字符串的 inputStreamToString(上面调用的单参数 doGet(oneUrl) 原文未贴出):

     /**
      * Opens an HTTP connection to {@code urlstr} and returns the response stream.
      *
      * <p>Request headers are applied in this order, later values replacing
      * earlier ones for the same key: a fixed browser-like {@code User-Agent}
      * and {@code accept}, then the caller's {@code headers}, then an
      * {@code x-forwarded-for} made of four random numbers in the range 100..199.
      *
      * @param urlstr  address to request
      * @param headers extra request headers, may be {@code null}
      * @return the connection's input stream
      * @throws IOException if the URL is malformed or the connection fails
      */
     public static InputStream doGet(String urlstr, Map<String, String> headers) throws IOException {
         HttpURLConnection connection = (HttpURLConnection) new URL(urlstr).openConnection();

         final String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 "
                 + "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36";
         final String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
                 + "image/apng,*/*;q=0.8";
         connection.setRequestProperty("User-Agent", userAgent);
         connection.setRequestProperty("accept", accept);

         if (headers != null) {
             for (Map.Entry<String, String> header : headers.entrySet()) {
                 connection.setRequestProperty(header.getKey(), header.getValue());
             }
         }

         // Build the randomised dotted-quad sent as x-forwarded-for.
         Random generator = new Random();
         StringBuilder fakeIp = new StringBuilder();
         for (int octet = 0; octet < 4; octet++) {
             if (octet > 0) {
                 fakeIp.append('.');
             }
             fakeIp.append(generator.nextInt(100) + 100);
         }
         connection.setRequestProperty("x-forwarded-for", fakeIp.toString());

         return connection.getInputStream();
     }
    
      /**
       * Reads {@code is} to the end and decodes all bytes as text in {@code charset}.
       *
       * <p>The bytes are collected first and decoded once. Decoding each read
       * chunk on its own corrupts any multi-byte character (e.g. a Chinese
       * character in UTF-8) that happens to straddle a chunk boundary.
       *
       * <p>The stream is not closed here; closing it is left to the caller.
       *
       * @param is      stream to read until end of stream
       * @param charset name of the charset used to decode the bytes
       * @return the decoded content, empty if the stream has no data
       * @throws IOException if reading fails or {@code charset} is not supported
       */
      public static String inputStreamToString(InputStream is, String charset) throws IOException {
            java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[1024];
            int read;
            while ((read = is.read(chunk)) != -1) {
                collected.write(chunk, 0, read);
            }
            return collected.toString(charset);
        }
    
    

    爬取结果:
    在这里插入图片描述
    然后直接拷贝到 CSDN 的写文章编辑器里就可以了:
    https://blog.csdn.net/qq_38366063/article/details/101760545

    世界上所有的不公平都是由于当事人能力不足造成的.
  • 相关阅读:
    CentOS7 PXE安装批量安装操作系统
    004_MySQL 主从配置
    CentOS 桥接网卡配置
    玩转 Jupyter Notebook (CentOS)
    搭建专属于自己的Leanote云笔记本
    wetty 安装(web+tty)
    wget命令详解
    linux 下find---xargs以及find--- -exec结合使用
    Linux 交换分区swap
    Linux 时区的修改
  • 原文地址:https://www.cnblogs.com/javayida/p/13346843.html
Copyright © 2011-2022 走看看