zoukankan      html  css  js  c++  java
  • 爬取自己的csdn目录

    csdn目录的分页:
    https://blog.csdn.net/qq_38366063/article/list/1
    URL 末尾的数字就是页码(从 1 开始),简单起见,直接循环递增页码逐页抓取就好了。

    导包就不细说了:

    	<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
    		<dependency>
    			<groupId>net.sourceforge.htmlunit</groupId>
    			<artifactId>htmlunit</artifactId>
    			<version>2.35.0</version>
    		</dependency>
    		<!-- 解析html -->
    		<dependency>
    			<groupId>org.jsoup</groupId>
    			<artifactId>jsoup</artifactId>
    			<version>1.11.3</version>
    		</dependency>
    	<dependency>
    			<groupId>fr.opensagres.xdocreport</groupId>
    			<artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
    			<version>2.0.1</version>
    		</dependency>
    		
    		<!-- 阿里JSON解析器 -->
    		<dependency>
    			<groupId>com.alibaba</groupId>
    			<artifactId>fastjson</artifactId>
    			<version>1.2.31</version>
    		</dependency>
    
    		<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
    		<dependency>
    			<groupId>org.apache.commons</groupId>
    			<artifactId>commons-text</artifactId>
    			<version>1.4</version>
    		</dependency>
    
    
     /**
      * Crawls the article-list pages of one CSDN blog and prints a Markdown link per article.
      *
      * <p>Builds the list-page URL for the hard-coded user name, then calls
      * {@code getCSDNArticleUrlList2} once per page. An {@link IOException} on one page is
      * reported and the remaining pages are still processed.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         String name = "qq_38366063";
         String url = "https://blog.csdn.net/" + name + "/article/list/";

         // Number of list pages to crawl.
         int pageCount = 14;
         // One list shared by all pages, so the collected URLs are not thrown away per page.
         List<String> urlList = new ArrayList<String>();

         // List pages are addressed as .../article/list/1, .../article/list/2, ... so the
         // page index starts at 1; starting at 0 would request a non-existent page 0 and
         // never reach the last page.
         for (int i = 1; i <= pageCount; i++) {
             String oneUrl = url + i;
             try {
                 getCSDNArticleUrlList2(name, oneUrl, urlList);
             } catch (IOException e) {
                 // Report the failure but keep crawling the remaining pages.
                 e.printStackTrace();
             }
         }
     }
    
        /**
         * Fetches one article-list page and emits a Markdown link for each article on it.
         *
         * <p>The page is downloaded, parsed with Jsoup, and every {@code div.article-item-box}
         * inside the first {@code div.article-list} is inspected. For each item whose
         * {@code h4 a} link contains {@code name}, a line of the form {@code [title](href)}
         * is printed to standard output and the href is appended to {@code urlList}.
         * If the page has no {@code div.article-list} element the method returns without
         * printing or adding anything.
         *
         * @param name    blog user name; only links whose href contains it are kept
         * @param oneUrl  URL of the list page to fetch
         * @param urlList receives the href of every kept article
         * @throws IOException if fetching or reading the page fails
         */
        public static void getCSDNArticleUrlList2(String name, String oneUrl, List<String> urlList)
                throws FailingHttpStatusCodeException, MalformedURLException, IOException {
            String content;
            // Close the response stream as soon as the body has been read.
            try (InputStream inputStream = HttpUtil.doGet(oneUrl)) {
                content = StreamUtil.inputStreamToString(inputStream, "UTF-8");
            }

            Document doc = Jsoup.parse(content);
            Element articleList = doc.select("div.article-list").first();
            if (articleList == null) {
                return;
            }

            for (Element item : articleList.select("div.article-item-box")) {
                Element linkNode = item.select("h4 a").first();
                if (linkNode == null) {
                    // Item without a title link: nothing to print.
                    continue;
                }
                String href = linkNode.attr("href");
                // Author's note: for unknown reasons the first entry of every blog list is
                // https://blog.csdn.net/yoyo_liyy/article/details/82762601, so links that do
                // not belong to this user are filtered out.
                if (!href.contains(name)) {
                    continue;
                }
                // The title is taken from the second direct text node of the link;
                // fall back to the link's full text when that node is absent.
                List<TextNode> textNodes = linkNode.textNodes();
                String title = textNodes.size() > 1 ? textNodes.get(1).toString() : linkNode.text();
                System.out.println("[" + title + "](" + href + ")");
                urlList.add(href);
            }
        }
    

    用到的两个工具方法:HttpUtil 里发起 GET 请求的 doGet,以及把输入流转成字符串的 inputStreamToString:

     /**
      * Opens a connection to {@code urlstr} and returns the response body stream.
      *
      * <p>The request carries a fixed desktop-Chrome {@code User-Agent} and an {@code accept}
      * header, then any caller-supplied {@code headers} (which may override those two), and
      * finally an {@code x-forwarded-for} header holding a randomly generated dotted quad
      * whose four parts each lie in the range 100-199.
      *
      * <p>The returned stream is still open; the caller is responsible for closing it.
      *
      * @param urlstr  absolute URL to request
      * @param headers extra request headers, or {@code null} for none
      * @return the open response body stream
      * @throws IOException if the URL is malformed, the connection fails, or a timeout expires
      */
     public static InputStream doGet(String urlstr, Map<String, String> headers) throws IOException {
         URL url = new URL(urlstr);
         HttpURLConnection conn = (HttpURLConnection) url.openConnection();

         // The default timeout of 0 means "wait forever"; bound both phases so a stalled
         // server cannot hang the caller indefinitely.
         conn.setConnectTimeout(10000);
         conn.setReadTimeout(30000);

         conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 "
                 + "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
         conn.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
                 + "image/apng,*/*;q=0.8");

         if (headers != null) {
             for (Map.Entry<String, String> header : headers.entrySet()) {
                 conn.setRequestProperty(header.getKey(), header.getValue());
             }
         }

         // Random dotted quad, each part in 100..199, sent as the x-forwarded-for value.
         Random random = new Random();
         StringBuilder ip = new StringBuilder();
         for (int part = 0; part < 4; part++) {
             if (part > 0) {
                 ip.append('.');
             }
             ip.append(random.nextInt(100) + 100);
         }
         conn.setRequestProperty("x-forwarded-for", ip.toString());

         return conn.getInputStream();
     }
    
      /**
       * Reads {@code is} until end of stream and decodes the bytes as text.
       *
       * <p>All bytes are collected first and decoded in a single step, so multi-byte
       * characters are decoded correctly wherever they fall in the stream. The stream is
       * not closed by this method.
       *
       * @param is      stream to read to the end
       * @param charset name of the charset used to decode the bytes, e.g. {@code "UTF-8"}
       * @return the decoded text; empty when the stream has no data
       * @throws IOException if reading fails or {@code charset} is not supported
       */
      public static String inputStreamToString(InputStream is, String charset) throws IOException {
          // Decoding each 1024-byte chunk on its own corrupts any multi-byte character that
          // straddles a chunk boundary, so buffer the raw bytes and decode once at the end.
          java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
          byte[] chunk = new byte[1024];
          int byteLength;
          while ((byteLength = is.read(chunk)) != -1) {
              collected.write(chunk, 0, byteLength);
          }
          return collected.toString(charset);
      }
    
    

    爬取结果:
    在这里插入图片描述
    然后直接拷贝到 CSDN 的写文章编辑器里就可以了:
    https://blog.csdn.net/qq_38366063/article/details/101760545

    世界上所有的不公平都是由于当事人能力不足造成的.
  • 相关阅读:
    linux常用命令
    mysql 开发基础系列20 事务控制和锁定语句(上)
    sql server 性能调优之 资源等待 CXPACKET
    mysql 开发基础系列19 触发器
    mysql 开发基础系列18 存储过程和函数(下)
    mysql 开发基础系列17 存储过程和函数(上)
    sql server 性能调优之 资源等待PAGEIOLATCH
    mysql 开发基础系列16 视图
    mysql 开发基础系列15 索引的设计和使用
    sql server 性能调优之 当前用户请求分析 (1)
  • 原文地址:https://www.cnblogs.com/javayida/p/13346843.html
Copyright © 2011-2022 走看看