zoukankan      html  css  js  c++  java
  • 博客园博文爬取 标签爬取(含源代码)

    爬取思路:

    1,在首页上爬取这些推荐博文:https://www.cnblogs.com/

     2,根据这些推荐博文,进一步进入发布它们的博主的主页中:

     3,爬取标签时可以查看这些博主的标签页:只需在博主主页地址后加上 /tag/ 即可跳转到标签页

     4,如果要爬取内容的话,就可以进入这些博主的所有页面中进行爬取

    下面是我的代码:

      1 package use;
      2 
      3 import java.sql.Connection;
      4 import java.sql.PreparedStatement;
      5 import java.util.ArrayList;
      6 import java.util.Date;
      7 import java.util.List;
      8 
      9 import com.dao.ClarifyDao;
     10 import com.dao.InfoDao;
     11 import org.jsoup.Jsoup;
     12 import org.jsoup.nodes.Document;
     13 
     14 import us.codecraft.webmagic.Page;
     15 import us.codecraft.webmagic.Site;
     16 import us.codecraft.webmagic.Spider;
     17 import us.codecraft.webmagic.processor.PageProcessor;
     18 
     19 public class 博客园内容 implements PageProcessor {
     20     static int nn=0;
     21     static String regEx="[
    `'' ]";
     22    // static String regEx="[
    `~!@#$%^&()+=|{}':;',\[\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?? ]";
     23     static String aa = "";//这里是将特殊字符换为aa字符串," "代表直接去掉
     24     private static Connection conn = null;
     25 
     26     private static PreparedStatement ps = null;
     27     // 标题和链接获取
     28 
     29     private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk";
     30 
     31     private static String TITLE = "div.post h1 a.postTitle2";
     32     // 作者
     33 
     34     private static String AUTHORQUERY = "div.post_item_foot a.lightblue ";
     35 
     36 
     37     //初始化带爬取网页地址
     38     private static List<String> urls() {
     39         List listUrl=new ArrayList<String>();
     40         for(int i=1;i<=200;i++) {
     41             listUrl.add("https://www.cnblogs.com/sitehome/p/"+i);
     42 
     43         }
     44         listUrl.toArray(new String[listUrl.size()]);
     45         return listUrl;
     46     }
     47     private static void add_urls_child(Page page) {
     48         List listUrl=new ArrayList<String>();
     49         listUrl= page.getHtml().xpath("//*[@id="post_list"]//*/div[2]/div/a//@href").all();
     50 
     51         listUrl.toArray(new String[listUrl.size()]);
     52         page.addTargetRequests(listUrl);
     53 
     54     }
     55 
     56     private static void add_urls_child_page(Page page) {
     57         List listUrl=new ArrayList<String>();
     58         listUrl= page.getHtml().xpath("//div[@class="postTitle"]/a//@href").all();
     59 
     60         listUrl.toArray(new String[listUrl.size()]);
     61         page.addTargetRequests(listUrl);
     62 
     63     }
     64 
     65     //jsoup根据html字符串和语法来获取内容
     66     private static String selectDocumentText(String htmlText,String Query) {
     67         Document doc=Jsoup.parse(htmlText);
     68         String select=doc.select(Query).text();
     69         return select;
     70     }
     71 
     72     //jsoup根据html字符串和语法获取链接地址
     73     private static String selectDocumentLink(String htmlText,String Query) {
     74         Document doc=Jsoup.parse(htmlText);
     75         String select=doc.select(Query).attr("href");
     76         return select;
     77     }
     78 
     79     @Override
     80     public Site getSite() {
     81         return Site.me().setSleepTime(1000).setRetryTimes(10);
     82     }
     83 
     84     //编写抽取逻辑
     85     @Override
     86     public void process(Page page) {
     87         nn=nn+1;
     88         if(nn==1)
     89         {
     90             System.out.println("TTTTTTTTTTTTT");
     91             page.addTargetRequests(urls());
     92         }
     93 
     94         String str = page.getUrl().get();
     95 
     96         if(str.matches("https://www.cnblogs.com/sitehome/p/[0-9]+"))
     97         {
     98             System.out.println("AAAAA");
     99             add_urls_child(page);
    100         }
    101         else if(str.matches("https://www.cnblogs.com/[A-Za-z0-9_-]+/"))
    102         {
    103             System.out.println("BBBBBBB");
    104             add_urls_child_page(page);
    105         }else
    106         {
    107             System.out.println("DDDDDD");
    108 
    109             String title=page.getHtml().xpath("//*[@id='cb_post_title_url']//text()").get();
    110 
    111             String URL=page.getUrl().get();
    112 
    113 
    114 
    115             String author=page.getHtml().xpath("//*[@id='Header1_HeaderTitle']//text()").get();
    116             List<String> values=new ArrayList<String>();
    117             values=page.getHtml().xpath("//*[@id='cnblogs_post_body']//*//text()").all();
    118             String info="";
    119             for(String value:values)
    120             {
    121                 info+=value;
    122             }
    123             info=info.replaceAll(regEx, aa);
    124             System.out.println("Title:	"+title);
    125             System.out.println("AUTHOR:	"+author);
    126             System.out.println(  "VALUE:	"+info);
    127             System.out.println("URL:	"+URL);
    128             ClarifyDao.add("blog_info","",title,author,info,URL);
    129 
    130         }
    131 
    132 
    133 
    134 
    135  /*
    136         //定义如何抽取页面信息
    137 
    138         List<String> htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all();
    139 
    140        // List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>();
    141         for(String html:htmls) {
    142           //  JavaBokeModel javaBoke=new JavaBokeModel();
    143             //标题和链接
    144             String title=selectDocumentText(html,TITLEQUERY);
    145 
    146             String linke=selectDocumentLink(html,TITLEQUERY);
    147             //作者和作者主页
    148             String author=selectDocumentText(html,AUTHORQUERY);
    149 
    150             System.out.println(
    151                     "TITLE	"+title+
    152                             "Link	"+linke+
    153                             "Author	"+author
    154             );
    155 
    156 
    157 
    158         }
    159  */
    160         //File.WriteStringToFile2(javaBokes);
    161 
    162 
    163     }
    164 
    165     public static void main(String[] args) {
    166         long startTime,endTime;
    167         //DBUtil.getConnection();
    168         startTime=new Date().getTime();
    169         InfoDao.delete("blog_info");
    170         Spider create=Spider.create(new 博客园内容());
    171         create.addUrl("https://www.cnblogs.com/").thread(5).run();
    172         try {
    173             ps.close();
    174             conn.close();
    175         }catch(Exception e) {
    176 
    177         }
    178         endTime=new Date().getTime();
    179         System.out.println("用时为:"+(endTime-startTime)/1000+"s");
    180 
    181     }
    182 
    183 }
    博文内容代码
      1 package use;
      2 
      3 import java.sql.Connection;
      4 import java.sql.PreparedStatement;
      5 import java.util.ArrayList;
      6 import java.util.Date;
      7 import java.util.List;
      8 
      9 import com.dao.InfoDao;
     10 import org.jsoup.Jsoup;
     11 import org.jsoup.nodes.Document;
     12 
     13 import us.codecraft.webmagic.Page;
     14 import us.codecraft.webmagic.Site;
     15 import us.codecraft.webmagic.Spider;
     16 import us.codecraft.webmagic.processor.PageProcessor;
     17 
     18 public class 博客园标签 implements PageProcessor {
     19     static int nn=0;
     20     private static Connection conn = null;
     21 
     22     private static PreparedStatement ps = null;
     23     // 标题和链接获取
     24 
     25     private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk";
     26 
     27     private static String TITLE = "div.post h1 a.postTitle2";
     28     // 作者
     29 
     30     private static String AUTHORQUERY = "div.post_item_foot a.lightblue ";
     31 
     32 
     33     //初始化带爬取网页地址
     34     private static List<String> urls() {
     35         List listUrl=new ArrayList<String>();
     36         for(int i=2;i<=200;i++) {
     37             listUrl.add("https://www.cnblogs.com/sitehome/p/"+i);
     38 
     39         }
     40         listUrl.toArray(new String[listUrl.size()]);
     41         return listUrl;
     42     }
     43     private static void add_urls_child(Page page) {
     44         List listUrl=new ArrayList<String>();
     45         List<String> Urls=new ArrayList<String>();
     46         Urls= page.getHtml().xpath("//*[@id="post_list"]//*/div[2]/div/a//@href").all();
     47 
     48         for(String ur:Urls)
     49         {
     50             ur+="tag/";
     51             listUrl.add(ur);
     52         }
     53         listUrl.toArray(new String[listUrl.size()]);
     54         page.addTargetRequests(listUrl);
     55 
     56     }
     57 
     58     //jsoup根据html字符串和语法来获取内容
     59     private static String selectDocumentText(String htmlText,String Query) {
     60         Document doc=Jsoup.parse(htmlText);
     61         String select=doc.select(Query).text();
     62         return select;
     63     }
     64 
     65     //jsoup根据html字符串和语法获取链接地址
     66     private static String selectDocumentLink(String htmlText,String Query) {
     67         Document doc=Jsoup.parse(htmlText);
     68         String select=doc.select(Query).attr("href");
     69         return select;
     70     }
     71 
     72     @Override
     73     public Site getSite() {
     74         return Site.me().setSleepTime(1000).setRetryTimes(10);
     75     }
     76 
     77     //编写抽取逻辑
     78     @Override
     79     public void process(Page page) {
     80         nn=nn+1;
     81         if(nn==1)
     82         {
     83             page.addTargetRequests(urls());
     84         }
     85         if(page.getUrl().regex("https://www.cnblogs.com/sitehome/p/[0-9]+").match())
     86         {
     87             add_urls_child(page);
     88         }
     89 
     90         else
     91         {
     92             System.out.println("DDDDDD");
     93 
     94             String title=page.getHtml().xpath("//*[@id="Header1_HeaderTitle"]//text()").get();
     95             String URL=page.getUrl().get();
     96             System.out.println("Title:	"+title);
     97             System.out.println("URL:	"+URL);
     98             List<String> tags=new ArrayList<String>();
     99             tags=page.getHtml().xpath("//*[@id="MyTag1_dtTagList"]/tbody//a//text()").all();
    100             for(String tag:tags)
    101             {
    102                 System.out.println(
    103                         "TAG:	"+tag
    104                 );
    105                 InfoDao.add("blog",tag,title,URL);
    106             }
    107 
    108 
    109         }
    110 
    111 
    112 
    113 
    114  /*
    115         //定义如何抽取页面信息
    116 
    117         List<String> htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all();
    118 
    119        // List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>();
    120         for(String html:htmls) {
    121           //  JavaBokeModel javaBoke=new JavaBokeModel();
    122             //标题和链接
    123             String title=selectDocumentText(html,TITLEQUERY);
    124 
    125             String linke=selectDocumentLink(html,TITLEQUERY);
    126             //作者和作者主页
    127             String author=selectDocumentText(html,AUTHORQUERY);
    128 
    129             System.out.println(
    130                     "TITLE	"+title+
    131                             "Link	"+linke+
    132                             "Author	"+author
    133             );
    134 
    135 
    136 
    137         }
    138  */
    139         //File.WriteStringToFile2(javaBokes);
    140 
    141 
    142     }
    143 
    144     public static void main(String[] args) {
    145         long startTime,endTime;
    146         //DBUtil.getConnection();
    147         startTime=new Date().getTime();
    148 
    149         Spider create=Spider.create(new 博客园标签());
    150         create.addUrl("http://www.cnblogs.com/").thread(5).run();
    151         try {
    152             ps.close();
    153             conn.close();
    154         }catch(Exception e) {
    155 
    156         }
    157         endTime=new Date().getTime();
    158         System.out.println("用时为:"+(endTime-startTime)/1000+"s");
    159 
    160     }
    161 
    162 }
    标签代码
  • 相关阅读:
    通过异常处理错误-2
    通过异常处理错误-1
    线程池
    Synchronized
    持有对象-4
    持有对象-3
    持有对象-2 迭代器深入理解
    ServletContextListener
    持有对象-1
    行为参数化
  • 原文地址:https://www.cnblogs.com/smartisn/p/12250572.html
Copyright © 2011-2022 走看看