  • Crawler

import java.io.IOException;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class wikiCrawler {

    // Seed the queue of URLs that have not been visited yet.
    private void initCrawlerWithSeeds(String[] seeds) {
        for (int i = 0; i < seeds.length; i++) {
            LinkQueue.addUnvisitedUrl(seeds[i]);
        }
    }

    // Breadth-first crawl: dequeue a URL, download it, print its <title>,
    // then enqueue every absolute link found on the page. Stops after 1000 pages.
    public void crawling(String[] seeds) throws IOException, ParserException {
        initCrawlerWithSeeds(seeds);
        while (!LinkQueue.unVisitedUrlEmpty() && LinkQueue.getVisitedUrlNum() < 1000) {
            String visitUrl = (String) LinkQueue.unVisitedUrlDequeue();

            TagNameFilter tagNameFilter = new TagNameFilter("title");

            // Download the page to disk and remember where it was saved.
            DownLoadFile downLoadFile = new DownLoadFile("D://");
            String filepath = downLoadFile.downloadFile(visitUrl);
            System.out.println(filepath);
            if (filepath != null) {
                String contentString = HtmlContent.getHtml(filepath);

                // Extract and print the page title, if the page has one.
                NodeList list = new Parser(contentString).extractAllNodesThatMatch(tagNameFilter);
                if (list.size() > 0) {
                    String title = ((Node) list.elementAt(0)).toPlainTextString();
                    System.out.println(title);
                }
                LinkQueue.addVisitedUrl(visitUrl);

                // Collect the outgoing links and add them to the unvisited queue.
                if (contentString != null && !contentString.isEmpty()) {
                    Set<String> linksSet = WikiParseHtml.extractLinkSet(contentString);
                    for (String link : linksSet) {
                        LinkQueue.addUnvisitedUrl(link);
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ParserException {
        wikiCrawler crawler = new wikiCrawler();
        crawler.crawling(new String[]{"http://free0007.iteye.com"});
    }
}

    HtmlContent

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class HtmlContent {

    // Read a downloaded HTML file from disk and return its contents as a UTF-8 string.
    // Returns an empty string if the file cannot be read.
    public static String getHtml(String filepath) throws IOException {
        try {
            BufferedReader bis = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(filepath)), "UTF-8"));
            StringBuffer stringBuffer = new StringBuffer(250000);
            String szTemp;

            while ((szTemp = bis.readLine()) != null) {
                stringBuffer.append(szTemp + "\n");
            }
            bis.close();
            return stringBuffer.toString();
        } catch (Exception e) {
            // Swallow read errors and return an empty page so the crawler can move on.
            return "";
        }
    }

    /*public static void main(String[] args) {
        try {
            System.out.print(HtmlContent.getHtml("D://zh.wikipedia.org_wiki_Wikipedia_%E9%A6%96%E9%A1%B5.html"));
        } catch (IOException e) {
            System.out.print("error");
            e.printStackTrace();
        }
    }*/
}
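
For comparison, the same file-to-string read can be written with the JDK's NIO API (Java 7 or newer). This is only a sketch of an equivalent helper (the class name HtmlContentNio is made up here), not part of the original project:

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class HtmlContentNio {

    // Equivalent of HtmlContent.getHtml using java.nio; returns "" on any read error.
    // Note: unlike getHtml above, this preserves the file's original line endings.
    public static String getHtml(String filepath) {
        try {
            byte[] bytes = Files.readAllBytes(Paths.get(filepath));
            return new String(bytes, StandardCharsets.UTF_8);
        } catch (Exception e) {
            return "";
        }
    }
}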

    DownLoadFile

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

public class DownLoadFile {

    private String filepath = "";

    public DownLoadFile(String filepath) {
        this.filepath = filepath;
    }

    // Turn a URL into a legal local file name: strip "http://", replace characters
    // that are illegal in file names, and append an extension based on the content type.
    public String getFileNameByUrl(String url, String contentType) {
        url = url.substring(7); // drop the "http://" prefix
        if (contentType.indexOf("html") != -1) {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        } else {
            // e.g. application/pdf -> .pdf
            return url.replaceAll("[\\?/:*|<>\"]", "_") + "." + contentType.substring(contentType.indexOf("/") + 1);
        }
    }

    // Copy the response body stream to the given local file.
    private void saveToLocal(InputStream is, String filePath) throws IOException {
        try {
            DataOutputStream outputStream = new DataOutputStream(new FileOutputStream(new File(filePath)));
            int len = 0;
            byte[] buffer = new byte[1024];
            while ((len = is.read(buffer)) != -1) {
                outputStream.write(buffer, 0, len);
            }
            outputStream.flush();
            outputStream.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    // Fetch a URL with HttpClient and save the response body to disk.
    // Returns the local file path, or null if the request failed.
    public String downloadFile(String url) throws IOException {
        String filePathString = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).build();
        httpGet.setConfig(requestConfig);
        CloseableHttpResponse response = httpClient.execute(httpGet);
        try {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println(url + " Failed: " + statusCode);
                filePathString = null;
            } else {
                HttpEntity entity = response.getEntity();
                InputStream input = entity.getContent();
                Header header = entity.getContentType();
                filePathString = filepath + getFileNameByUrl(url, header.getValue());
                saveToLocal(input, filePathString);
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            response.close();
            httpClient.close();
        }
        return filePathString;
    }
}
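
A minimal usage sketch for DownLoadFile on its own. The demo class, the example URL, and the temp-directory output path are placeholders for illustration, not part of the original project:

import java.io.IOException;

public class DownLoadFileDemo {
    public static void main(String[] args) throws IOException {
        // Save pages into the system temp directory instead of the hard-coded "D://".
        String dir = System.getProperty("java.io.tmpdir") + "/";
        DownLoadFile downloader = new DownLoadFile(dir);
        String saved = downloader.downloadFile("http://example.com/");
        System.out.println(saved == null ? "download failed" : "saved to " + saved);
    }
}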

    WikiParseHtml

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class WikiParseHtml {

    // Parse an HTML string and return the set of absolute http:// links it contains.
    public static Set<String> extractLinkSet(String content) {
        Set<String> linksSet = new HashSet<String>();
        try {
            Parser parser = Parser.createParser(content, "utf-8");
            NodeClassFilter nodeClassFilter = new NodeClassFilter(LinkTag.class);
            NodeList list = parser.extractAllNodesThatMatch(nodeClassFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tagNode = list.elementAt(i);
                if (tagNode instanceof LinkTag) {
                    LinkTag linkTag = (LinkTag) tagNode;
                    String urlString = linkTag.getLink();
                    if (urlString.startsWith("http://")) {
                        linksSet.add(urlString);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return linksSet;
    }
}
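
A quick self-contained check of extractLinkSet; the driver class and the HTML snippet are made up for illustration:

import java.util.Set;

public class WikiParseHtmlDemo {
    public static void main(String[] args) {
        String html = "<html><body>"
                + "<a href=\"http://example.com/a\">A</a>"
                + "<a href=\"/relative\">ignored</a>"
                + "</body></html>";
        Set<String> links = WikiParseHtml.extractLinkSet(html);
        System.out.println(links); // expected: [http://example.com/a]
    }
}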

    Queue

import java.util.LinkedList;

// A simple FIFO queue used to hold URLs that have not been visited yet.
public class Queue {

    private LinkedList<Object> queue = new LinkedList<Object>();

    public void enQueue(Object t) {
        queue.addLast(t);
    }

    public Object deQueue() {
        return queue.removeFirst();
    }

    public boolean isQueueEmpty() {
        return queue.isEmpty();
    }

    public boolean contains(Object t) {
        return queue.contains(t);
    }

    // Same as isQueueEmpty(); kept for compatibility with callers of the original class.
    public boolean empty() {
        return queue.isEmpty();
    }
}
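
The class above is a thin wrapper around LinkedList holding raw Object elements. A type-safe equivalent built on java.util.ArrayDeque could look roughly like the sketch below (TypedQueue is hypothetical and not used elsewhere in this code):

import java.util.ArrayDeque;
import java.util.Deque;

public class TypedQueue<T> {

    private final Deque<T> queue = new ArrayDeque<T>();

    public void enQueue(T t)      { queue.addLast(t); }
    public T deQueue()            { return queue.removeFirst(); }
    public boolean isQueueEmpty() { return queue.isEmpty(); }
    public boolean contains(T t)  { return queue.contains(t); }
}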

    LinkQueue

import java.util.HashSet;
import java.util.Set;

// Tracks which URLs have been visited and which are still waiting to be crawled.
public class LinkQueue {

    private static Set<String> visitedUrl = new HashSet<String>();
    private static Queue unvisitedUrl = new Queue();

    public static Queue getUnvisitedUrl() {
        return unvisitedUrl;
    }

    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    public static Object unVisitedUrlDequeue() {
        return unvisitedUrl.deQueue();
    }

    // Only enqueue a URL that is non-null, non-blank, and has not been seen before,
    // either as a visited page or as a pending entry in the queue.
    public static void addUnvisitedUrl(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedUrl.contains(url) && !unvisitedUrl.contains(url)) {
            unvisitedUrl.enQueue(url);
        }
    }

    public static int getVisitedUrlNum() {
        return visitedUrl.size();
    }

    public static boolean unVisitedUrlEmpty() {
        return unvisitedUrl.isQueueEmpty();
    }
}
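
A quick sanity check of the deduplication in addUnvisitedUrl; the driver class below is hypothetical and not part of the original project:

public class LinkQueueDemo {
    public static void main(String[] args) {
        LinkQueue.addUnvisitedUrl("http://example.com/");
        LinkQueue.addUnvisitedUrl("http://example.com/"); // duplicate, should be ignored
        LinkQueue.addUnvisitedUrl("");                     // blank, should be ignored
        int count = 0;
        while (!LinkQueue.unVisitedUrlEmpty()) {
            System.out.println(LinkQueue.unVisitedUrlDequeue());
            count++;
        }
        System.out.println(count); // expected: 1
    }
}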
  • Original source: https://www.cnblogs.com/qianwei/p/4002160.html