zoukankan      html  css  js  c++  java
  • 爬虫

     1 import java.io.IOException;
     2 import java.util.Set;
     3 
     4 import org.htmlparser.Node;
     5 import org.htmlparser.Parser;
     6 import org.htmlparser.filters.TagNameFilter;
     7 import org.htmlparser.util.NodeList;
     8 import org.htmlparser.util.ParserException;
     9 
    10 
    11 public class wikiCrawler {
    12      private void initCrawlerWithSeeds(String[] seeds) {
    13         for(int i=0;i<seeds.length;i++){
    14             LinkQueue.addUnvisitedUrl(seeds[i]);
    15         }
    16     }
    17      public void crawling(String[] seeds) throws IOException, ParserException{
    18          initCrawlerWithSeeds(seeds);
    19          while(!LinkQueue.unVisitedUrlEmpty()&&LinkQueue.getVisitedUrlNum()<1000){
    20              String visitUrl=(String)LinkQueue.unVisitedUrlDequeue();
    21              
    22              TagNameFilter tagNameFilter=new TagNameFilter("title");
    23              
    24              
    25               DownLoadFile downLoadFile=new DownLoadFile("D://");
    26              String filepath=downLoadFile.downloadFile(visitUrl);
    27              System.out.println(filepath);
    28              if(filepath!=null){
    29              String contentString=HtmlContent.getHtml(filepath);
    30              
    31              NodeList list=new Parser(contentString).extractAllNodesThatMatch(tagNameFilter);
    32              String title=((Node)list.elementAt(0)).toPlainTextString();
    33              System.out.println(title);
    34              LinkQueue.addVisitedUrl(visitUrl);
    35              if(contentString!=null){
    36                  Set<String> linksSet=WikiParseHtml.extractLinkSet(contentString);
    37                  for(Object link:linksSet){
    38                      LinkQueue.addUnvisitedUrl((String) link);
    39                  }
    40              }
    41              }
    42          }
    43      }
    44      public static void main(String[] args) throws IOException, ParserException{
    45          wikiCrawler crawler=new wikiCrawler();
    46          crawler.crawling(new String[]{"http://free0007.iteye.com"});
    47      }
    48 }

    html content

     1 import java.io.BufferedReader;
     2 import java.io.DataOutputStream;
     3 import java.io.File;
     4 import java.io.FileInputStream;
     5 import java.io.FileOutputStream;
     6 import java.io.FileReader;
     7 import java.io.IOException;
     8 import java.io.InputStream;
     9 import java.io.InputStreamReader;
    10 import java.io.OutputStream;
    11 
    12 import org.apache.http.HttpEntity;
    13 import org.apache.http.client.ClientProtocolException;
    14 import org.apache.http.client.methods.CloseableHttpResponse;
    15 import org.apache.http.client.methods.HttpGet;
    16 import org.apache.http.impl.client.CloseableHttpClient;
    17 import org.apache.http.impl.client.HttpClients;
    18 import org.apache.http.util.EntityUtils;
    19 
    20 
    /**
     * Loads a previously saved page from disk as a single string.
     */
    public class HtmlContent {

        /**
         * Reads the file at {@code filepath} as UTF-8 text.
         *
         * <p>Lines are read one at a time and re-joined with a single
         * {@code '\n'} after every line (including the last), so the original
         * line terminators are normalised.
         *
         * @param filepath path of the file to read; may be {@code null}
         * @return the file content, or an empty string when {@code filepath} is
         *         {@code null} or the file cannot be read
         * @throws IOException declared for source compatibility; read failures
         *                     are reported on {@code System.err} and yield
         *                     {@code ""} instead
         */
        public static String getHtml(String filepath) throws IOException {
            if (filepath == null) {
                return "";
            }
            StringBuilder content = new StringBuilder();
            // try-with-resources closes the reader even when readLine() throws.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(filepath)), "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append('\n');
                }
                return content.toString();
            } catch (IOException e) {
                // Best effort: callers treat "" as "no content", but say why.
                System.err.println("Could not read " + filepath + ": " + e);
                return "";
            }
        }
    }
    View Code

    downloadfile

     1 import java.io.DataOutputStream;
     2 import java.io.File;
     3 import java.io.FileNotFoundException;
     4 import java.io.FileOutputStream;
     5 import java.io.IOException;
     6 import java.io.InputStream;
     7 import java.io.OutputStreamWriter;
     8 
     9 import org.apache.http.Header;
    10 import org.apache.http.HttpEntity;
    11 import org.apache.http.HttpStatus;
    12 import org.apache.http.client.ClientProtocolException;
    13 import org.apache.http.client.HttpClient;
    14 import org.apache.http.client.config.RequestConfig;
    15 import org.apache.http.client.config.RequestConfig.Builder;
    16 import org.apache.http.client.methods.CloseableHttpResponse;
    17 import org.apache.http.client.methods.HttpGet;
    18 import org.apache.http.impl.client.CloseableHttpClient;
    19 import org.apache.http.impl.client.HttpClients;
    20 import org.apache.http.util.EntityUtils;
    21 
    22 
    23 public class DownLoadFile {
    24     private String filepath="";
    25     public String getFileNameByUrl(String url, String contentType){
    26         url=url.substring(7);
    27         if(contentType.indexOf("html")!=-1){
    28             url=url.replaceAll("[\?/:*|<>"]", "_")+".html";
    29             return url;
    30         }
    31         //application/pdf
    32         else{
    33             return url.replaceAll("[\?/:*|<>"]", "_")+contentType.substring(contentType.indexOf("/")+1);
    34         }
    35     }
    36     public DownLoadFile(String filepath){
    37         this.filepath=filepath;
    38     }
    39     private void saveToLocal(InputStream is,String filePath) throws IOException{
    40         try {
    41             DataOutputStream outputStream=new DataOutputStream(new FileOutputStream(new File(filePath)));
    42             int len=0;
    43             byte[] buffer=new byte[1024];
    44             while((len=is.read(buffer))!=-1){
    45                 outputStream.write(buffer, 0, len);
    46             }
    47             outputStream.flush();
    48             outputStream.close();
    49         } catch (FileNotFoundException e) {
    50             
    51             e.printStackTrace();
    52         }
    53     }
    54     public String downloadFile(String url) throws IOException{
    55         String filePathString=null;
    56         CloseableHttpClient httpClient=HttpClients.createDefault();
    57         HttpGet httpGet=new HttpGet(url);
    58         RequestConfig requestConfig=RequestConfig.copy(RequestConfig.custom().build()).setConnectTimeout(5000).build();
    59         httpGet.setConfig(requestConfig);
    60         CloseableHttpResponse response=httpClient.execute(httpGet);
    61         try {
    62             String statusCode=response.getStatusLine().toString();
    63             if(Integer.parseInt(statusCode.split(" ")[1])!=HttpStatus.SC_OK){
    64                 System.err.println(url+" Failed:"+statusCode);
    65                 filePathString=null;
    66             }
    67             else {
    68                 HttpEntity entity=response.getEntity();
    69                 InputStream input= entity.getContent();
    70                 Header header= entity.getContentType();
    71                 filePathString=filepath+getFileNameByUrl(url, header.getValue());
    72                 
    73                 saveToLocal(input, filePathString);
    74                 
    75                 
    76             }
    77 
    78         } catch (ClientProtocolException e) {
    79             // TODO Auto-generated catch block
    80             e.printStackTrace();
    81         } catch (IOException e) {
    82             // TODO Auto-generated catch block
    83             e.printStackTrace();
    84         }finally{
    85             response.close();
    86             httpClient.close();
    87         }
    88         
    89         return filePathString;
    90     }
    91 
    92     
    93 }
    View Code

    wikiparsehtml

     1 import java.util.ArrayList;
     2 import java.util.HashSet;
     3 import java.util.Set;
     4 
     5 import org.htmlparser.Node;
     6 import org.htmlparser.NodeFilter;
     7 import org.htmlparser.Parser;
     8 import org.htmlparser.filters.NodeClassFilter;
     9 import org.htmlparser.tags.LinkTag;
    10 import org.htmlparser.util.NodeList;
    11 
    12 
    13 public class WikiParseHtml {
    14     public static Set<String> extractLinkSet(String content){
    15         Set<String> linksSet=new HashSet<String>();
    16         try {
    17             Parser parser= Parser.createParser(content, "utf-8");
    18             NodeClassFilter nodeClassFilter=new NodeClassFilter(LinkTag.class);
    19             NodeList list=parser.extractAllNodesThatMatch(nodeClassFilter);
    20             for(int i=0;i<list.size();i++){
    21                 Node tagNode=list.elementAt(i);
    22                 if(tagNode instanceof LinkTag){
    23                     LinkTag linkTag=(LinkTag)tagNode;
    24                     String urlString=linkTag.getLink();
    25                     if(urlString.startsWith("http://")){
    26                         linksSet.add(urlString);
    27                     }
    28                 }
    29             }
    30         } catch (Exception e) {
    31             e.printStackTrace();
    32         }
    33         return linksSet;
    34         
    35     }
    36 }
    View Code

    queue

     1 import java.util.LinkedList;
     2 
     3 
     /**
      * Minimal FIFO queue of arbitrary objects backed by a {@link LinkedList}.
      */
     public class Queue {

         /** Elements in arrival order; head is the next element to be removed. */
         private final LinkedList<Object> queue = new LinkedList<Object>();

         /**
          * Appends {@code t} at the tail of the queue.
          *
          * @param t element to add
          */
         public void enQueue(Object t) {
             queue.addLast(t);
         }

         /**
          * Removes and returns the element at the head of the queue.
          *
          * @return the oldest element
          * @throws java.util.NoSuchElementException if the queue is empty
          */
         public Object deQueue() {
             return queue.removeFirst();
         }

         /**
          * @return {@code true} when the queue holds no elements
          */
         public boolean isQueueEmpty() {
             return queue.isEmpty();
         }

         /**
          * @param t element to look for
          * @return {@code true} when an element equal to {@code t} is queued
          */
         public boolean contains(Object t) {
             return queue.contains(t);
         }

         /**
          * Alias of {@link #isQueueEmpty()}.
          *
          * @return {@code true} when the queue holds no elements
          */
         public boolean empty() {
             return isQueueEmpty();
         }
     }
    View Code

    linkqueue

     1 import java.util.HashSet;
     2 import java.util.Set;
     3 
     4 
     5 public class LinkQueue {
     6     private static Set visitedUrl=new HashSet();
     7     private static Queue unvisitedUrl=new Queue();
     8     public static Queue getUnvisitedUrl(){
     9         return unvisitedUrl;
    10     }
    11     public static void addVisitedUrl(String url){
    12         visitedUrl.add(url);
    13     }
    14     public static void removeVisitedUrl(String url){
    15         visitedUrl.remove(url);
    16     }
    17     public static Object unVisitedUrlDequeue(){
    18         return unvisitedUrl.deQueue();
    19     }
    20     public static void addUnvisitedUrl(String url){
    21         if(url!=null||!url.trim().equals("")||!visitedUrl.contains(url)||!unvisitedUrl.contains(url))
    22             unvisitedUrl.enQueue(url);
    23     }
    24     public static int getVisitedUrlNum(){
    25         return visitedUrl.size();
    26     }
    27     public static boolean unVisitedUrlEmpty(){
    28         return unvisitedUrl.isQueueEmpty();
    29     }
    30 }
    View Code
  • 相关阅读:
    Python判断列表是否已排序的各种方法及其性能分析
    Python实现C代码统计工具(四)
    Python代码统计工具
    Python实现C代码统计工具(三)
    Python实现C代码统计工具(一)
    Python标准输出重定向
    为C函数自动添加跟踪语句
    Python2.7字符编码详解
    Python实现Linux命令xxd -i功能
    1124 Raffle for Weibo Followers (20 分)
  • 原文地址:https://www.cnblogs.com/qianwei/p/4002160.html
Copyright © 2011-2022 走看看