  • A web crawler in Java

    To crawl pages, judging from the tutorials I have read, you generally use htmlparser to parse the HTML and extract a page's links, and HttpClient to fetch the page data. A minimal sketch of the two libraries working together comes first (see below), followed by the Spider class I wrote.
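    Here is a minimal sketch of how the two libraries divide the work, assuming commons-httpclient 3.x and htmlparser 1.6 are on the classpath; the class name and URL are placeholders:

    package com.openzone.search.spider;
    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.methods.GetMethod;
    import org.htmlparser.Parser;
    import org.htmlparser.filters.NodeClassFilter;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeList;
    public class FetchAndParseDemo {
        public static void main(String[] args) throws Exception {
            // 1. fetch the raw page with commons-httpclient
            HttpClient client = new HttpClient();
            GetMethod get = new GetMethod("http://localhost/openzone/"); // placeholder URL
            System.out.println("HTTP status: " + client.executeMethod(get));
            String html = get.getResponseBodyAsString();
            get.releaseConnection();
            // 2. hand the HTML to htmlparser and pull out the <a> tags
            Parser parser = Parser.createParser(html, "UTF-8");
            NodeList anchors = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
            for (int i = 0; i < anchors.size(); i++) {
                System.out.println(((LinkTag) anchors.elementAt(i)).getLink());
            }
        }
    }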
    package com.openzone.search.spider;
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.HttpException;
    import org.apache.commons.httpclient.methods.GetMethod;
    import org.apache.commons.httpclient.params.HttpMethodParams;
    import org.htmlparser.Node;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.filters.NodeClassFilter;
    import org.htmlparser.filters.OrFilter;
    import org.htmlparser.tags.LinkTag;
    import org.htmlparser.util.NodeList;
    import org.htmlparser.util.ParserException;
    public class Spider {

        String[] seeds;    // seed URLs: the pages the crawler enters first
        String line;       // URL prefix the crawler follows, e.g. line="http://localhost" records only URLs starting with it
        String savepath;   // folder the fetched pages are stored in
        String encoding;   // character encoding the crawler uses

        public Spider(String[] seeds, String line, String savepath, String encoding) {
            this.seeds = seeds;
            this.line = line;
            this.savepath = savepath;
            this.encoding = encoding;
        }

        /**
         * Initialize the URL queue with the seeds.
         */
        public void init() {
            Set<String> seedsSet = new HashSet<String>();
            for (int i = 0; i < seeds.length; i++) {
                seedsSet.add(seeds[i]);
            }
            UrlTables.addToUnvisitedUrlSet(seedsSet);
        }

        public void run() throws HttpException, IOException, ParserException {
            init();
            for (int i = 0; i < 20; i++) {    // cap the crawl at 20 pages
                if (UrlTables.getUnvisitedUrl().size() != 0) {
                    String url = UrlTables.getFirstFromUnvisitedUrlSet();
                    catchPages(url);
                    UrlTables.addToVisitedUrlSet(url);
                    UrlTables.addToUnvisitedUrlSet(getUrls(url));
                }
            }
        }
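        The fixed 20 iterations are only a crawl cap; a sketch of an alternative loop body for run() that instead drains the frontier, using the emptiness check UrlTables already defines:

            // alternative loop: crawl until the frontier is empty
            while (!UrlTables.isUnvisitedUrlSetEmpty()) {
                String url = UrlTables.getFirstFromUnvisitedUrlSet();
                catchPages(url);
                UrlTables.addToVisitedUrlSet(url);
                UrlTables.addToUnvisitedUrlSet(getUrls(url));
            }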



        public void catchPages(String url) {
            String filename = null;
            HttpClient httpClient = new HttpClient();
            // connection timeout: 5s
            httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
            // create the GetMethod object and set its parameters
            GetMethod getMethod = new GetMethod(url);
            // socket read timeout for the GET request: 5s
            getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
            // retry handler for failed requests
            getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                    new DefaultHttpMethodRetryHandler());
            // response encoding
            getMethod.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, encoding);
            getMethod.addRequestHeader("Content-Type", "text/html; charset=UTF-8");
            // execute the HTTP GET request
            int statusCode;
            try {
                statusCode = httpClient.executeMethod(getMethod);
                System.out.println(statusCode);
                if (statusCode == 200) {
                    InputStream responseBody = getMethod.getResponseBodyAsStream();
                    filename = getFileNameByUrl(url, getMethod.getResponseHeader("Content-Type").getValue());
                    if (responseBody != null) {
                        saveToLocal(responseBody, filename);    // echoes each line to the console as it saves
                        System.out.println("get success");
                    }
                } else {
                    System.out.println("get false");
                }
            } catch (HttpException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // always return the connection to the manager
                getMethod.releaseConnection();
            }
        }


        /*
         * Save the page stream fetched by catchPages to local disk.
         */
        public void saveToLocal(InputStream responseBody, String filename) throws IOException {
            BufferedReader reader = new BufferedReader(new InputStreamReader(responseBody, encoding));
            File file = new File(savepath, filename);
            FileOutputStream fileOutputStream = new FileOutputStream(file);
            Writer writer = new OutputStreamWriter(fileOutputStream, encoding);
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
                writer.write(line);
                writer.write("\n");    // readLine() strips the line terminator, so restore it
            }
            writer.flush();
            writer.close();
        }

        /*
         * Parse the page at the given URL and collect the links on it.
         */
        public Set<String> getUrls(String url) throws ParserException {
            Set<String> links = new HashSet<String>();
            Parser parser = new Parser(url);
            parser.setEncoding(encoding);
            // matches <frame src=...> tags
            NodeFilter frameFilter = new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };
            // matches either <a> tags or <frame> tags
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) {
                    // handle <a>
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink();
                    if (LinkFilter(linkUrl)) {
                        links.add(linkUrl);
                    }
                } else {
                    // handle <frame>, e.g. <frame src="test.html">
                    String frameTxt = tag.getText();
                    int start = frameTxt.indexOf("src=");
                    frameTxt = frameTxt.substring(start);
                    int end = frameTxt.indexOf(" ");
                    if (end == -1) {
                        end = frameTxt.indexOf(">");
                    }
                    String frameUrl = frameTxt.substring(5, end - 1);
                    if (LinkFilter(frameUrl)) {
                        links.add(frameUrl);
                    }
                }
            }
            return links;
        }

        // the prefix rule the crawler follows when deciding which links to keep
        public boolean LinkFilter(String url) {
            return url.startsWith(line);
        }

        // sanitize the page's file name, otherwise saving it fails
        public String getFileNameByUrl(String url, String contentType) {
            // strip the leading "http://"
            url = url.substring(7);
            if (contentType.indexOf("html") != -1) {
                // text/html content
                return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
            } else {
                // other types: use the Content-Type subtype as the extension
                return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                        + contentType.substring(contentType.lastIndexOf("/") + 1);
            }
        }
    }
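    For instance, a hypothetical call to the sanitizer above:

        // getFileNameByUrl("http://localhost/openzone/index.jsp?id=1", "text/html; charset=UTF-8")
        // strips "http://" and replaces ? / : * | < > " with underscores:
        // -> "localhost_openzone_index.jsp_id=1.html"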



    // below is the class that stores the URL queues
    package com.openzone.search.spider;
    import java.util.HashSet;
    import java.util.LinkedList;
    import java.util.Set;
    public class UrlTables {

        private static Set<String> visitedUrlSet = new HashSet<String>();
        private static LinkedList<String> unvisitedUrlSet = new LinkedList<String>();

        public static Set<String> getVisitedUrl() {
            return visitedUrlSet;
        }

        public static void setVisitedUrl(Set<String> visitedUrl) {
            UrlTables.visitedUrlSet = visitedUrl;
        }

        public static LinkedList<String> getUnvisitedUrl() {
            return unvisitedUrlSet;
        }

        public static void setUnvisitedUrl(LinkedList<String> unvisitedUrl) {
            UrlTables.unvisitedUrlSet = unvisitedUrl;
        }

        public static void addToVisitedUrlSet(String url) {
            visitedUrlSet.add(url);
        }

        public static boolean isUnvisitedUrlSetEmpty() {
            return unvisitedUrlSet.isEmpty();
        }

        // queue a batch of URLs, skipping any already visited or already queued
        public static void addToUnvisitedUrlSet(Set<String> urls) {
            for (String url : urls) {
                if (!isVisited(url) && !unvisitedUrlSet.contains(url)) {
                    unvisitedUrlSet.add(url);
                }
            }
        }

        public static boolean isVisited(String url) {
            return visitedUrlSet.contains(url);
        }

        // pop the head of the unvisited queue
        public static String getFirstFromUnvisitedUrlSet() {
            return unvisitedUrlSet.removeFirst();
        }
    }
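    Because unvisitedUrlSet is a LinkedList consumed from the head, the frontier behaves as a FIFO queue and the crawl proceeds breadth-first. A hypothetical sketch of the class in isolation (the demo class name is made up):

        package com.openzone.search.spider;
        import java.util.HashSet;
        import java.util.Set;
        public class UrlTablesDemo {
            public static void main(String[] args) {
                Set<String> found = new HashSet<String>();
                found.add("http://localhost/a.html");
                found.add("http://localhost/b.html");
                UrlTables.addToUnvisitedUrlSet(found);                  // both queued: neither is visited yet
                String next = UrlTables.getFirstFromUnvisitedUrlSet();  // dequeued in FIFO order
                UrlTables.addToVisitedUrlSet(next);                     // marked visited
                UrlTables.addToUnvisitedUrlSet(found);                  // no-op: one is visited, the other already queued
            }
        }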


    // below we instantiate the crawler and set it to work
    package com.openzone.search.spider;
    import java.io.IOException;
    import org.apache.commons.httpclient.HttpException;
    import org.htmlparser.util.ParserException;
    public class SpiderRun {

        public static void main(String[] args) {
            String[] seeds = {"http://localhost/openzone/"};
            String line = "http://localhost";
            String savepath = "D:\\javaworkspace\\openzone";    // backslashes must be escaped in Java string literals
            String encoding = "utf-8";
            Spider spider = new Spider(seeds, line, savepath, encoding);
            try {
                spider.run();
            } catch (HttpException e) {
                e.printStackTrace();
            } catch (ParserException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
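    One caveat for running it: FileOutputStream does not create missing directories, so the savepath folder must exist before the crawl starts. A guard one could add at the top of main (requires importing java.io.File):

        new File(savepath).mkdirs();    // create the save folder if it does not exist yet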


  • Original post: https://www.cnblogs.com/xkzy/p/3816726.html