zoukankan      html  css  js  c++  java
  • 爬虫——博客实例

    // Rule.java: specifies the query url, request method and params.
    /**
     * Describes one query: the target URL, the request parameters, the HTTP
     * method to use and how the returned HTML should be filtered first.
     *
     * <p>This is a plain mutable bean; the arrays passed in are stored as-is
     * (no defensive copies are made).
     */
    public class Rule
    {
        /** Request method constant: GET. */
        public final static int GET = 0;
        /** Request method constant: POST. */
        public final static int POST = 1;

        /** Filter type constant: {@code resultTagName} is a class name. */
        public final static int CLASS = 0;
        /** Filter type constant: {@code resultTagName} is an element id. */
        public final static int ID = 1;
        /** Filter type constant: {@code resultTagName} is a selection expression. */
        public final static int SELECTION = 2;

        /** Link to query. */
        private String url;
        /** Names of the request parameters. */
        private String[] params;
        /** Values of the request parameters, index-aligned with {@link #params}. */
        private String[] values;
        /**
         * Tag/identifier used for the first round of filtering on the returned
         * HTML; how it is interpreted depends on {@link #type}, so set the type first.
         */
        private String resultTagName;
        /** How {@link #resultTagName} is interpreted: CLASS / ID / SELECTION. Defaults to ID. */
        private int type = ID;
        /** Request method: GET / POST. Defaults to GET. */
        private int requestMethod = GET;

        /** Creates an empty rule with the default type (ID) and request method (GET). */
        public Rule()
        {
        }

        /**
         * Creates a fully populated rule.
         *
         * @param url           link to query
         * @param params        request parameter names
         * @param values        request parameter values, index-aligned with {@code params}
         * @param resultTagName identifier used to filter the returned HTML
         * @param type          one of {@link #CLASS}, {@link #ID}, {@link #SELECTION}
         * @param requestMethod one of {@link #GET}, {@link #POST}
         */
        public Rule(String url, String[] params, String[] values, String resultTagName, int type, int requestMethod)
        {
            this.url = url;
            // The parameter used to be named "param", which turned this line
            // into a self-assignment and silently dropped the argument.
            this.params = params;
            this.values = values;
            this.resultTagName = resultTagName;
            this.type = type;
            this.requestMethod = requestMethod;
        }

        /** @return the link to query */
        public String getUrl()
        {
            return url;
        }

        /** @param url the link to query */
        public void setUrl(String url)
        {
            this.url = url;
        }

        /** @return the request parameter names (the stored array itself) */
        public String[] getParams()
        {
            return params;
        }

        /** @param params the request parameter names */
        public void setParams(String[] params)
        {
            this.params = params;
        }

        /** @return the request parameter values (the stored array itself) */
        public String[] getValues()
        {
            return values;
        }

        /** @param values the request parameter values */
        public void setValues(String[] values)
        {
            this.values = values;
        }

        /** @return the identifier used to filter the returned HTML */
        public String getResultTagName()
        {
            return resultTagName;
        }

        /** @param resultTagName the identifier used to filter the returned HTML */
        public void setResultTagName(String resultTagName)
        {
            this.resultTagName = resultTagName;
        }

        /** @return the filter type: CLASS, ID or SELECTION */
        public int getType()
        {
            return type;
        }

        /** @param type the filter type: CLASS, ID or SELECTION */
        public void setType(int type)
        {
            this.type = type;
        }

        /** @return the request method: GET or POST */
        public int getRequestMethod()
        {
            return requestMethod;
        }

        /**
         * Sets the request method. The setter previously took no argument and
         * assigned the field to itself, so the method could never be changed.
         *
         * @param requestMethod the request method: GET or POST
         */
        public void setRequestMethod(int requestMethod)
        {
            this.requestMethod = requestMethod;
        }
    }

    /**
     * Data object holding what is collected for a single link.
     *
     * <p>Plain mutable bean; every field has a getter and a setter.
     */
    public class LinkTypeData {
        private int id;
        /** Address (href) of the link. */
        private String linkHref;
        /** Title (text) of the link. */
        private String linkText;
        /** Summary. */
        private String summary;
        /** Content. */
        private String content;

        /** @return the id */
        public int getId()
        {
            return id;
        }

        /** @param id the id */
        public void setId(int id)
        {
            this.id = id;
        }

        /** @return the address of the link */
        public String getLinkHref()
        {
            return linkHref;
        }

        /** @param linkHref the address of the link */
        public void setLinkHref(String linkHref)
        {
            this.linkHref = linkHref;
        }

        /**
         * Accessor that was missing for the {@code linkText} field.
         *
         * @return the title of the link
         */
        public String getLinkText()
        {
            return linkText;
        }

        /**
         * Mutator that was missing for the {@code linkText} field.
         *
         * @param linkText the title of the link
         */
        public void setLinkText(String linkText)
        {
            this.linkText = linkText;
        }

        /** @return the summary */
        public String getSummary()
        {
            return summary;
        }

        /** @param summary the summary */
        public void setSummary(String summary)
        {
            this.summary = summary;
        }

        /** @return the content */
        public String getContent()
        {
            return content;
        }

        /** @param content the content */
        public void setContent(String content)
        {
            this.content = content;
        }

    }

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    //核心的查询类
    public class ExtractService {
    public static List<LinkTypeData> extract(Rule rule)//<>是泛型,里面指定了这个集合中存放的是什么数据
    {
    //对rule必要检验
    validateRule(rule);
    List<LinkTypeData> datas=new ArrayList<LinkTypeData>();
    LinkTypeData data=null;
    try
    {
    //解析rule
    String url=rule.getUrl();
    String[] params=rule.getParams();
    String[] values=rule.getValues();
    String resultTagName=rule.getResultTagName();
    int type=rule.getType;
    int requestType=rule.getRequestMethod();
    connection conn=Jsoup.connect(url);//Jsoup.connect解析url网站地址
    //设置查询参数
    if(params!=null)
    {
    for(int i=0;i<params.length;i++)
    {
    conn.data(params[i],values[i]);
    }
    }
    //设置请求类型
    Document doc=null;
    switch (requestType)
    {
    case Rule.GET:
    doc=conn.timeout(100000).get();
    break;
    case Rule.Post:
    doc=conn.timeout(100000).post();
    break;
    }
    //处理返回数据
    Elements results=new Elements();
    switch(type)
    {
    case Rule.CLASS:
    results=doc.getElementsByClass(resultTagName);
    break;
    case Rule.ID:
    Element result=doc.getElementById(resultTagName);
    results.add(result);
    break;
    case Rule.SELECTION:
    results=doc.select(resultTagName)
    break;
    default;
    //当resultTagName为空时默认去body标签
    if(TextUtil.isEmpty(resultTagName))
    {
    results=doc.getElementsByTag("body");
    }
    }
    for(Element result:results)
    {
    Elements links=result.getElementsByTag("a");

    for(Element link:links)
    {
    //必要的筛选
    String linkHref=link.attr("href");
    String linkText=link.text();
    data=new LinkTypeData();
    data.setLinkHref(linkHref);
    data.setLinkText(linkText);
    datas.add(data);
    }
    }
    }catch(IOException e)
    {
    e.printStackTrace();
    }
    return datas;
    }

    //传入参数必要检验
    private static void validateRule(Rule rule)
    {
    String url=rule.getUrl();
    if(TextUtil.isEmpty(url))
    {
    throw new RuleException("url不能为空!");
    }
    if(!url.startsWith("http://"))
    {
    throw new RuleException("url格式不正确");
    }
    if(rule.getParams()!=null&&rule.getValues()!=null)
    {
    if(rule.getParams().length!=rule.getvalues().length)
    {
    throw new RuleException("参数键值对个数不匹配");
    }

    }
    }
    }

  • 相关阅读:
    A
    N
    M
    L
    K
    J
    sass
    通过ps给透明通道的图片添加灰度(适用于需要兼容IE7,效果很好)
    CSS十一问——好奇心+刨根问底=CSSer
    清除浮动的7种方法
  • 原文地址:https://www.cnblogs.com/sunshinewxz/p/4430355.html
Copyright © 2011-2022 走看看