zoukankan      html  css  js  c++  java
  • nutch源代码html的头信息解析

    主要解析 meta、base 等标签的信息

    /**
    * Licensed to the Apache Software Foundation (ASF) under one or more
    * contributor license agreements. See the NOTICE file distributed with
    * this work for additional information regarding copyright ownership.
    * The ASF licenses this file to You under the Apache License, Version 2.0
    * (the "License"); you may not use this file except in compliance with
    * the License. You may obtain a copy of the License at
    *
    *     http://www.apache.org/licenses/LICENSE-2.0
    *
    * Unless required by applicable law or agreed to in writing, software
    * distributed under the License is distributed on an "AS IS" BASIS,
    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    * See the License for the specific language governing permissions and
    * limitations under the License.
    */

    package org.apache.nutch.parse.html;

    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.Locale;

    import org.apache.nutch.parse.HTMLMetaTags;
    import org.w3c.dom.*;

    /**
     * Class for parsing META directives from DOM trees. It handles Robots META
     * directives (none, noindex, nofollow, noarchive), BASE HREF tags, and the
     * HTTP-EQUIV "pragma: no-cache" and "refresh" instructions. Everything found
     * is stored in the supplied {@link HTMLMetaTags} instance.
     *
     * <p>Tag names, attribute names and directive keywords are matched without
     * regard to case and independently of the JVM default locale.
     */
    public class HTMLMetaProcessor {

        /**
         * Resets <code>metaTags</code> and then fills it from the META and BASE
         * elements found under <code>node</code>.
         *
         * @param metaTags receiver for the parsed directives; it is reset first
         * @param node     root of the DOM (sub)tree to scan
         * @param currURL  URL of the page being parsed; used to resolve relative
         *                 refresh and base URLs and as the refresh target when a
         *                 refresh directive names none. May be <code>null</code>,
         *                 in which case only absolute URLs can be resolved.
         */
        public static final void getMetaTags(
                HTMLMetaTags metaTags, Node node, URL currURL) {

            metaTags.reset();
            getMetaTagsHelper(metaTags, node, currURL);
        }

        /**
         * Recursively visits <code>node</code> and its descendants, handing META
         * and BASE elements to their handlers. A BODY element and everything
         * below it is skipped.
         */
        private static final void getMetaTagsHelper(
                HTMLMetaTags metaTags, Node node, URL currURL) {

            if (node.getNodeType() == Node.ELEMENT_NODE) {
                String tagName = node.getNodeName();

                if ("body".equalsIgnoreCase(tagName)) {
                    // META tags should not be under body
                    return;
                }

                if ("meta".equalsIgnoreCase(tagName)) {
                    processMeta(metaTags, node, currURL);
                } else if ("base".equalsIgnoreCase(tagName)) {
                    processBase(metaTags, node, currURL);
                }
            }

            NodeList children = node.getChildNodes();
            if (children != null) {
                int len = children.getLength();
                for (int i = 0; i < len; i++) {
                    getMetaTagsHelper(metaTags, children.item(i), currURL);
                }
            }
        }

        /**
         * Handles one META element, for example
         * <code>&lt;meta name="keywords" content="HTML,ASP,PHP,SQL"&gt;</code> or
         * <code>&lt;meta http-equiv="Refresh" content="5;url=http://example.com/"&gt;</code>.
         * A META element without a <code>content</code> attribute is ignored.
         */
        private static void processMeta(
                HTMLMetaTags metaTags, Node meta, URL currURL) {

            NamedNodeMap attrs = meta.getAttributes();
            Node nameNode = null;
            Node equivNode = null;
            Node contentNode = null;

            // Retrieve the name, http-equiv and content attributes. equalsIgnoreCase
            // is used instead of toLowerCase() so that e.g. "HTTP-EQUIV" is still
            // recognised under a Turkish default locale.
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                String attrName = attr.getNodeName();
                if ("name".equalsIgnoreCase(attrName)) {
                    nameNode = attr;
                } else if ("http-equiv".equalsIgnoreCase(attrName)) {
                    equivNode = attr;
                } else if ("content".equalsIgnoreCase(attrName)) {
                    contentNode = attr;
                }
            }

            if (contentNode == null) {
                return;
            }
            String content = contentNode.getNodeValue();

            if (nameNode != null) {
                String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
                metaTags.getGeneralTags().setProperty(name, content);
                if ("robots".equals(name)) {
                    applyRobotsDirectives(metaTags, content);
                }
            }

            if (equivNode != null) {
                String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
                metaTags.getHttpEquivTags().setProperty(name, content);
                if ("pragma".equals(name)) {
                    if (content.toLowerCase(Locale.ROOT).contains("no-cache")) {
                        metaTags.setNoCache();
                    }
                } else if ("refresh".equals(name)) {
                    applyRefresh(metaTags, content, currURL);
                }
            }
        }

        /**
         * Applies the directives of a <code>&lt;meta name="robots"&gt;</code> tag.
         * Keywords are detected by substring search in the lower-cased content:
         * "none" sets both no-index and no-follow, "noindex" and "nofollow" set
         * the respective flag, and "noarchive" sets the no-cache flag. "all" is
         * accepted but changes nothing.
         */
        private static void applyRobotsDirectives(
                HTMLMetaTags metaTags, String content) {

            String directives = content.toLowerCase(Locale.ROOT);

            if (directives.contains("none")) {
                metaTags.setNoIndex();
                metaTags.setNoFollow();
            }
            if (directives.contains("noindex")) {
                metaTags.setNoIndex();
            }
            if (directives.contains("nofollow")) {
                metaTags.setNoFollow();
            }
            if (directives.contains("noarchive")) {
                metaTags.setNoCache();
            }
        }

        /**
         * Applies a <code>&lt;meta http-equiv="refresh"&gt;</code> directive whose
         * content has the form <code>"&lt;seconds&gt;[;[url=]&lt;target&gt;]"</code>.
         * If the delay is not an integer the whole directive is ignored, so a
         * malformed tag never alters state recorded from an earlier valid one.
         */
        private static void applyRefresh(
                HTMLMetaTags metaTags, String content, URL currURL) {

            int semicolon = content.indexOf(';');
            String time = (semicolon == -1) ? content : content.substring(0, semicolon);

            int seconds;
            try {
                // Tolerate padding such as "5 ; url=..."
                seconds = Integer.parseInt(time.trim());
            } catch (NumberFormatException ignored) {
                // Not a usable refresh directive: skip it entirely.
                return;
            }
            metaTags.setRefreshTime(seconds);
            metaTags.setRefresh(true);

            URL refreshUrl = null;
            if (semicolon != -1) {
                // Search the original string (not a lower-cased copy) so the offset
                // is guaranteed to be valid for the substring below.
                int urlKey = indexOfIgnoreCase(content, "url=");
                // No "url=" marker: assume a mis-formatted entry with just the url.
                int start = (urlKey == -1) ? semicolon + 1 : urlKey + 4;
                String target = content.substring(start);
                try {
                    refreshUrl = new URL(target);
                } catch (MalformedURLException notAbsolute) {
                    // According to the spec this has to be an absolute URL. However,
                    // many websites use relative URLs and expect browsers to handle
                    // that, so resolve against the current page. Note that in some
                    // cases this may create infinitely recursive paths (a crawler
                    // trap).
                    try {
                        refreshUrl = new URL(currURL, target);
                    } catch (MalformedURLException ignored) {
                        refreshUrl = null;
                    }
                }
            }

            if (refreshUrl == null) {
                // Only the refresh time was usable: refresh to the same URL.
                refreshUrl = currURL;
            }
            metaTags.setRefreshHref(refreshUrl);
        }

        /**
         * Handles a BASE element: resolves its <code>href</code> attribute
         * (against <code>currURL</code> when that is not null) and records it as
         * the base href. An unparsable href is ignored.
         */
        private static void processBase(
                HTMLMetaTags metaTags, Node base, URL currURL) {

            Node hrefNode = findAttribute(base.getAttributes(), "href");
            if (hrefNode == null) {
                return;
            }
            String urlString = hrefNode.getNodeValue();

            URL url = null;
            try {
                if (currURL == null) {
                    url = new URL(urlString);
                } else {
                    url = new URL(currURL, urlString);
                }
            } catch (MalformedURLException ignored) {
                // Leave url null: a broken BASE href is simply not recorded.
            }

            if (url != null) {
                metaTags.setBaseHref(url);
            }
        }

        /**
         * Returns the attribute called <code>name</code>, trying an exact lookup
         * first and then a case-insensitive scan, or <code>null</code> if there
         * is none.
         */
        private static Node findAttribute(NamedNodeMap attrs, String name) {
            Node exact = attrs.getNamedItem(name);
            if (exact != null) {
                return exact;
            }
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                if (name.equalsIgnoreCase(attr.getNodeName())) {
                    return attr;
                }
            }
            return null;
        }

        /**
         * Returns the index of the first case-insensitive occurrence of
         * <code>needle</code> in <code>text</code>, or -1 if there is none.
         */
        private static int indexOfIgnoreCase(String text, String needle) {
            int needleLength = needle.length();
            int lastStart = text.length() - needleLength;
            for (int i = 0; i <= lastStart; i++) {
                if (text.regionMatches(true, i, needle, 0, needleLength)) {
                    return i;
                }
            }
            return -1;
        }

    }



  • 相关阅读:
    Machine learning 第8周编程作业 K-means and PCA
    Machine learning 第7周编程作业 SVM
    Machine learning第6周编程作业
    Machine learning 第5周编程作业
    小M的作物 最小割最大流
    k-近邻算法 python实现
    编辑距离 区间dp
    Machine learning第四周code 编程作业
    MDK5报错missing closing quote
    HDU 5512
  • 原文地址:https://www.cnblogs.com/serendipity/p/2254229.html
Copyright © 2011-2022 走看看