zoukankan      html  css  js  c++  java
  • nutch从网页中提取字段并索引_HtmlParseFilter

    package org.apache.nutch.htmlfilter.my;
    
    import java.nio.charset.Charset;
    import java.util.regex.*;
    
    import org.apache.commons.logging.Log;
    import org.apache.commons.logging.LogFactory;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.crawl.Crawl;
    import org.apache.nutch.metadata.Metadata;
    import org.apache.nutch.parse.HTMLMetaTags;
    import org.apache.nutch.parse.HtmlParseFilter;
    import org.apache.nutch.parse.Parse;
    import org.apache.nutch.parse.ParseResult;
    import org.apache.nutch.protocol.Content;
    import org.w3c.dom.DocumentFragment;
    
    public class MyHtmlParseFilter implements HtmlParseFilter {
    
        public static final Log LOG = LogFactory.getLog(MyHtmlParseFilter.class);
        
        private Configuration conf;
    
        private Pattern p_p_title = Pattern
                .compile("<span .+class=\"b14c\">(.*?)</span>");
    
        private Pattern p_p_article = Pattern
                .compile("<td .*class=\"h14\".*>([\\s\\S]+?)</td>");
    
        private Pattern p_p_pubdate = Pattern
                .compile("<font class=\"h12\">发布时间:(.*)</font>");
    
        public ParseResult filter(Content content, ParseResult parseResult,
                HTMLMetaTags metaTags, DocumentFragment doc) {
            Parse parse = parseResult.get(content.getUrl());
            Metadata md = parse.getData().getParseMeta();
    
            try {
                // 抽取字段 正文信息示例
                String html = new String(content.getContent());
    
                String title = extract(html, p_p_title);
                String article = extract(html, p_p_article);
                String site = "中国公路信息网|行业动态|新通车信息";
                String pubdate_1 = extract(html, p_p_pubdate);
                String pubdate = pubdate_1.replace('年', '-').replace('月', '-')
                        .replace("日", "");
                String refurl = null;
                String cate = "1234567";
    
                md.add("p_title", title);
                md.add("p_article", article);
                md.add("p_site", site);
                md.add("p_pubdate", pubdate);
                md.add("p_refurl", refurl);
                md.add("p_cate", cate);
            } catch (Exception e) {
                LOG.info(e.getMessage());
            }
    
            return parseResult;
        }
    
        private String extract(String html, Pattern p) {
            Matcher match = p.matcher(html);
            String val = null;
            while (match.find()) {
                val = match.group(1);
                if (val != null) {
                    val = val.trim();
                }
            }
            return val;
        }
    
        public Configuration getConf() {
            return this.conf;
        }
    
        public void setConf(Configuration conf) {
            this.conf = conf;
        }
    
    }
  • 相关阅读:
    css学习_css3伸缩布局 flex布局
    css学习_css3旋转的图片
    css学习_css3过渡
    css学习_css伪元素的本质
    css学习_css精灵技术、字体图标
    css学习_css用户界面样式
    Python 的 with 语句
    KNN--Python实现
    Python中NumPy(axis=0 与axis=1)
    Python中escape和unescape
  • 原文地址:https://www.cnblogs.com/i80386/p/2723969.html
Copyright © 2011-2022 走看看