zoukankan      html  css  js  c++  java
  • 基于标签路径特征系的网页文章正文提取

    算法来源

    JAVA实现

    public class ContentExtractor {
        public static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class);
    
        protected Document doc;
    
        ContentExtractor(Document doc) {
            this.doc = doc;
        }
    
        protected HashMap<Element, CountInfo> infoMap = new HashMap<Element, CountInfo>();
    
        class CountInfo {
    
            int textCount = 0;
            int linkTextCount = 0;
            int tagCount = 0;
            int linkTagCount = 0;
            double density = 0;
            double densitySum = 0;
            double score = 0;
            int pCount = 0;
            ArrayList<Integer> leafList = new ArrayList<Integer>();
    
        }
    
        public static class News {
            protected String url = null;
            protected String title = null;
            protected String content = null;
            protected String time = null;
    
            protected Element contentElement = null;
    
            public String getUrl() {
                return url;
            }
    
            public void setUrl(String url) {
                this.url = url;
            }
    
            public String getTitle() {
                return title;
            }
    
            public void setTitle(String title) {
                this.title = title;
            }
    
            public String getContent() {
                if (content == null) {
                    if (contentElement != null) {
                        content = contentElement.text();
                    }
                }
                return content;
            }
    
            public void setContent(String content) {
                this.content = content;
            }
    
            public String getTime() {
                return time;
            }
    
            public void setTime(String time) {
                this.time = time;
            }
    
            @Override
            public String toString() {
                return "URL:
    " + url + "
    TITLE:
    " + title + "
    TIME:
    " + time + "
    CONTENT:
    " + getContent() + "
    CONTENT(SOURCE):
    " + contentElement;
            }
    
            public Element getContentElement() {
                return contentElement;
            }
    
            public void setContentElement(Element contentElement) {
                this.contentElement = contentElement;
            }
    
        }
    
        protected void clean() {
            doc.select("script,noscript,style,iframe,br,svg").remove();
        }
    
        protected CountInfo computeInfo(Node node) {
    
            if (node instanceof Element) {
                Element tag = (Element) node;
    
                CountInfo countInfo = new CountInfo();
                for (Node childNode : tag.childNodes()) {
                    CountInfo childCountInfo = computeInfo(childNode);
                    countInfo.textCount += childCountInfo.textCount;
                    countInfo.linkTextCount += childCountInfo.linkTextCount;
                    countInfo.tagCount += childCountInfo.tagCount;
                    countInfo.linkTagCount += childCountInfo.linkTagCount;
                    countInfo.leafList.addAll(childCountInfo.leafList);
                    countInfo.densitySum += childCountInfo.density;
                    countInfo.pCount += childCountInfo.pCount;
                }
                countInfo.tagCount++;
                String tagName = tag.tagName();
                if (tagName.equals("a")) {
                    countInfo.linkTextCount = countInfo.textCount;
                    countInfo.linkTagCount++;
                } else if (tagName.equals("p")) {
                    countInfo.pCount++;
                }
    
                int pureLen = countInfo.textCount - countInfo.linkTextCount;
                int len = countInfo.tagCount - countInfo.linkTagCount;
                if (pureLen == 0 || len == 0) {
                    countInfo.density = 0;
                } else {
                    countInfo.density = (pureLen + 0.0) / len;
                }
    
                infoMap.put(tag, countInfo);
    
                return countInfo;
            } else if (node instanceof TextNode) {
                TextNode tn = (TextNode) node;
                CountInfo countInfo = new CountInfo();
                String text = tn.text();
                int len = text.length();
                countInfo.textCount = len;
                countInfo.leafList.add(len);
                return countInfo;
            } else {
                return new CountInfo();
            }
        }
    
        protected double computeScore(Element tag) {
            CountInfo countInfo = infoMap.get(tag);
            double var = Math.sqrt(computeVar(countInfo.leafList) + 1);
            double score = Math.log(var) * countInfo.densitySum * Math.log(countInfo.textCount - countInfo.linkTextCount + 1) * Math.log10(countInfo.pCount + 2);
            return score;
        }
    
        protected double computeVar(ArrayList<Integer> data) {
            if (data.size() == 0) {
                return 0;
            }
            if (data.size() == 1) {
                return data.get(0) / 2;
            }
            double sum = 0;
            for (Integer i : data) {
                sum += i;
            }
            double ave = sum / data.size();
            sum = 0;
            for (Integer i : data) {
                sum += (i - ave) * (i - ave);
            }
            sum = sum / data.size();
            return sum;
        }
    
        public Element getContentElement() throws Exception {
            clean();
            computeInfo(doc.body());
            double maxScore = 0;
            Element content = null;
            for (Map.Entry<Element, CountInfo> entry : infoMap.entrySet()) {
                Element tag = entry.getKey();
                if (tag.tagName().equals("a") || tag == doc.body()) {
                    continue;
                }
                double score = computeScore(tag);
                if (score > maxScore) {
                    maxScore = score;
                    content = tag;
                }
            }
            if (content == null) {
                System.err.println(doc.body().outerHtml());
                throw new Exception("extraction failed");
            }
            return content;
        }
    
        public News getNews() throws Exception {
            News news = new News();
            Element contentElement;
            try {
                contentElement = getContentElement();
                news.setContentElement(contentElement);
            } catch (Exception ex) {
                LOG.info("news content extraction failed,extraction abort", ex);
                throw new Exception(ex);
            }
    
            if (doc.baseUri() != null) {
                news.setUrl(doc.baseUri());
            }
    
            try {
                news.setTime(getTime(contentElement));
            } catch (Exception ex) {
                LOG.info("news title extraction failed", ex);
            }
    
            try {
                news.setTitle(getTitle(contentElement));
            } catch (Exception ex) {
                LOG.info("title extraction failed", ex);
            }
            return news;
        }
    
        protected String getTime(Element contentElement) throws Exception {
            String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[1-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
            Pattern pattern = Pattern.compile(regex);
            Element current = contentElement;
            for (int i = 0; i < 2; i++) {
                if (current != null && current != doc.body()) {
                    Element parent = current.parent();
                    if (parent != null) {
                        current = parent;
                    }
                }
            }
            for (int i = 0; i < 6; i++) {
                if (current == null) {
                    break;
                }
                String currentHtml = current.outerHtml();
                Matcher matcher = pattern.matcher(currentHtml);
                if (matcher.find()) {
                    return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3) + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
                }
                if (current != doc.body()) {
                    current = current.parent();
                }
            }
    
            try {
                return getDate(contentElement);
            } catch (Exception ex) {
                throw new Exception("time not found");
            }
    
        }
    
        protected String getDate(Element contentElement) throws Exception {
            String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})";
            Pattern pattern = Pattern.compile(regex);
            Element current = contentElement;
            for (int i = 0; i < 2; i++) {
                if (current != null && current != doc.body()) {
                    Element parent = current.parent();
                    if (parent != null) {
                        current = parent;
                    }
                }
            }
            for (int i = 0; i < 6; i++) {
                if (current == null) {
                    break;
                }
                String currentHtml = current.outerHtml();
                Matcher matcher = pattern.matcher(currentHtml);
                if (matcher.find()) {
                    return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3);
                }
                if (current != doc.body()) {
                    current = current.parent();
                }
            }
            throw new Exception("date not found");
        }
    
        protected double strSim(String a, String b) {
            int len1 = a.length();
            int len2 = b.length();
            if (len1 == 0 || len2 == 0) {
                return 0;
            }
            double ratio;
            if (len1 > len2) {
                ratio = (len1 + 0.0) / len2;
            } else {
                ratio = (len2 + 0.0) / len1;
            }
            if (ratio >= 3) {
                return 0;
            }
            return (lcs(a, b) + 0.0) / Math.max(len1, len2);
        }
    
        protected String getTitle(final Element contentElement) throws Exception {
            final ArrayList<Element> titleList = new ArrayList<Element>();
            final ArrayList<Double> titleSim = new ArrayList<Double>();
            final AtomicInteger contentIndex = new AtomicInteger();
            final String metaTitle = doc.title().trim();
            if (!metaTitle.isEmpty()) {
                doc.body().traverse(new NodeVisitor() {
                    @Override
                    public void head(Node node, int i) {
                        if (node instanceof Element) {
                            Element tag = (Element) node;
                            if (tag == contentElement) {
                                contentIndex.set(titleList.size());
                                return;
                            }
                            String tagName = tag.tagName();
                            if (Pattern.matches("h[1-6]", tagName)) {
                                String title = tag.text().trim();
                                double sim = strSim(title, metaTitle);
                                titleSim.add(sim);
                                titleList.add(tag);
                            }
                        }
                    }
    
                    @Override
                    public void tail(Node node, int i) {
                    }
                });
                int index = contentIndex.get();
                if (index > 0) {
                    double maxScore = 0;
                    int maxIndex = -1;
                    for (int i = 0; i < index; i++) {
                        double score = (i + 1) * titleSim.get(i);
                        if (score > maxScore) {
                            maxScore = score;
                            maxIndex = i;
                        }
                    }
                    if (maxIndex != -1) {
                        return titleList.get(maxIndex).text();
                    }
                }
            }
    
            Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
            if (titles.size() > 0) {
                String title = titles.first().text();
                if (title.length() > 5 && title.length()<40) {
                    return titles.first().text();
                }
            }
            try {
                return getTitleByEditDistance(contentElement);
            } catch (Exception ex) {
                throw new Exception("title not found");
            }
    
        }
    
        protected String getTitleByEditDistance(Element contentElement) throws Exception {
            final String metaTitle = doc.title();
    
            final ArrayList<Double> max = new ArrayList<Double>();
            max.add(0.0);
            final StringBuilder sb = new StringBuilder();
            doc.body().traverse(new NodeVisitor() {
    
                public void head(Node node, int i) {
    
                    if (node instanceof TextNode) {
                        TextNode tn = (TextNode) node;
                        String text = tn.text().trim();
                        double sim = strSim(text, metaTitle);
                        if (sim > 0) {
                            if (sim > max.get(0)) {
                                max.set(0, sim);
                                sb.setLength(0);
                                sb.append(text);
                            }
                        }
    
                    }
                }
    
                public void tail(Node node, int i) {
                }
            });
            if (sb.length() > 0) {
                return sb.toString();
            }
            throw new Exception();
    
        }
    
        protected int lcs(String x, String y) {
    
            int M = x.length();
            int N = y.length();
            if (M == 0 || N == 0) {
                return 0;
            }
            int[][] opt = new int[M + 1][N + 1];
    
            for (int i = M - 1; i >= 0; i--) {
                for (int j = N - 1; j >= 0; j--) {
                    if (x.charAt(i) == y.charAt(j)) {
                        opt[i][j] = opt[i + 1][j + 1] + 1;
                    } else {
                        opt[i][j] = Math.max(opt[i + 1][j], opt[i][j + 1]);
                    }
                }
            }
    
            return opt[0][0];
    
        }
    
        protected int editDistance(String word1, String word2) {
            int len1 = word1.length();
            int len2 = word2.length();
    
            int[][] dp = new int[len1 + 1][len2 + 1];
    
            for (int i = 0; i <= len1; i++) {
                dp[i][0] = i;
            }
    
            for (int j = 0; j <= len2; j++) {
                dp[0][j] = j;
            }
    
            for (int i = 0; i < len1; i++) {
                char c1 = word1.charAt(i);
                for (int j = 0; j < len2; j++) {
                    char c2 = word2.charAt(j);
    
                    if (c1 == c2) {
                        dp[i + 1][j + 1] = dp[i][j];
                    } else {
                        int replace = dp[i][j] + 1;
                        int insert = dp[i][j + 1] + 1;
                        int delete = dp[i + 1][j] + 1;
    
                        int min = replace > insert ? insert : replace;
                        min = delete > min ? min : delete;
                        dp[i + 1][j + 1] = min;
                    }
                }
            }
    
            return dp[len1][len2];
        }
    
        /*输入Jsoup的Document,获取正文所在Element*/
        public static Element getContentElementByDoc(Document doc) throws Exception {
            ContentExtractor ce = new ContentExtractor(doc);
            return ce.getContentElement();
        }
    
        /*输入HTML,获取正文所在Element*/
        public static Element getContentElementByHtml(String html) throws Exception {
            Document doc = Jsoup.parse(html);
            return getContentElementByDoc(doc);
        }
    
        /*输入HTML和URL,获取正文所在Element*/
        public static Element getContentElementByHtml(String html, String url) throws Exception {
            Document doc = Jsoup.parse(html, url);
            return getContentElementByDoc(doc);
        }
    
    //    public static OkHttpRequester okHttpRequester = new OkHttpRequester();
    
        /*输入URL,获取正文所在Element*/
        public static Element getContentElementByUrl(String url) throws Exception {
    //        HttpRequest request = new HttpRequest(url);
            String html = Jsoup.connect(url).get().outerHtml();
            return getContentElementByHtml(html, url);
        }
    
        /*输入Jsoup的Document,获取正文文本*/
        public static String getContentByDoc(Document doc) throws Exception {
            ContentExtractor ce = new ContentExtractor(doc);
            return ce.getContentElement().text();
        }
    
        /*输入HTML,获取正文文本*/
        public static String getContentByHtml(String html) throws Exception {
            Document doc = Jsoup.parse(html);
            return getContentElementByDoc(doc).text();
        }
    
        /*输入HTML和URL,获取正文文本*/
        public static String getContentByHtml(String html, String url) throws Exception {
            Document doc = Jsoup.parse(html, url);
            return getContentElementByDoc(doc).text();
        }
    
        /*输入URL,获取正文文本*/
        public static String getContentByUrl(String url) throws Exception {
    //        HttpRequest request = new HttpRequest(url);
    //        String html = request.response().decode();
            String html = Jsoup.connect(url).get().outerHtml();
            return getContentByHtml(html, url);
        }
    
        /*输入Jsoup的Document,获取结构化新闻信息*/
        public static News getNewsByDoc(Document doc) throws Exception {
            ContentExtractor ce = new ContentExtractor(doc);
            return ce.getNews();
        }
    
        /*输入HTML,获取结构化新闻信息*/
        public static News getNewsByHtml(String html) throws Exception {
            Document doc = Jsoup.parse(html);
            return getNewsByDoc(doc);
        }
    
        /*输入HTML和URL,获取结构化新闻信息*/
        public static News getNewsByHtml(String html, String url) throws Exception {
            Document doc = Jsoup.parse(html, url);
            return getNewsByDoc(doc);
        }
    
        /*输入URL,获取结构化新闻信息*/
        public static News getNewsByUrl(String url) throws Exception {
    //        HttpRequest request = new HttpRequest(url);
    //        String html = request.response().decode();
            String html = Jsoup.connect(url).get().outerHtml();
            return getNewsByHtml(html, url);
        }
    }
    
    

    Maven依赖

    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    

    使用案例

    String html = "新闻网页js渲染后的dom结构";
    Document doc = Jsoup.parse(html);
    Element element = ContentExtractor.getContentElementByDoc(doc);
    String content = element.text();
    
  • 相关阅读:
    单例模式的三种写法
    ASP.NET如何下载大文件
    字符串是引用类型
    SQL 事务隔离级别
    Sql Server 锁
    设非主键为聚集索引
    C#如何使用SqlCacheDependency
    简易系统后台架构
    matlab cross 3*1 向量叉乘
    Matlab求齐次方程的解
  • 原文地址:https://www.cnblogs.com/xiaogblog/p/13921270.html
Copyright © 2011-2022 走看看