zoukankan      html  css  js  c++  java
  • FP-Tree Java实现(二):模板挖掘

    从下往上,使用循环+递归模式识别日志模板。

    package com.coshaho.fptree;
    
    import java.util.*;
    import java.util.stream.Collectors;
    
    /**
     * FP-tree used to mine frequent word sets ("log templates") with an FP-growth style
     * loop + recursion, working from the least frequent word upwards.
     *
     * <p>Each input line is filtered to the words whose weighted occurrence count reaches the
     * support threshold, ordered by descending count (ties broken by the word itself so that
     * every line uses the same order) and inserted as a path below the root. Nodes holding the
     * same word are chained through {@code FPNode#getNext()} so that all occurrences of a word
     * can be visited without walking the whole tree.
     *
     * @author coshaho
     * @since 2020/1/5
     */
    public class FPTree {
        // Tie-break between words with equal counts; null words sort first instead of failing.
        private static final Comparator<String> WORD_TIE_BREAK =
                Comparator.nullsFirst(Comparator.naturalOrder());

        // Root sentinel of the tree; it carries no real word.
        private final FPNode root = new FPNode("Root", -1);
        // Word -> first node of that word's link chain.
        private final Map<String, FPNode> firstNodeTable = new HashMap<>();
        // Word -> last node of that word's link chain (where the next node is appended).
        private final Map<String, FPNode> lastNodeTable = new HashMap<>();
        // Minimum weighted occurrence count for a word / word set to be kept.
        private final int support;
        // Frequent words of this tree with their total counts, in descending count order.
        private final List<FPNode> table = new ArrayList<>();

        /**
         * Builds the FP-tree.
         *
         * @param data    input lines, each a list of words; {@code null} is treated as no lines
         * @param count   weight (number of occurrences) of each line, index-aligned with
         *                {@code data}; {@code null} means every line has weight 1
         * @param support minimum count a word must reach to be kept in the tree
         */
        public FPTree(List<List<String>> data, List<Integer> count, int support) {
            this.support = support;
            if (null == data) {
                data = Collections.emptyList();
            }
            if (null == count) {
                count = Collections.nCopies(data.size(), 1);
            }
            Map<String, Integer> wordCount = getWordCount(data, count);
            Comparator<String> order = wordOrder(wordCount);
            getWordTable(wordCount, order);

            // The index runs over the ORIGINAL lines so that a line which is dropped because it
            // has no frequent word can never shift the weights of the lines after it.
            int i = 0;
            for (List<String> line : data) {
                List<String> sortedLine = sortLine(wordCount, order, line);
                if (!sortedLine.isEmpty()) {
                    insert(sortedLine, count.get(i));
                }
                i++;
            }
        }

        /** Prints the tree to standard output, one node per line. */
        public void print() {
            root.print(0);
        }

        /**
         * Mines the templates of this tree and appends them to {@code templates}.
         *
         * @param last      words already fixed by the enclosing recursion levels (the suffix that
         *                  is appended to every template found here); {@code null} or empty at
         *                  the top level
         * @param templates output list receiving the templates; must not be {@code null}
         */
        public void growth(List<String> last, List<LogTemplate> templates) {
            List<String> suffix = (null == last) ? Collections.<String>emptyList() : last;
            if (isSingleTree(this.root)) {
                getSingleTreeTemplate(suffix, templates);
            } else {
                getMultiTreeTemplate(suffix, templates);
            }
        }

        /** Inserts one already filtered and ordered line as a path below the root. */
        private void insert(List<String> line, int weight) {
            FPNode curNode = root;
            for (String word : line) {
                FPNode child = curNode.getChildren().get(word);
                if (null == child) {
                    // New node: attach it and append it to the word's link chain exactly once.
                    child = new FPNode(word, weight);
                    child.setFather(curNode);
                    curNode.getChildren().put(word, child);
                    link(word, child);
                } else {
                    child.increase(weight);
                }
                curNode = child;
            }
        }

        /** Appends a freshly created node to the link chain of its word. */
        private void link(String word, FPNode node) {
            FPNode tail = lastNodeTable.put(word, node);
            if (null == tail) {
                firstNodeTable.put(word, node);
            } else {
                tail.setNext(node);
            }
            node.setVisited(true);
        }

        /** Ordering shared by the word table and every line: count descending, then word. */
        private Comparator<String> wordOrder(final Map<String, Integer> wordCount) {
            Comparator<String> byCountDesc =
                    Comparator.comparing((String word) -> wordCount.get(word)).reversed();
            return byCountDesc.thenComparing(WORD_TIE_BREAK);
        }

        /** Fills {@link #table} with the frequent words, in the shared word order. */
        private void getWordTable(Map<String, Integer> wordCount, Comparator<String> order) {
            List<String> frequent = new ArrayList<>();
            for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
                if (entry.getValue() >= this.support) {
                    frequent.add(entry.getKey());
                }
            }
            frequent.sort(order);
            for (String word : frequent) {
                table.add(new FPNode(word, wordCount.get(word)));
            }
        }

        /** Sums, per word, the weights of all lines the word appears in. */
        private Map<String, Integer> getWordCount(List<List<String>> data, List<Integer> count) {
            Map<String, Integer> wordCount = new HashMap<>();
            int i = 0;
            for (List<String> line : data) {
                for (String word : line) {
                    wordCount.merge(word, count.get(i), Integer::sum);
                }
                i++;
            }
            return wordCount;
        }

        /** Keeps only the frequent words of one line and puts them in the shared word order. */
        private List<String> sortLine(Map<String, Integer> wordCount, Comparator<String> order,
                                      List<String> line) {
            return line.stream()
                    .filter(word -> wordCount.get(word) >= support)
                    .sorted(order)
                    .collect(Collectors.toList());
        }

        /**
         * Handles a tree that is a single path: every non-empty subset of the path, extended by
         * the suffix {@code last}, is a template candidate.
         */
        private void getSingleTreeTemplate(List<String> last, List<LogTemplate> templates) {
            // Collect the nodes of the single path from top to bottom.
            List<FPNode> path = new ArrayList<>();
            for (FPNode node = getFirstChild(root); null != node; node = getFirstChild(node)) {
                path.add(node);
            }
            for (LogTemplate template : getSonSet(path)) {
                // Keep only subsets whose count reaches the support threshold.
                if (template.getCount() >= support) {
                    template.getWords().addAll(last);
                    templates.add(template);
                }
            }
        }

        /**
         * Handles a branching tree: for every frequent word, from least to most frequent, emits
         * the template "word + suffix" and recurses into the conditional tree built from the
         * prefix paths of that word's nodes.
         */
        private void getMultiTreeTemplate(List<String> last, List<LogTemplate> templates) {
            // table is in descending order; walk it backwards instead of reversing the field so
            // that repeated growth() calls behave the same.
            for (int idx = table.size() - 1; idx >= 0; idx--) {
                FPNode node = table.get(idx);
                List<String> curWords = new ArrayList<>(last.size() + 1);
                curWords.add(node.getWord());
                curWords.addAll(last);

                // The word together with the current suffix is itself a template. It has to be
                // emitted at every recursion depth; the recursion below only yields templates
                // that contain at least one additional word.
                LogTemplate template = new LogTemplate();
                template.setCount(node.getCount());
                template.setWords(new ArrayList<>(curWords));
                templates.add(template);

                // Every node on the word's link chain contributes its prefix path (root
                // excluded), weighted by the node's own count.
                List<List<String>> data = new ArrayList<>();
                List<Integer> count = new ArrayList<>();
                for (FPNode link = this.firstNodeTable.get(node.getWord());
                     null != link; link = link.getNext()) {
                    List<String> prefix = new ArrayList<>();
                    for (FPNode up = link.getFather();
                         null != up && null != up.getFather(); up = up.getFather()) {
                        prefix.add(up.getWord());
                    }
                    // Collected bottom-up; restore top-down order.
                    Collections.reverse(prefix);
                    data.add(prefix);
                    count.add(link.getCount());
                }

                // Build the conditional tree from those prefix paths and mine it recursively.
                FPTree newTree = new FPTree(data, count, this.support);
                newTree.growth(curWords, templates);
            }
        }

        /**
         * Enumerates the non-empty subsets of a single path.
         *
         * @param wordCount nodes of the path, top to bottom
         * @return one template per subset; its count is the count of the deepest selected node
         */
        private List<LogTemplate> getSonSet(List<FPNode> wordCount) {
            int length = wordCount.size();
            if (length > 62) {
                // The bit mask below would overflow (and the result could not be stored anyway).
                throw new IllegalStateException("Path too long to enumerate subsets: " + length);
            }
            List<LogTemplate> result = new ArrayList<>();
            long end = 1L << length;
            // Each value of mark selects one subset: bit i set means node i is included.
            // Starting at 1 skips the empty subset.
            for (long mark = 1; mark < end; mark++) {
                LogTemplate template = new LogTemplate();
                for (int i = 0; i < length; i++) {
                    if (((1L << i) & mark) != 0) {
                        FPNode node = wordCount.get(i);
                        template.getWords().add(node.getWord());
                        // Later (deeper) nodes overwrite the count of earlier ones.
                        template.setCount(node.getCount());
                    }
                }
                // Subsets whose count is zero are discarded.
                if (template.getCount() != 0) {
                    result.add(template);
                }
            }
            return result;
        }

        /** Returns true when no node at or below {@code tree} has more than one child. */
        private boolean isSingleTree(FPNode tree) {
            FPNode node = tree;
            while (null != node && null != node.getChildren() && !node.getChildren().isEmpty()) {
                if (node.getChildren().size() > 1) {
                    return false;
                }
                node = getFirstChild(node);
            }
            return true;
        }

        /** Returns any one child of {@code tree}, or null when there is none. */
        private FPNode getFirstChild(FPNode tree) {
            if (null == tree || null == tree.getChildren() || tree.getChildren().isEmpty()) {
                return null;
            }
            return tree.getChildren().values().iterator().next();
        }

        /** Small demo: builds a tree from four lines and prints it and its templates. */
        public static void main(String[] args) {
            List<List<String>> data = new ArrayList<>();
            data.add(Arrays.asList("C", "A", "B"));
            data.add(Arrays.asList("A", "B", "D"));
            data.add(Arrays.asList("A", "B"));
            data.add(Arrays.asList("C", "E"));

            FPTree tree = new FPTree(data, null, 1);
            tree.print();
            List<LogTemplate> templates = new ArrayList<>();
            tree.growth(new ArrayList<>(), templates);
            for (LogTemplate template : templates) {
                template.print();
            }
        }
    }
    package com.coshaho.fptree;
    
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * FP树节点:仅考虑算法
     * @author coshaho
     * @since 2020/1/5
     */
    public class FPNode {
        // 单词
        private String word;
        // 单词出现次数
        private int count = 1;
        // 子节点
        private Map<String, FPNode> children = new HashMap<>();
        // 父节点
        private FPNode father;
        // 线索:指向下一个相同单词节点
        private FPNode next;
        // 是否有线索指向自己
        private boolean visited = false;
    
        public FPNode(String word, int count) {
            this.word = word;
            this.count = count;
        }
    
        public void increase(int i) {
            count += i;
        }
    
        public void print(int n) {
            for(int i = 0; i < n; i++) {
                if(i == n - 1) {
                    System.out.print("--");
                } else {
                    System.out.print("  ");
                }
            }
            System.out.println(word + ": " + count);
            for(FPNode child : children.values()) {
                child.print(n + 1);
            }
        }
    
        public String getWord() {
            return word;
        }
    
        public void setWord(String word) {
            this.word = word;
        }
    
        public int getCount() {
            return count;
        }
    
        public void setCount(int count) {
            this.count = count;
        }
    
        public Map<String, FPNode> getChildren() {
            return children;
        }
    
        public void setChildren(Map<String, FPNode> children) {
            this.children = children;
        }
    
        public FPNode getFather() {
            return father;
        }
    
        public void setFather(FPNode father) {
            this.father = father;
        }
    
        public FPNode getNext() {
            return next;
        }
    
        public void setNext(FPNode next) {
            this.next = next;
        }
    
        public boolean isVisited() {
            return visited;
        }
    
        public void setVisited(boolean visited) {
            this.visited = visited;
        }
    }
    package com.coshaho.fptree;
    
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * A log template: a list of words together with an occurrence count.
     *
     * @author coshaho
     * @since 2020/1/6
     */
    public class LogTemplate {
        // Occurrence count of the template.
        private int count;
        // Words that make up the template.
        private List<String> words = new ArrayList<>();

        public int getCount() {
            return this.count;
        }

        public void setCount(int count) {
            this.count = count;
        }

        public List<String> getWords() {
            return this.words;
        }

        public void setWords(List<String> words) {
            this.words = words;
        }

        /** Prints the template to standard output as "words: count". */
        public void print() {
            StringBuilder line = new StringBuilder();
            line.append(this.words).append(": ").append(this.count);
            System.out.println(line.toString());
        }
    }
  • 相关阅读:
    智能客户端请教
    C++位运算 (转并完善)
    Virtual PC 上网设置(宿主机为win7)
    Opencv2.0 lib和dll的编译获取过程 以及 vs2005,vs2008配置过程
    SQL Server 查询处理中的各个阶段(SQL执行顺序) 转
    Win32汇编项目总结——猎杀潜航
    sql server忘记sa密码的解决方法
    ODBC导出Excel遇到的表名错误问题解决(excel 2007)
    Opencv在MFC客户端的Picture控件上显示图片
    忘记深拷贝的后果
  • 原文地址:https://www.cnblogs.com/coshaho/p/12163496.html
Copyright © 2011-2022 走看看