  • Using text mining to discover fresh new words ("小鲜词") in 《天龙八部》

    Guiding questions:

    1. How can new words be discovered from a text automatically?

    2. How can a large file be split automatically while processing the data?

    3. How can word extraction be done in Java?

    Before we start, have a look at the words that post-90s users on Renren (人人网) were found to love using.

    Fun, isn't it? The point of this article is to show a simple, automatic way to discover new words from text, so you can tell what young people like these days (for an aging blogger like me, that is genuinely useful, sigh).

    Project structure

    Of course, you can freely replace the two files text.dat and common.dic with your own. Just make sure text.dat contains a substantial amount of data, otherwise the results will not be very good.

    As for the underlying principle, read this article by Matrix67 and you will get it:

    互联网时代的社会语言学:基于SNS的文本数据挖掘 (Sociolinguistics in the Internet era: text data mining based on SNS)
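
    In short, the method from that article scores every candidate string on two measures: internal solidity (how much more often the string occurs than its characters would co-occur by chance) and boundary entropy, computed over the characters seen immediately to its left and right. A rough illustrative sketch of the boundary-entropy part, with made-up counts and class names (not code from this project), might look like this:

    import java.util.HashMap;
    import java.util.Map;

    public class EntropySketch {
        // H = -sum(p * ln p) over the neighbouring characters of a candidate;
        // high entropy on both sides suggests the candidate is a free-standing word
        static double entropy(Map<Character, Integer> neighbourCounts) {
            int total = 0;
            for (int n : neighbourCounts.values()) {
                total += n;
            }
            double h = 0;
            for (int n : neighbourCounts.values()) {
                double p = (double) n / total;
                h -= p * Math.log(p);
            }
            return h;
        }

        public static void main(String[] args) {
            // hypothetical counts of the characters seen right after some candidate word
            Map<Character, Integer> right = new HashMap<Character, Integer>();
            right.put('的', 3);
            right.put('了', 2);
            right.put('是', 1);
            System.out.println(entropy(right)); // roughly 1.01
        }
    }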

    Download of the data to be processed

    Now for the code.

    common

    This package contains the following classes, which mainly define data structures.

    CountMap.java

    Defines a counting map used for data manipulation and persistence.

    package grid.common;
    
    import java.io.Serializable;
    import java.util.HashMap;
    
    
    public class CountMap<T> extends HashMap<T, Integer> implements Serializable {
    
        private static final long serialVersionUID = 6097963798841161750L;
    
        public void increase(T t) { // add an element (increment its counter)
            Integer count = get(t);
            if (null == count) {
                put(t, 1);
            } else {
                put(t, ++count);
            }
        }
    
        public int count() { // total of all counters
            int count = 0;
            for (T t : keySet()) {
                count += get(t);
            }
            return count;
        }
    
        public int get(char c) {
            Integer count = super.get(c);
            return null == count ? 0 : count;
        }
    }

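    A minimal usage sketch for CountMap (not part of the original post), showing increase, the char overload of get, and count:

    import grid.common.CountMap;

    public class CountMapDemo {
        public static void main(String[] args) {
            CountMap<Character> map = new CountMap<Character>();
            for (char c : "天龙八部天龙".toCharArray()) {
                map.increase(c); // bump the counter for this character
            }
            System.out.println(map.get('天')); // 2
            System.out.println(map.count());   // 6, the sum of all counters
        }
    }
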
    Node.java

    Defines a node of the syntax tree.

    package grid.common;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    
    public class Node<T> {
        protected List<Node<T>> children;
    
        protected Node<T> parent;
    
        protected T value;
    
        Node(T value) {
            this.value = value;
        }
    
        public Node<T> add(T value) {
            if (null == children) {
                children = new ArrayList<Node<T>>();
            }
            Node<T> child = new Node<T>(value);
            child.setParent(this);
            children.add(child);
            return child;
        }
    
        public T getValue() {
            return value;
        }
    
        public Node<T> getParent() {
            return parent;
        }
    
        public void setParent(Node<T> parent) {
            this.parent = parent;
        }
        // recursively walk the children, collecting the leaves
        private void recurseChildren(List<Node<T>> list, Node<T> parent) {
            if (null == parent.children) {
                list.add(parent);
            } else {
                for (Node<T> node : parent.children) {
                    recurseChildren(list, node);
                }
            }
        }
    
        public List<Node<T>> getLeaves() {
            List<Node<T>> list = new ArrayList<Node<T>>();
            recurseChildren(list, this);
            return list;
    
        }
    
        public List<T> getBranchPath() {
            List<T> list = new ArrayList<T>();
            Node<T> node = this;
            do {
                list.add(node.getValue());
                node = node.parent;
            } while (null != node && !(node instanceof Tree<?>));
            Collections.reverse(list);
            return list;
        }
    
        private void append(StringBuilder builder, int deep, Node<T> node) {
            for (int i = 0; i < deep; i++) {
                builder.append("  ");
            }
            builder.append("|--");
            builder.append(node.getValue());
            builder.append("\n");
            if (null != node.children) {
                for (Node<T> child : node.children) {
                    append(builder, deep + 1, child);
                }
            }
        }
    
        public String dump() {
            StringBuilder builder = new StringBuilder();
            append(builder, 0, this);
            return builder.toString();
        }
    
        public String toString() {
            return value.toString();
        }
    }
    

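    A small sketch (added here, not from the original post) of how the tree is used during segmentation: nodes are added under a ROOT tree, and getLeaves/getBranchPath recover every complete path:

    import grid.common.Node;
    import grid.common.Tree;

    public class TreeDemo {
        public static void main(String[] args) {
            Tree<String> root = new Tree<String>("ROOT");
            Node<String> first = root.add("中国");
            first.add("人民");
            root.add("人");

            // pretty-print the tree with dump()
            System.out.print(root.dump());

            // every leaf, and the path from just below the root down to it
            for (Node<String> leaf : root.getLeaves()) {
                System.out.println(leaf.getBranchPath()); // e.g. [中国, 人民]
            }
        }
    }
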
    TextDatReader.java

    Reads the data to be processed (and can split large files).

    package grid.common;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    
    public class TextDatReader {
    //   public static String read(String path) throws IOException {
    //          File file = new File(path);
    //          FileReader reader = new FileReader(file);
    //          char buffer[] = new char[(int) file.length()];
    //          reader.read(buffer);
    //          return new String(buffer);
    //      }
        @SuppressWarnings("resource")
        public static String read(String path) throws IOException {
            File file = new File(path);
            FileInputStream s = new FileInputStream(file);
        // open the file as UTF-8
    //      FileReader fr = new FileReader(file);
            BufferedReader reader = new BufferedReader(new InputStreamReader(s,
                    "utf8"));
            char buffer[] = new char[(int) file.length()];
            reader.read(buffer);
            return new String(buffer);
        }
    
        // create the ./dat directory if it does not exist yet
        public static void createDir() {
            File file = new File("./dat");
            if (!file.exists() && !file.isDirectory()) {
                file.mkdir();
            }
        }
    
        public static final String SUFFIX = ".dat"; // suffix for the split files
    
        // Split the given file into pieces of the given size in bytes; name is the file to split, size is the size of each piece
        public static void divide(String name, long size) throws Exception {
            File file = new File(name);
            if (!file.exists() || (!file.isFile())) {
                throw new Exception("The specified file does not exist!");
            }
            // total length of the file
            long fileLength = file.length();
            if (size <= 0) {
                size = fileLength / 2;
            }
            // number of pieces after splitting
            int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
                    : (int) (fileLength / size);
            // file names of the pieces
            String[] fileNames = new String[num];
            // input stream over the file being split
            FileInputStream in = new FileInputStream(file);
            // begin and end offsets into the input stream
            long end = 0;
            int begin = 0;
            createDir();
            // write one output file per piece
            for (int i = 1; i <= num; i++) {
                // the first num - 1 pieces all have the specified size
                File outFile = new File("./dat", "text" + i + SUFFIX);
                // output stream for this piece
                FileOutputStream out = new FileOutputStream(outFile);
                // advance the end offset by size
                end += size;
                end = (end > fileLength) ? fileLength : end;
                // copy bytes from the input stream to the output stream
                for (; begin < end; begin++) {
                    out.write(in.read());
                }
                out.close();
                fileNames[i - 1] = outFile.getAbsolutePath(); // i starts at 1, so index with i - 1
                System.out.println("Sub-file " + i + " generated...");
    
            }
            in.close();
        }
    
        // public static void main(final String[] args) throws Exception {
        // String name = "text.dat";
        // long size = 1024 * 1024 * 4; // split into 4 MB pieces
        // TextDatReader.divide(name, size);
        //
        // }
    
    }

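    A rough usage sketch (added here), assuming a UTF-8 file named text.dat sits in the working directory:

    import grid.common.TextDatReader;

    public class ReaderDemo {
        public static void main(String[] args) throws Exception {
            // split text.dat into 4 MB pieces under ./dat (text1.dat, text2.dat, ...)
            TextDatReader.divide("text.dat", 1024 * 1024 * 4);

            // read one piece back as a UTF-8 string
            String content = TextDatReader.read("./dat/text1.dat");
            System.out.println(content.length());
        }
    }
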
    TextUtils.java

    Utility methods for text handling: blank checks, character classification, and substring matching.

    package grid.common;
    
    
    public class TextUtils {
    
        public static boolean isCnLetter(char c) { // is this a Chinese (CJK) character?
            return c >= 0x4E00 && c <= 0x9FCB;
        }
    
        public static boolean isNumeric(char c) { // is this a decimal digit?
            return c >= '0' && c <= '9';
        }
    
        public static boolean isEnLetter(char c) { // is this an English letter?
            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        }
        // does dest occur in src starting at offset off?
        public static boolean match(String src, int off, String dest) {
            int len = dest.length();
            int srcLen = src.length();
            for (int i = 0; i < len; i++) {
                if (srcLen <= off + i) {
                    return false;
                }
                if (dest.charAt(i) != src.charAt(off + i)) {
                    return false;
                }
            }
            return true;
        }
        // is the string null, empty, or whitespace only?
        public static boolean isBlank(String str) {
            return null == str || str.isEmpty() || str.trim().isEmpty();
        }
    }
    

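    A few quick checks of the helpers (illustrative only), mainly to show what the offset argument of match means:

    import grid.common.TextUtils;

    public class TextUtilsDemo {
        public static void main(String[] args) {
            System.out.println(TextUtils.isCnLetter('天'));  // true
            System.out.println(TextUtils.isEnLetter('a'));   // true
            System.out.println(TextUtils.isBlank("  "));     // true
            // does "龙八" occur in "天龙八部" starting at offset 1?
            System.out.println(TextUtils.match("天龙八部", 1, "龙八")); // true
        }
    }
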
    Tree.java

    The syntax tree (its root node).

    package grid.common;
    
    
    public class Tree<T> extends Node<T> {
    
        public Tree(T value) {
            super(value);
        }
    
    }
    

    dic

    Contains the CnDictionary class.

    CnDictionary.java

    Dictionary handling.

    package grid.text.dic;
    
    import grid.common.CountMap;
    import grid.common.TextDatReader;
    import grid.common.TextUtils;
    
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;
    
    
    public class CnDictionary {
    
        private final String COMMON_WORD_DIC_PATH = "common.dic";
    
        /**
         * This text data is used for character statistics. Replace it with your own if you
         * like.
         */
        private final String COMMON_LETTER_RESOURCE_PATH = "text.dat";
    
        private Set<String> dictionary = new HashSet<String>();
    
        private CountMap<Character> letterCountMap = new CountMap<Character>();
    
        private int totalLetterCount;
    
        private static CnDictionary instance;
        // singleton accessor
        public static CnDictionary Instance() {
            if (null == instance) {
                try {
                    instance = new CnDictionary();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            return instance;
        }
    
        private CnDictionary() throws IOException {
            initWordDic();
            initLetterCountMap();
        }
    
        private void initLetterCountMap() throws IOException {
            String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH); // read the corpus data (text.dat)
            final int len = letterResource.length();
            char c;
            for (int i = 0; i < len; i++) {
                c = letterResource.charAt(i);
                if (TextUtils.isCnLetter(c)) {
                    letterCountMap.increase(c);
                }
            }
            totalLetterCount = letterCountMap.count();
    
        }
    
        private void initWordDic() throws IOException {
    
            String bytes = TextDatReader.read(COMMON_WORD_DIC_PATH); // read the word dictionary (common.dic)
            final int len = bytes.length();
            String s = "";
            char c;
            for (int i = 0; i < len; i++) {
                c = bytes.charAt(i);
    
                if ('\n' == c || '\r' == c || 0 == c) {
                    if (!TextUtils.isBlank(s)) {
                        dictionary.add(s.trim());
                    }
                    s = "";
                } else {
                    s += c;
                }
                if (0 == c) {
                    break;
                }
            }
        }
    
        public boolean contains(String word) {
            return dictionary.contains(word);
        }
    
        public double rate(char c) {
            return (double) letterCountMap.get(c) / totalLetterCount;
        }
    
        public int size() {
            return dictionary.size();
        }
    }
    

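    A quick sketch (added here) of using the dictionary singleton; it expects common.dic and text.dat to be present in the working directory:

    import grid.text.dic.CnDictionary;

    public class DictionaryDemo {
        public static void main(String[] args) {
            CnDictionary dictionary = CnDictionary.Instance();
            System.out.println(dictionary.size());           // number of entries loaded from common.dic
            System.out.println(dictionary.contains("中国"));  // true if the word is in the dictionary
            System.out.println(dictionary.rate('的'));        // relative frequency of the character in text.dat
        }
    }
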
    evolution

    EntropyJudger.java

    Judges candidate words by their solidity rate and boundary entropy.

    package grid.text.evolution;
    
    import grid.common.CountMap;
    import grid.common.TextUtils;
    import grid.text.index.Pos;
    import grid.text.index.TextIndexer;
    
    public class EntropyJudger {
    
        private TextIndexer indexer;
    
        /**
         * Minimum number of times a candidate word must appear
         */
        private static int LEAST_COUNT_THRESHOLD = 5;
    
        /**
         * Threshold on the solidity rate, which is calculated from the candidate
         * word's occurrence count and the counts of its individual characters.
         * 
         * The smaller this value is, the more new words you will get, but with
         * less accuracy. The greater this value is, the fewer new words you will
         * get, but with higher accuracy.
         */
        private static double SOLID_RATE_THRESHOLD = 0.018;
    
        /**
         * Threshold on the boundary entropy, which is calculated from the counts
         * of the characters appearing immediately before and after the candidate word.
         * 
         * The smaller this value is, the more new words you will get, but with
         * less accuracy. The greater this value is, the fewer new words you will
         * get, but with higher accuracy.
         */
        private static double ENTROPY_THRESHOLD = 1.92;
    
        public EntropyJudger(TextIndexer indexer) {
            this.indexer = indexer;
        }
    
        public boolean judge(String candidate) {
            double solidRate = getSolidRate(candidate);
    
            if (solidRate < SOLID_RATE_THRESHOLD) {
                return false;
            }
    
            double entropy = getEntropy(candidate);
    
            if (entropy < ENTROPY_THRESHOLD) {
                return false;
            }
            return true;
        }
    
        private double getEntropy(String candidate) {
            Pos pos = new Pos(candidate);
            CountMap<Character> frontCountMap = new CountMap<Character>();
            CountMap<Character> backCountMap = new CountMap<Character>();
            final int candidateLen = candidate.length();
            int off = 0;
            char c;
            double rate, frontEntropy = 0, backEntropy = 0;
    
            while (indexer.find(pos).isFound()) {
                off = pos.getPos();
    
                c = indexer.charAt(off - 1);
                if (TextUtils.isCnLetter(c)) {
                    frontCountMap.increase(c);
                }
                c = indexer.charAt(off + candidateLen);
                if (TextUtils.isCnLetter(c)) {
                    backCountMap.increase(c);
                }
    
            }
    
            for (char key : frontCountMap.keySet()) {
                rate = (double) frontCountMap.get(key) / frontCountMap.count();
                frontEntropy -= rate * Math.log(rate);
            }
            for (char key : backCountMap.keySet()) {
                rate = (double) backCountMap.get(key) / backCountMap.count();
                backEntropy -= rate * Math.log(rate);
            }
    
            return frontEntropy > backEntropy ? backEntropy : frontEntropy;
    
        }
    
        /**
         * @param candidate
         * @return
         */
        public double getSolidRate(String candidate) {
    
            final int candidateLen = candidate.length();
    
            if (candidateLen < 2) {
                return 1;
            }
    
            final int count = indexer.count(candidate);
            double rate = 1;
    
            if (count < LEAST_COUNT_THRESHOLD) {
                return 0;
            }
    
            for (int i = 0; i < candidateLen; i++) {
                rate *= (double) count / indexer.count("" + candidate.charAt(i));
            }
    
            return Math.pow(rate, 1D / candidateLen) * Math.sqrt(candidateLen);
        }
    
        public void setIndexer(TextIndexer indexer) {
            this.indexer = indexer;
        }
    
    }
    

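    A sketch (added here) of judging a single candidate. The string passed to the indexer is a placeholder for the real corpus; in the actual pipeline the indexer is built once over the whole document:

    import grid.text.evolution.EntropyJudger;
    import grid.text.index.SimpleTextIndexer;
    import grid.text.index.TextIndexer;

    public class JudgerDemo {
        public static void main(String[] args) {
            String document = "...";  // put the corpus text here
            TextIndexer indexer = new SimpleTextIndexer(document);
            EntropyJudger judger = new EntropyJudger(indexer);

            // true only if the candidate is solid enough AND has high boundary entropy
            System.out.println(judger.judge("鲜词"));
            System.out.println(judger.getSolidRate("鲜词"));
        }
    }
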
    NewWordDiscover.java

    The word-extraction driver class.

    package grid.text.evolution;
    
    import grid.common.TextUtils;
    import grid.text.dic.CnDictionary;
    import grid.text.index.CnPreviewTextIndexer;
    import grid.text.index.TextIndexer;
    import grid.text.selector.CnTextSelector;
    import grid.text.selector.TextSelector;
    
    import java.util.HashSet;
    import java.util.Set;
    
    public class NewWordDiscover {
    
        private CnDictionary dictionary;
    
        /**
         * Minimum word length
         */
        private final static int MIN_CANDIDATE_LEN = 2;
    
        /**
         * Maximum word length
         */
        private final static int MAX_CANDIDATE_LEN = 6;
    
        private static Set<Character> structuralLetterSet = new HashSet<Character>();
    
        private static char[] structuralLetters = { '我', '你', '您', '他', '她', '谁',
                '哪', '那', '这', '的', '了', '着', '也', '是', '有', '不', '在', '与', '呢',
                '啊', '呀', '吧', '嗯', '哦', '哈', '呐' };
    
        static {
            for (char c : structuralLetters) {
                structuralLetterSet.add(c);
            }
        }
    
        public NewWordDiscover() {
            dictionary = CnDictionary.Instance();
        }
    
        /**
         * New-word discovery is based on statistics and entropy; make sure the
         * document is on the order of 100 KB or more, or you may get an unsatisfactory result.
         * 
         * @param document
         * @return
         */
        public Set<String> discover(String document) {
    
            Set<String> set = new HashSet<String>();
            TextIndexer indexer = new CnPreviewTextIndexer(document);
            TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN,
                    MAX_CANDIDATE_LEN);
            EntropyJudger judger = new EntropyJudger(indexer);
            String candidate;
            while (!selector.end()) {
                candidate = selector.next();
                if (TextUtils.isBlank(candidate)) {
                    continue;
                }
                if (structuralLetterSet.contains(candidate.charAt(0))
                        || structuralLetterSet.contains(candidate.charAt(candidate
                                .length() - 1))) {
                    continue;
                }
                // Replace IF clause with "set.contains(candidate)" if you want to
                // find new word without any dictionary
                if (dictionary.contains(candidate) || set.contains(candidate)) {
                    selector.select();
                } else if (judger.judge(candidate)) {
                    set.add(candidate);
                }
            }
            return set;
        }
    }
    

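    Calling the discoverer itself takes only a few lines; the full test program near the end of this post adds file splitting and writes the results to disk. A minimal sketch (the dictionary files must be in place, as above):

    import grid.text.evolution.NewWordDiscover;

    import java.util.Set;

    public class DiscoverDemo {
        public static void main(String[] args) {
            String document = "...";  // ideally 100 KB or more of raw text
            NewWordDiscover discover = new NewWordDiscover();
            Set<String> words = discover.discover(document);
            System.out.println(words); // the candidate new words
        }
    }
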
    index

    These classes build an index over the text, making it easy to locate and count word occurrences.

    CnPreviewTextIndexer.java

    package grid.text.index;
    
    import grid.common.TextUtils;
    
    import java.util.HashMap;
    import java.util.Map;
    import java.util.Vector;
    
    public class CnPreviewTextIndexer implements TextIndexer {
    
        private final static int CN_LETTER_COUNT = 5021;
    
        private String document;
    
        private Map<Character, Vector<Integer>> posMap;
    
        public CnPreviewTextIndexer(String document) {
            this.document = document;
            init();
        }
    
        private void init() {
            final int len = document.length();
    
            final int supposedMinCount = 1 + (int) Math.log(len / CN_LETTER_COUNT
                    + 1);
    
            char c;
    
            Vector<Integer> posVector;
    
            posMap = new HashMap<Character, Vector<Integer>>(CN_LETTER_COUNT);
    
            for (int i = 0; i < len; i++) {
                c = document.charAt(i);
                if (!TextUtils.isCnLetter(c)) {
                    continue;
                }
                posVector = posMap.get(c);
                if (null == posVector) {
                    posVector = new Vector<Integer>(supposedMinCount);
                    posMap.put(c, posVector);
                }
                posVector.add(i);
            }
        }
    
        @Override
        public int count(String text) {
    
            if (TextUtils.isBlank(text)) {
                return 0;
            }
    
            Vector<Integer> vector = posMap.get(text.charAt(0));
    
            if (null == vector) {
                return 0;
            }
    
            if (1 == text.length()) {
                return vector.size();
            }
    
            final int size = vector.size();
            int count = 0;
    
            for (int i = 0; i < size; i++) {
                if (TextUtils.match(document, vector.get(i), text)) {
                    count++;
                }
            }
    
            return count;
        }
    
        @Override
        public Pos find(Pos pos) {
            String text = pos.getTarget();
    
            pos.setFound(false);
    
            if (TextUtils.isBlank(text)) {
                return pos;
            }
    
            Vector<Integer> vector = posMap.get(text.charAt(0));
    
            if (null == vector) {
                return pos;
            }
    
            final int arraySize = vector.size();
            final int arrayIndex = pos.arrayIndex + 1;
    
            for (int i = arrayIndex; i < arraySize; i++) {
                if (TextUtils.match(document, vector.get(i), text)) {
                    pos.setFound(true);
                    pos.setPos(vector.get(i));
                    pos.arrayIndex = i;
                    break;
                }
            }
    
            return pos;
        }
    
        @Override
        public int len() {
            return document.length();
        }
    
        @Override
        public String sub(int off, int len) {
            if (off < 0 || off + len >= document.length()) {
                return "";
            }
            return document.substring(off, off + len);
        }
    
        @Override
        public char charAt(int index) {
            if (index < 0 || index >= document.length()) {
                return 0;
            }
            return document.charAt(index);
        }
    }

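    A sketch (added here) of how count and find are used: find walks through the occurrences of the target string one at a time via the Pos cursor defined below.

    import grid.text.index.CnPreviewTextIndexer;
    import grid.text.index.Pos;

    public class IndexerDemo {
        public static void main(String[] args) {
            CnPreviewTextIndexer indexer = new CnPreviewTextIndexer("乔峰乔峰是乔帮主");

            System.out.println(indexer.count("乔峰")); // 2

            // iterate over every occurrence of the target string
            Pos pos = new Pos("乔峰");
            while (indexer.find(pos).isFound()) {
                System.out.println(pos.getPos()); // 0, then 2
            }
        }
    }
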
    Pos.java

    package grid.text.index;
    
    
    public class Pos {
        private String target;
    
        /**
         * Pos for current matched full target text
         */
        private int pos = -1;
    
        /**
         * Index in position array for current matched full target text
         */
        int arrayIndex = -1;
    
        private boolean found = false;
    
        public Pos(String target) {
            this.target = target;
        }
    
        public String getTarget() {
            return target;
        }
    
        public int getPos() {
            return pos;
        }
    
        public boolean isFound() {
            return found;
        }
    
        void setPos(int pos) {
            this.pos = pos;
        }
    
        void setFound(boolean found) {
            this.found = found;
        }
    }
    

    SimpleTextIndexer.java

    package grid.text.index;
    
    
    public class SimpleTextIndexer implements TextIndexer {
    
        private String document;
    
        public SimpleTextIndexer(String document) {
            this.document = document;
        }
    
        @Override
        public int count(String text) {
            int off = 0;
            int count = 0;
            final int len = text.length();
            while ((off = document.indexOf(text, off)) > -1) {
                count++;
                off += len;
            }
            return count;
        }
    
        @Override
        public Pos find(Pos pos) {
            final String text = pos.getTarget();
            final int len = text.length();
            int off = pos.getPos() + len;
            if (pos.getPos() < 0)
                off = 0;
    
            pos.setFound(false);
    
            if ((off = document.indexOf(text, off)) > -1) {
                pos.setFound(true);
                pos.setPos(off);
            }
            return pos;
        }
    
        @Override
        public int len() {
            return document.length();
        }
    
        @Override
        public String sub(int off, int len) {
            return document.substring(off, off + len);
        }
    
        @Override
        public char charAt(int index) {
            if (index < 0 || index >= document.length()) {
                return 0;
            }
            return document.charAt(index);
        }
    }
    

    TextIndexer.java

    package grid.text.index;
    
    
    public interface TextIndexer {
    
        /**
         * @param text
         * @return count for specific text
         */
        public int count(String text);
    
        /**
         * @param pos
         * @return next position for current pos
         */
        public Pos find(Pos pos);
    
        /**
         * @return original document length
         */
        public int len();
    
        /**
         * @param off
         * @param len
         * @return the substring starting at <b>off</b> with length <b>len</b>
         */
        public String sub(int off, int len);
    
        /**
         * @param index
         * @return the character at the specified index
         */
        public char charAt(int index);
    }
    

    participle

    Word segmentation; see the implementations for the details.

    Chunk.java

    package grid.text.participle;
    
    import grid.text.dic.CnDictionary;
    
    import java.util.List;
    
    
    public class Chunk implements Comparable<Chunk> {
    
        private List<String> list;
    
        private int len = 0;
    
        private double avg = 0;
    
        private double variance = 0;
    
        public Chunk(List<String> list) {
            this.list = list;
            init();
        }
    
        private void init() {
    
            for (String s : list) {
                len += s.length();
            }
            avg = (double) len / list.size();
    
            for (String s : list) {
                variance += Math.pow(avg - s.length(), 2);
            }
            variance = Math.sqrt(variance);
        }
    
        public int getLen() {
            return len;
        }
    
        public double getAvg() {
            return avg;
        }
    
        public double getVariance() {
            return variance;
        }
    
        public String getHead() {
            if (null == list || list.isEmpty()) {
                return "";
            }
            return list.get(0);
        }
    
        private int compareDouble(double d1, double d2) {
            if (d1 - d2 < -0.0000001D) {
                return 1;
            } else if (d1 - d2 > 0.0000001D) {
                return -1;
            }
            return 0;
        }
    
        @Override
        public int compareTo(Chunk o) {
    
            if (len != o.len) {
                return o.len - len;
            }
    
            int d = compareDouble(avg, o.avg);
            if (0 != d) {
                return d;
            }
    
            d = compareDouble(variance, o.variance);
            if (0 != d) {
                return d;
            }
    
            CnDictionary dictionary = CnDictionary.Instance();
    
            double rateSrc = 0, rateDest = 0;
            for (String s : list) {
                if (1 == s.length()) {
                    rateSrc += dictionary.rate(s.charAt(0));
                }
            }
            for (String s : o.list) {
                if (1 == s.length()) {
                    rateDest += dictionary.rate(s.charAt(0));
                }
            }
            return compareDouble(rateSrc, rateDest);
        }
    
        public String toString() {
            return list.toString();
        }
    }
    

    ChunkStream.java

    package grid.text.participle;
    
    import grid.common.Node;
    import grid.common.TextUtils;
    import grid.common.Tree;
    import grid.text.dic.CnDictionary;
    
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    
    public class ChunkStream {
    
        /**
         * Define the max supposed word length
         * 
         * You can lower this value if you do not need very long words in the segmentation result
         */
        private static final int MAX_WORD_LEN = 7;
    
        /**
         * Define the look-ahead depth used while segmenting.
         * 
         * Increasing this value improves accuracy only negligibly
         */
        private static final int PREDICT_LEVEL = 3;
    
        private static CnDictionary dictionary = CnDictionary.Instance();
    
        public String next(String text, int off) {
            Tree<String> root = new Tree<String>("ROOT");
            recurse(root, off, text, 0);
            List<Node<String>> list = root.getLeaves();
            List<Chunk> chunkList = new ArrayList<Chunk>();
            for (Node<String> node : list) {
                chunkList.add(new Chunk(node.getBranchPath()));
            }
            Collections.sort(chunkList);
            return chunkList.get(0).getHead();
    
        }
    
        private void recurse(Node<String> node, int off, String text,
                int predictDeep) {
            int len = MAX_WORD_LEN + off > text.length() ? text.length() - off
                    : MAX_WORD_LEN;
    
            while (predictDeep < PREDICT_LEVEL) {
                if (len < 1) {
                    return;
                }
    
                String s = text.substring(off, off + len);
                if (len < 2) {
                    if (!TextUtils.isCnLetter(text.charAt(off))) {
                        break;
                    }
                    recurse(node.add(s), off + 1, text, predictDeep + 1);
                } else if (dictionary.contains(s)) {
                    recurse(node.add(s), off + s.length(), text, predictDeep + 1);
                }
                len--;
            }
        }
    }
    

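    A sketch (added here) of picking the next word from a given offset. ChunkStream enumerates possible chunkings up to PREDICT_LEVEL words deep and returns the head word of the best-ranked chunk; since it goes through the CnDictionary singleton, common.dic and text.dat must be present when you run it:

    import grid.text.participle.ChunkStream;

    public class ChunkDemo {
        public static void main(String[] args) {
            ChunkStream stream = new ChunkStream();
            String text = "我是中国人";
            // the best-rated word starting at offset 2, e.g. "中国人" or "中国" depending on the dictionary
            System.out.println(stream.next(text, 2));
        }
    }
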
    MechanicalParticiple.java

    package grid.text.participle;
    
    import grid.common.TextUtils;
    
    import java.util.Vector;
    
    
    public class MechanicalParticiple {
    
        public Vector<String> partition(String document) {
            Vector<String> vector = new Vector<String>();
            final int docLen = document.length();
            int off = 0;
            char c;
            String seg = "";
            ChunkStream stream = new ChunkStream();
    
            while (off < docLen) {
                c = document.charAt(off);
                if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                    seg += c;
                    off++;
                } else if (TextUtils.isCnLetter(c)) {
                    if (!TextUtils.isBlank(seg)) {
                        vector.add(seg);
                        seg = "";
                    }
                    String word = stream.next(document, off);
                    if (!TextUtils.isBlank(word)) {
                        vector.add(word);
                        off += word.length();
                    }
                } else {
                    if (!TextUtils.isBlank(seg)) {
                        vector.add(seg);
                        seg = "";
                    }
    
                    /**
                     * TODO: Uncomment the "ELSE IF" clause if you would like to
                     * reserve punctuations
                     */
    
                    // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); }
    
                    off++;
                }
            }
            if (!TextUtils.isBlank(seg)) {
                vector.add(seg);
            }
            return vector;
    
        }
    }
    

    selector

    Text selectors that pick out strings that might be new words.

    CnTextSelector.java

    package grid.text.selector;
    
    import grid.common.TextUtils;
    
    
    public class CnTextSelector extends CommonTextSelector {
    
        public CnTextSelector(String document, int minSelectLen, int maxSelectLen) {
            super(document, minSelectLen, maxSelectLen);
        }
    
        protected void adjustCurLen() {
            while (pos < docLen && !TextUtils.isCnLetter(document.charAt(pos))) {
                pos++;
            }
            for (int i = 0; i < maxSelectLen && pos + i < docLen; i++) {
                if (!TextUtils.isCnLetter(document.charAt(pos + i))) {
                    curLen = i;
                    if (curLen < minSelectLen) {
                        pos++;
                        adjustCurLen();
                    }
                    return;
                }
            }
    
            curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen;
        }
    }
    

    CommonTextSelector.java

    package grid.text.selector;
    
    
    public class CommonTextSelector implements TextSelector {
    
        protected String document;
    
        protected int pos = 0;
    
        protected int maxSelectLen = 5;
    
        protected int minSelectLen = 2;
    
        protected int curLen;
    
        protected final int docLen;
    
        public CommonTextSelector(String document, int minSelectLen,
                int maxSelectLen) {
            this.document = document;
            this.minSelectLen = minSelectLen;
            this.maxSelectLen = maxSelectLen;
            docLen = document.length();
            adjustCurLen();
        }
    
        public void select() {
            pos += ++curLen;
            adjustCurLen();
        }
    
        protected void adjustCurLen() {
            curLen = pos + maxSelectLen > docLen ? docLen - pos : maxSelectLen;
        }
    
        public String next() {
            if (curLen < minSelectLen) {
                pos++;
                adjustCurLen();
            }
    
            if (pos + curLen <= docLen && curLen >= minSelectLen) {
                return document.substring(pos, pos + curLen--);
            } else {
                curLen--;
                // return document.substring(pos, docLen);
                return "";
            }
        }
    
        public boolean end() {
            return curLen < minSelectLen && curLen + pos >= docLen - 1;
        }
    
        @Override
        public int getCurPos() {
            return pos;
        }
    }
    

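    A sketch (added here) of the selector loop that NewWordDiscover runs: next() yields the current candidate (longest first), select() jumps past a candidate accepted as a word, and end() terminates the scan:

    import grid.text.selector.CnTextSelector;
    import grid.text.selector.TextSelector;

    public class SelectorDemo {
        public static void main(String[] args) {
            TextSelector selector = new CnTextSelector("乔峰是丐帮帮主", 2, 6);
            while (!selector.end()) {
                String candidate = selector.next();
                if (!candidate.isEmpty()) {
                    System.out.println(candidate); // "乔峰是丐帮帮", "乔峰是丐帮", ...
                }
            }
        }
    }
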
    TextSelector.java

    package grid.text.selector;
    
    
    public interface TextSelector {
        public boolean end();
    
        public void select();
    
        public String next();
    
        public int getCurPos();
    
    }
    

    Test code

    NewWordDiscoverTest.java

    package grid.test;
    
    import grid.common.TextDatReader;
    import grid.text.evolution.NewWordDiscover;
    import grid.text.index.CnPreviewTextIndexer;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.Scanner;
    import java.util.Set;
    
    public class NewWordDiscoverTest {
        public static void writefile(String m) {
    
            try {
                File file = new File("result.txt");
                if (!file.exists()) {
                    file.createNewFile();
                }
                FileWriter fileWritter = new FileWriter(file.getName(), true);
                BufferedWriter bufferWritter = new BufferedWriter(fileWritter);
                bufferWritter.write(m);
                bufferWritter.close();
    
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    
        @SuppressWarnings("resource")
        public static void main(String[] args) throws Exception {
            // clear result.txt before starting, to avoid duplicate data
            File filere = new File("result.txt");
            filere.delete();
    
            Scanner scan = new Scanner(System.in);
            System.out.println("Please enter the name of the file to process:\n");
            String path = scan.next();
            File file = new File(path);
            if (!file.exists() || (!file.isFile())) {
                throw new Exception("The specified file does not exist!");
            }
            long maxsize = 1024 * 1024 * 1024; // 1 GB; files larger than this are split
            long size = 1024 * 1024 * 5; // each sub-file is at most 5 MB
            long fileLength = file.length();
            if (size <= 0) {
                size = fileLength / 2;
            }
            // number of pieces after splitting
            int num = (fileLength % size != 0) ? (int) (fileLength / size + 1)
                    : (int) (fileLength / size);
            if (file.length() >= maxsize) {
                System.out.println("The file is larger than 1 GB. Split it now? 1: yes  0: no\n");
    
                int t = scan.nextInt();
                if (t == 1) {
                    TextDatReader.divide(path, size);
                    System.out.println("Splitting finished\n");
                    System.out.println("The pieces are saved in the dat folder under the current directory\n");
    
                }
                // System.out.println("Enter the number of the file to process, e.g. 1 for text1.dat under the dat folder\n");
                // int m = scans.nextInt();
                for (int m = 1; m <= num; m++) {
                    String pathdived = "./dat/text" + m + ".dat";
                    System.out.println("Extracting words from file " + m + " ...");
                    discoverWord(pathdived);
                }
    
            } else {
                System.out.println("Extracting words from the file...");
                discoverWord(path);
            }
        }
    
        private static void discoverWord(String path) throws IOException {
            String document = TextDatReader.read(path);
            NewWordDiscover discover = new NewWordDiscover();
            Set<String> words = discover.discover(document);
            CnPreviewTextIndexer ci = new CnPreviewTextIndexer(document);
    //      long start = System.currentTimeMillis();
    //      System.out.println("Time taken: " + (double) document.length()
    //              / (System.currentTimeMillis() - start) * 1000);
            System.out.println("Number of new words: " + words.size());
            System.out.println("New words discovered:" + "\n");
            for (String newword : words) {
                // after finding the new words, count how many times each one appears
                System.out.println(newword + "," + ci.count(newword) + "\n");
                writefile(newword + "," + ci.count(newword) + "\n");
            }
        }
    }

    Results of the word-extraction test are as follows.

    ParticipleTest.java

    package grid.test;
    
    import grid.text.participle.MechanicalParticiple;
    
    import java.util.Vector;
    
    
    public class ParticipleTest {
    
        private static String document = "我是中国人";
    
        public static void main(String args[]) {
            MechanicalParticiple participle = new MechanicalParticiple();
            Vector<String> vec = participle.partition(document);
            System.out.println(vec);
        }
    }
    

    Results of the word-segmentation test are as follows.

    Pretty cool, right? You can also try it on the 《天龙八部》 data set and see whether the leading character really is Chief Qiao (乔帮主). If you discover any interesting new words, let the blogger know, so I don't fall behind the times!

    VIP exclusive: new words from 《天龙八部》. If you want to see the results, silently praise the blogger a hundred times first.

    Once you have done that, you also get a Harry Potter edition as a bonus.

  • Original article: https://www.cnblogs.com/ainima/p/6331773.html