zoukankan      html  css  js  c++  java
  • 从语料中自动挖掘短语

    从语料中自动挖掘短语

    https://github.com/shangjingbo1226/AutoPhrase

    预测搜索短语可采用 FST(有限状态转换器)结构,参考资料如下:

    https://blog.csdn.net/vivian_ll/article/details/95049652

    https://www.youtube.com/watch?v=3kQyYbTyXfc

    https://www.youtube.com/watch?v=k97WC5ijB7U

     https://speakerdeck.com/mschoch/finite-state-transducers-in-go

    package core.index;
    
    
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.*;
    
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    
    /**
     * Small demo of prefix matching with a Lucene finite-state transducer (FST).
     *
     * <p>{@link #buildFst()} builds an FST over a handful of sample phrases, and
     * {@link #search(FST, String)} walks an input string through the FST byte by byte,
     * collecting the end offset of every prefix of the input that is a key of the FST.
     */
    public class FstSearch {

        /** Sample keys. They are listed in sorted order, as the original author notes the builder needs. */
        private static final String[] SAMPLE_INPUTS =
                {"cat house", "dog", "dog house", "dogs house", "dogs houses"};

        /** Output value associated with the sample key at the same index. */
        private static final long[] SAMPLE_OUTPUTS = {5, 7, 8, 12, 16};

        /** Queries printed by {@link #main(String[])}, in print order. */
        private static final String[] SAMPLE_QUERIES = {
                "do", "dog", "dog house", "dogs house", "dogs houses", "c", "ca", "cat", "cat houses"
        };

        /** Queries shorter than this get the wide separator so the result column lines up. */
        private static final int SHORT_QUERY_LENGTH = 8;

        /**
         * Builds the FST for the built-in sample phrases.
         *
         * @return the finished FST mapping each sample phrase to its sample output
         * @throws IOException if the underlying builder fails
         */
        public static FST<Long> buildFst() throws IOException {
            return buildFst(SAMPLE_INPUTS, SAMPLE_OUTPUTS);
        }

        /**
         * Builds a byte-labelled FST mapping {@code inputValues[i]} to {@code outputValues[i]}.
         *
         * <p>The keys must already be in sorted (dictionary) order; this method adds them in
         * the order given and does not sort them.
         *
         * @param inputValues  keys, in sorted order
         * @param outputValues output for each key; must have the same length as {@code inputValues}
         * @return the finished FST
         * @throws IllegalArgumentException if the two arrays differ in length
         * @throws IOException              if the underlying builder fails
         */
        public static FST<Long> buildFst(String[] inputValues, long[] outputValues) throws IOException {
            if (inputValues.length != outputValues.length) {
                throw new IllegalArgumentException("inputValues has " + inputValues.length
                        + " entries but outputValues has " + outputValues.length);
            }
            PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
            Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

            // One scratch buffer is reused for the byte -> int label conversion of every key.
            IntsRefBuilder scratchInts = new IntsRefBuilder();
            for (int i = 0; i < inputValues.length; i++) {
                BytesRef keyBytes = new BytesRef(inputValues[i]);
                builder.add(Util.toIntsRef(keyBytes, scratchInts), outputValues[i]);
            }
            return builder.finish();
        }

        /**
         * Runs the sample queries against the sample FST and prints, for each query, the
         * query followed by the offsets returned by {@link #search(FST, String)}.
         */
        public static void main(String[] args) throws IOException {
            FST<Long> fst = FstSearch.buildFst();

            for (String query : SAMPLE_QUERIES) {
                // Short queries get three tabs, long ones a single tab.
                String separator = query.length() < SHORT_QUERY_LENGTH ? "\t\t\t" : "\t";
                System.out.println(query + separator + search(fst, query));
            }
        }

        /**
         * Finds every prefix of {@code input} that is a key of {@code fst}.
         *
         * <p>The input is converted to bytes with {@code new BytesRef(input)} and followed
         * through the FST one byte at a time. Whenever the bytes consumed so far form an
         * accepted key, the number of bytes consumed is recorded. The walk stops early at the
         * first byte that has no matching arc.
         *
         * <p>The returned offsets are byte offsets. As the original author notes, for English
         * (single-byte) text a byte offset equals the character offset; other languages need
         * a conversion step.
         *
         * @param fst   a byte-labelled FST (checked with an {@code assert})
         * @param input the text to match
         * @return the end offsets, in ascending order, of all prefixes of {@code input} accepted
         *         by {@code fst}; empty if none
         * @throws IOException if reading the FST fails
         */
        public static <T> List<Integer> search(FST<T> fst, String input) throws IOException {
            List<Integer> offsets = new ArrayList<>();

            BytesRef bytesRef = new BytesRef(input);
            assert fst.inputType == FST.INPUT_TYPE.BYTE1;

            FST.BytesReader fstReader = fst.getBytesReader();
            // `arc` is the arc taken for the most recently consumed byte (initially the FST's
            // first arc); `holder` is scratch space that findTargetArc fills with the next arc.
            FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
            FST.Arc<T> holder = new FST.Arc<>();
            for (int i = 0; i < bytesRef.length; ++i) {
                // A final arc means the first i bytes form an accepted key.
                // NOTE(review): the original author remarks that also requiring
                // `arc.target == -1` here narrows the result instead of returning all
                // matches; that extra condition is deliberately left out.
                if (arc.isFinal()) {
                    offsets.add(i);
                }

                int label = bytesRef.bytes[i + bytesRef.offset] & 0xFF; // unsigned byte value
                if (fst.findTargetArc(label, arc, holder, fstReader) == null) {
                    // No arc for this byte: nothing longer can match.
                    return offsets;
                }
                arc.copyFrom(holder);
            }
            // The whole input was consumed; record it if it is itself an accepted key.
            if (arc.isFinal()) {
                offsets.add(bytesRef.length);
            }
            return offsets;
        }
    }
    

      

  • 相关阅读:
    011. Python中*args, **kwargs 和 pass 和self 解释
    010. windows10下安装kivy 1.9.1版
    013. MVC5过滤器
    制作ubuntu16.04 自动安装iso镜像 二
    Nexus安装
    使用docker-compose 大杀器来部署服务 上
    Docker-Compose入门
    nvidia-docker命令详解
    安装使用NVIDIA-Docker-- 可使用GPU的Docker容器
    frp实现内网穿透
  • 原文地址:https://www.cnblogs.com/startnow/p/14092245.html
Copyright © 2011-2022 走看看