/*
 * 从语料中自动挖掘短语：
 * https://github.com/shangjingbo1226/AutoPhrase
 *
 * 预测搜索短语可采用 FST 结构，参考：
 * https://blog.csdn.net/vivian_ll/article/details/95049652
 * https://www.youtube.com/watch?v=3kQyYbTyXfc
 * https://www.youtube.com/watch?v=k97WC5ijB7U
 * https://speakerdeck.com/mschoch/finite-state-transducers-in-go
 */
package core.index;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Small demonstration of prefix matching against a Lucene finite state transducer (FST).
 *
 * <p>{@link #buildFst()} compiles a fixed sample dictionary of phrases, and
 * {@link #search(FST, String)} reports the end offsets of every dictionary entry that is a
 * prefix of a given input string.
 */
public class FstSearch {

    /**
     * Builds a byte-labelled FST from a hard-coded sample dictionary.
     *
     * <p>Keys are added in lexicographic order, which the FST builder expects; the sample keys
     * below are already sorted. Each key is paired with the value at the same index of the
     * output array.
     *
     * @return the finished FST mapping each sample phrase to its {@code long} output
     * @throws IOException if the underlying FST builder fails
     */
    public static FST<Long> buildFst() throws IOException {
        // Parallel arrays: inputValues[i] maps to outputValues[i]. Keep the keys sorted.
        String[] inputValues = {"cat house", "dog", "dog house", "dogs house", "dogs houses"};
        long[] outputValues = {5, 7, 8, 12, 16};

        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
        // Reused scratch buffer for converting each key's bytes into the ints the builder consumes.
        IntsRefBuilder scratchInts = new IntsRefBuilder();
        for (int i = 0; i < inputValues.length; i++) {
            BytesRef keyBytes = new BytesRef(inputValues[i]);
            builder.add(Util.toIntsRef(keyBytes, scratchInts), outputValues[i]);
        }
        return builder.finish();
    }

    /**
     * Builds the sample FST and prints, for a handful of queries, the query followed by the
     * offsets returned by {@link #search(FST, String)}.
     *
     * @param args ignored
     * @throws IOException if building or reading the FST fails
     */
    public static void main(String[] args) throws IOException {
        FST<Long> fst = FstSearch.buildFst();
        String[] queries = {
            "do", "dog", "dog house", "dogs house", "dogs houses", "c", "ca", "cat", "cat houses"
        };
        for (String query : queries) {
            System.out.println(query + " " + search(fst, query));
        }
    }

    /**
     * Walks the FST along the bytes of {@code input} and collects every offset at which the
     * arc reached so far is final, i.e. the bytes consumed up to that offset form a complete
     * entry of the FST.
     *
     * <p>Offsets are <em>byte</em> offsets into the encoded input. For plain English (ASCII)
     * text one byte equals one character, so they can be used as character offsets directly;
     * for other languages the offsets would need to be converted.
     *
     * @param fst   the FST to match against; expected to use {@code BYTE1} input labels
     * @param input the text whose prefixes are looked up
     * @param <T>   the FST output type (outputs are not read by this method)
     * @return the end offsets, in ascending order, of all prefixes of {@code input} that end
     *         on a final arc; empty if there are none
     * @throws IOException if reading the FST fails
     */
    public static <T> List<Integer> search(FST<T> fst, String input) throws IOException {
        assert fst.inputType == FST.INPUT_TYPE.BYTE1;

        List<Integer> offsets = new ArrayList<>();
        BytesRef bytesRef = new BytesRef(input);
        FST.BytesReader fstReader = fst.getBytesReader();

        // 'current' is the arc we have followed so far; 'next' is scratch space that
        // findTargetArc fills with the arc for the following byte.
        FST.Arc<T> current = fst.getFirstArc(new FST.Arc<T>());
        FST.Arc<T> next = new FST.Arc<>();

        for (int i = 0; i < bytesRef.length; ++i) {
            // A final arc means the i bytes consumed so far are a complete entry.
            // NOTE(review): the original author's note suggests that additionally requiring
            // current.target == -1 would no longer return all matches — not verified here.
            if (current.isFinal()) {
                offsets.add(i);
            }

            // Mask to an unsigned label; Java bytes are signed.
            int label = bytesRef.bytes[bytesRef.offset + i] & 0xFF;
            if (fst.findTargetArc(label, current, next, fstReader) == null) {
                // No transition for this byte: nothing longer can match.
                return offsets;
            }
            current.copyFrom(next);
        }

        // The whole input was consumed; record it if it is itself a complete entry.
        if (current.isFinal()) {
            offsets.add(bytesRef.length);
        }
        return offsets;
    }
}
