  • Implementing custom synonyms in Lucene

     Lucene synonym search is built on the token information recorded by CharTermAttribute (the term text) and PositionIncrementAttribute (the term position): a synonym is emitted as an extra token with a position increment of 0, so it occupies the same position in the index as the original term, and either word will match at search time. The Lucene version used here is 4.8.0. First, implement the synonym TokenFilter:
     
    package lucene_index;
     
    import java.io.IOException;
    import java.util.Map;
    import java.util.Stack;
     
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.AttributeSource;
     
    public class MySameFiter extends TokenFilter {
        private Stack<String> stack = null;                  // pending synonyms for the current token
        private CharTermAttribute cta = null;                // term text
        private PositionIncrementAttribute position = null;  // term position increment
        private AttributeSource.State current;               // captured state of the current token
        private Map<String, String[]> map;                   // synonym table
     
        protected MySameFiter(TokenStream input, Map<String, String[]> map) {
            super(input);
            stack = new Stack<>();
            cta = input.addAttribute(CharTermAttribute.class);
            position = input.addAttribute(PositionIncrementAttribute.class);
            this.map = map;
        }
        @Override
        public boolean incrementToken() throws IOException {
            // emit any pending synonyms at the same position as the original token
            if (!stack.isEmpty()) {
                String word = stack.pop();
                restoreState(current);             // restore the state captured for the original token
                cta.setEmpty();
                cta.append(word);
                position.setPositionIncrement(0);  // 0 = same position as the original term
                return true;
            }
            // advance to the next token of the underlying stream
            if (!input.incrementToken()) {
                return false;
            }
            // if the current term has synonyms, capture its state for the emits above
            if (getSameWrds(cta.toString())) {
                current = captureState();
            }
            return true;
        }
     
        // pushes all synonyms of the given term onto the stack; returns true if any exist
        private boolean getSameWrds(String words) {
            String[] arr = map.get(words);
            if (arr != null) {
                for (String word : arr) {
                    stack.push(word);
                }
                return true;
            }
            return false;
        }
     
    }
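     For comparison, Lucene 4.8 also ships a built-in synonym filter, org.apache.lucene.analysis.synonym.SynonymFilter, driven by a SynonymMap; unlike the hand-rolled filter above, it also handles multi-token synonyms. A minimal sketch of wiring it up (the sample entry is an illustrative assumption, not part of the original post):
     
    package lucene_index;
     
    import java.io.IOException;
     
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.synonym.SynonymFilter;
    import org.apache.lucene.analysis.synonym.SynonymMap;
    import org.apache.lucene.util.CharsRef;
     
    public class BuiltinSynonymDemo {
        // wraps an existing TokenStream with Lucene's own SynonymFilter
        static TokenStream withSynonyms(TokenStream input) throws IOException {
            SynonymMap.Builder builder = new SynonymMap.Builder(true); // true = deduplicate entries
            // sample entry: expand "建行" to "中国建设银行" while keeping the original token
            builder.add(new CharsRef("建行"), new CharsRef("中国建设银行"), true);
            return new SynonymFilter(input, builder.build(), true); // true = ignore case
        }
    }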
     
     The custom analyzer:
     
    package lucene_index;
     
    import java.io.Reader;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.wltea.analyzer.lucene.IKTokenizer;
     
    public class StopWrodsAnalyse extends Analyzer {
        private Map<String, String[]> map = new HashMap<String, String[]>();
     
        public StopWrodsAnalyse(Map<String, String[]> map) {
            this.map = map;
        }
     
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // IK tokenizer (false = fine-grained segmentation) wrapped by the custom synonym filter
            Tokenizer source = new IKTokenizer(reader, false);
            TokenStream stream = new MySameFiter(source, map);
            // a StopFilter could additionally be chained here if stop-word removal is needed
            return new TokenStreamComponents(source, stream);
        }
     
    }
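     To verify that the analyzer emits synonyms at the same position, run a sample string through it and print each term with its position increment; synonyms should show an increment of 0. A minimal sketch, assuming the sample map entry below and that the IK dictionary keeps 建行 as a single token:
     
    package lucene_index;
     
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
     
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
     
    public class TokenStreamDemo {
        public static void main(String[] args) throws IOException {
            Map<String, String[]> map = new HashMap<String, String[]>();
            map.put("建行", new String[] { "中国建设银行", "建设银行" }); // sample synonym entry
            Analyzer analyzer = new StopWrodsAnalyse(map);
            TokenStream ts = analyzer.tokenStream("name", new StringReader("建行"));
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // a position increment of 0 means "same position as the previous token"
                System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
            }
            ts.end();
            ts.close();
        }
    }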
     
     
     
    package lucene_index;
     
    import java.io.File;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.HashMap;
    import java.util.Map;
     
    import org.apache.commons.io.FileUtils;
    import org.apache.commons.io.LineIterator;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
     
    public class MainTest {
        public static void main(String[] args) throws IOException, ParseException {
            LineIterator it = FileUtils.lineIterator(new File("E://searchwork_custom//data_index//ConfigFile//ExpansionWord.csv"),"gbk");
            Map<String, String []> map = new HashMap<String, String[]>();
            while (it.hasNext()) {
                String word = it.nextLine();
            // first column is the key term; the whole row is stored as its synonym set
            String[] wordArr = word.replace("-,", "").trim().split(",");
                if(map.containsKey(wordArr[0]))
                    continue;
                map.put(wordArr[0], wordArr);
            }
            Analyzer analyzer = new StopWrodsAnalyse(map);
        Directory directory = FSDirectory.open(new File("E:\\luceneindex"));
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
            IndexWriter writer = new IndexWriter(directory, config);
            Collection<Document> coll = new ArrayList<Document>();
            for (Map.Entry<String, String []> entry : map.entrySet()) {
            Document doc = new Document();
            doc.add(new TextField("name", entry.getKey(), Store.YES)); // analyzed and stored
            coll.add(doc);
            }
            writer.addDocuments(coll);
            writer.commit();
            writer.close();
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("E:\\luceneindex"))));
          // QueryParser parser = new QueryParser(Version.LUCENE_48, "name", analyzer);
            search(searcher);
            //WordInfo.getWordInfo(word, analyzer);
        }
        public static void search(IndexSearcher searcher) throws IOException{
            Query q =  new TermQuery(new Term("name","中国建设银行"));
            System.out.println(q);
            TopDocs doc = searcher.search(q, 10);
            ScoreDoc [] docs = doc.scoreDocs;
            for (int i = 0; i < docs.length; i++) {
                Document d = searcher.doc(docs[i].doc);
                System.out.println(d.get("name"));
            }
            
        }
    }
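     Note that TermQuery bypasses the analyzer, so the query above only matches because the synonym terms were physically written to the index. To analyze the user's query string with the same analyzer (as the commented-out QueryParser line hints), something along these lines could be used; a sketch, not taken from the original post:
     
    package lucene_index;
     
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.util.Version;
     
    public class QuerySideDemo {
        // runs the raw input through the analyzer, so synonym expansion also applies at query time
        static Query buildQuery(Analyzer analyzer, String userInput) throws ParseException {
            QueryParser parser = new QueryParser(Version.LUCENE_48, "name", analyzer);
            return parser.parse(userInput);
        }
    }
     
     In practice, synonym expansion is usually applied on only one side (index time or query time) to avoid redundant terms.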
     
     

    3. Testing
     Searching for 建行, 建设银行, or 中国建设银行 returns the documents indexed under any of these synonyms.
    (screenshot of search results in the original post)
  • Original post: https://www.cnblogs.com/wangnanhui/p/6833172.html