zoukankan      html  css  js  c++  java
  • 搜索引擎之全文搜索算法功能实现(基于Lucene)

    之前做去转盘网的时候,我已经公开了非全文搜索的代码,有需要的朋友可以前去阅读我的博客。本文主要讨论如何实现全文搜索。我花了很长时间设计了新作“观点”,它对全文搜索的要求很高,所以我又花了不少时间研究全文搜索,你可以先体验一下:点我搜索。废话不多说,直接上代码:

     /**
      * Builds and executes the full-text article query described by {@code condition}.
      *
      * <p>The query has two mandatory parts: an optional filter (article type and
      * writing-time range) and a keyword part that ORs together an exact term match,
      * a prefix match, per-field phrase and fuzzy matches, and a parsed multi-field
      * query.
      *
      * @param condition search parameters (fields, keyword, paging, sorting, filters)
      * @param searcher  searcher over the article index
      * @return a map with keys {@code "amount"} (total hit count, {@code Long}),
      *         {@code "tds"} (the requested page as {@code TopDocs}) and
      *         {@code "query"} (the executed query); {@code null} when there is
      *         nothing to search for (no fields, or a null/blank keyword)
      * @throws ParseException           if the keyword cannot be parsed
      * @throws IOException              if reading the index fails
      * @throws IllegalArgumentException if the page size is not positive
      */
     public Map<String,Object>  articleSearchAlgorithms(SearchCondition condition,IndexSearcher searcher) throws ParseException, IOException{

         Map<String,Object> map = new HashMap<String,Object>();
         String[] filedsList = condition.getFiledsList();
         String keyWord = condition.getKeyWord();
         int currentPage = condition.getCurrentPage();
         int pageSize = condition.getPageSize();
         String sortField = condition.getSortField();
         // The third SortField constructor argument means "reverse", so the
         // descending flag is passed through unchanged.
         boolean reverse = condition.isDESC();
         String sDate = condition.getsDate();
         String eDate = condition.geteDate();
         String classify = condition.getClassify();

         // Nothing to search for: report "no result" instead of failing later
         // with a NullPointerException or ParseException.
         if (filedsList == null || filedsList.length == 0 || keyWord == null) {
             return null;
         }

         // Escape characters that have a special meaning in the query syntax.
         keyWord = escapeExprSpecialWord(keyWord);
         if (keyWord == null || keyWord.trim().length() == 0) {
             return null;
         }

         if (pageSize < 1) {
             throw new IllegalArgumentException("pageSize must be > 0, got " + pageSize);
         }
         if (currentPage < 1) {
             // Pages are 1-based; treat anything smaller as the first page.
             currentPage = 1;
         }

         // ---- filter part: article type and writing-time range ----
         BooleanQuery filterQuery = new BooleanQuery();

         if ("guanzhi".equals(classify) || "opinion".equals(classify) || "write".equals(classify)) {
             String typeId = "1"; // default type, used for "write"
             if ("guanzhi".equals(classify)) {
                 typeId = "2";
             } else if ("opinion".equals(classify)) {
                 typeId = "3";
             }
             filterQuery.add(new TermQuery(new Term("typeId", typeId)), BooleanClause.Occur.MUST);
         }

         // A range query is applied only when both bounds are given.
         if (sDate != null && eDate != null) {
             Query rangeQuery = new TermRangeQuery("writingTime", new BytesRef(sDate), new BytesRef(eDate), true, true);
             filterQuery.add(rangeQuery, BooleanClause.Occur.MUST);
         }

         // ---- sorting and paging ----
         Sort sort = new Sort();
         if (sortField != null) {
             sort.setSort(new SortField(sortField, SortField.Type.STRING, reverse));
         } else {
             sort.setSort(SortField.FIELD_SCORE);
         }

         int start = (currentPage - 1) * pageSize;
         int hm = start + pageSize; // collect enough hits to cover the requested page

         TopFieldCollector res = TopFieldCollector.create(sort, hm, false, false, false, false);

         // ---- keyword part ----
         BooleanQuery keywordQuery = new BooleanQuery();

         // The field at index 1 gets the two high-precision matches: exact term and prefix.
         if (filedsList.length > 1) {
             Term exactTerm = new Term(filedsList[1], keyWord);
             keywordQuery.add(new TermQuery(exactTerm), BooleanClause.Occur.SHOULD);
             keywordQuery.add(new PrefixQuery(exactTerm), BooleanClause.Occur.SHOULD);
         }

         // Every other field gets a phrase match and a fuzzy match (edit distance 2);
         // these are meant for analyzed (tokenized) content.
         for (int i = 0; i < filedsList.length; i++) {
             if (i == 1) {
                 continue;
             }
             PhraseQuery phraseQuery = new PhraseQuery();
             phraseQuery.add(new Term(filedsList[i], keyWord));

             FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(filedsList[i], keyWord), 2);

             keywordQuery.add(phraseQuery, BooleanClause.Occur.SHOULD);
             keywordQuery.add(fuzzyQuery, BooleanClause.Occur.SHOULD);
         }

         MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_47, filedsList, analyzer);
         queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
         keywordQuery.add(queryParser.parse(keyWord), BooleanClause.Occur.SHOULD);

         // ---- combine: an empty sub-query must not be added as MUST,
         //      otherwise the results differ ----
         BooleanQuery booleanQuery = new BooleanQuery();
         if (!filterQuery.clauses().isEmpty()) {
             booleanQuery.add(filterQuery, BooleanClause.Occur.MUST);
         }
         if (!keywordQuery.clauses().isEmpty()) {
             booleanQuery.add(keywordQuery, BooleanClause.Occur.MUST);
         }

         searcher.search(booleanQuery, res);
         long amount = res.getTotalHits();
         TopDocs tds = res.topDocs(start, pageSize);
         map.put("amount", amount);
         map.put("tds", tds);
         map.put("query", booleanQuery);
         return map;
     }

    注意:上面代码中的搜索条件(SearchCondition)来自观点网的具体需求,很难适配所有读者,您可以按照自己的搜索条件进行改动。

    /**
     * Runs the article search for {@code condition} and turns the hits into
     * {@code Write} objects, optionally with highlighted title and content.
     *
     * <p>Search failures are reported on stderr and produce a partial or empty
     * result rather than an exception.
     *
     * @param condition search parameters; also supplies the index reader, the URL
     *                  handed to {@code getSearcher}, and the highlight flag
     * @return a map that always contains {@code "amount"} (total hit count, or
     *         {@code 0L} when nothing was searched or the search failed) and
     *         {@code "source"} (the list of articles for the requested page,
     *         possibly empty)
     * @throws Exception if the searcher cannot be obtained
     */
    public Map<String, Object> searchArticle(SearchCondition condition) throws Exception{

        Map<String,Object> map = new HashMap<String,Object>();
        List<Write> list = new ArrayList<Write>();
        // Defaults, so that every return path carries both keys.
        map.put("amount", 0L);
        map.put("source", list);

        DirectoryReader reader = condition.getReader();
        String url = condition.getURL();
        boolean highlightEnabled = condition.isHighlight();
        String keyWord = condition.getKeyWord();
        boolean hasKeyWord = keyWord != null && keyWord.length() > 0;
        IndexSearcher searcher = getSearcher(reader, url);

        try {
            Map<String,Object> output = articleSearchAlgorithms(condition, searcher);
            if (output == null) {
                return map;
            }

            map.put("amount", output.get("amount"));
            TopDocs tds = (TopDocs) output.get("tds");
            ScoreDoc[] sd = tds.scoreDocs;
            Query query = (Query) output.get("query");

            for (int i = 0; i < sd.length; i++) {

                Document doc = searcher.doc(sd[i].doc);

                // Skip index entries whose id is missing or not numeric instead of
                // letting one bad document abort the whole page.
                long articleId;
                try {
                    articleId = Long.parseLong(doc.get("id"));
                } catch (NumberFormatException malformedId) {
                    continue;
                }

                // Load the article first: if it no longer exists there is no point
                // in spending time on highlighting.
                Write write = writeDao.getArticle(articleId);
                if (write == null) {
                    continue;
                }

                /* ---------- title ---------- */
                // NOTE(review): the title is not HTML-encoded while the content is —
                // confirm that the view layer escapes or trusts it.
                String rawTitle = doc.get("title");
                String title = rawTitle; // not highlighted by default
                if (highlightEnabled && rawTitle != null) {
                    Highlighter highlighterTitle = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                    highlighterTitle.setTextFragmenter(new SimpleFragmenter(40)); // fragment length
                    TokenStream ts = analyzer.tokenStream("title", new StringReader(rawTitle));
                    title = highlighterTitle.getBestFragment(ts, rawTitle);
                    if (title == null) {
                        // Workaround: the highlighter sometimes returns no fragment;
                        // fall back to a plain textual replacement.
                        title = hasKeyWord
                                ? rawTitle.replace(keyWord, "<span style='color:red'>" + keyWord + "</span>")
                                : rawTitle;
                    }
                }

                /* ---------- content ---------- */
                // Encoded with the project's own helper. StringEscapeUtils.escapeHtml is
                // deliberately not used: it escapes Chinese characters, which makes
                // highlighting fail.
                String encodedContent = HtmlEnDecode.htmlEncode(doc.get("content"));
                String content = encodedContent;

                if (highlightEnabled && encodedContent != null) {
                    Highlighter highlighterContent = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
                    highlighterContent.setTextFragmenter(new SimpleFragmenter(Constant.HIGHLIGHT_CONTENT_LENGTH)); // fragment length
                    TokenStream ts1 = analyzer.tokenStream("content", new StringReader(encodedContent));
                    content = highlighterContent.getBestFragment(ts1, encodedContent);

                    if (content == null) {
                        // Same highlighter workaround as for the title.
                        content = hasKeyWord
                                ? encodedContent.replace(keyWord, "<span style='color:red'>" + keyWord + "</span>")
                                : encodedContent;

                        // In this fallback the text has to be shortened by hand; when the
                        // highlighter does return a fragment it is already cut to length.
                        content = subContent(content);
                        content = HtmlEnDecode.htmldecode(content); // HTML-decode
                        content = SubStringHTML.sub(content, Constant.HIGHLIGHT_CONTENT_LENGTH);
                    }
                }

                /* ---------- frequently changing data comes from the database ---------- */
                write.setTitle(title);
                write.setContent(content);

                Date writingTime = write.getWritingTime();
                String timeGap = DateUtil.dateGap(writingTime);
                write.setTimeGap(timeGap);

                list.add(write);
            }

        } catch (Exception e) {
            // Deliberate best effort: a failed search yields whatever was collected so
            // far. NOTE(review): route this through the project's logger if one exists.
            e.printStackTrace();
        }
        return map;
    }

    注意上面,这是具体的搜索代码,不同的应用场景有不同的需求,请您按照自己的需求封装对象,查询数据库等,代码毫无保留,绝对可用。

    如果有什么疑问,可以加 QQ 群:284205104。如果群满了,麻烦到去转盘网找一下最新的群号加入即可,谢谢您的阅读。

  • 相关阅读:
    Codeforces Round #336 B
    Codeforces Round #336 A
    hiho一下157
    Codeforces Round #420 E
    Codeforces Round #420 C
    Codeforces Round #420 B
    Codeforces Round #420 A
    Codeforces Round #418 C
    Codeforces Round #418 B
    CodeForces 811D Vladik and Favorite Game bfs,模拟
  • 原文地址:https://www.cnblogs.com/huangxie/p/8059132.html
Copyright © 2011-2022 走看看