zoukankan      html  css  js  c++  java
  • lucene.net全文检索(二)lucene.net 的封装

    查询

       /// <summary>
       /// Read-side wrapper around the Lucene index located at <c>StaticConstant.IndexPath</c>.
       /// Query strings are parsed against the "title" field with the PanGu analyzer and
       /// every hit is mapped back to a <c>Commodity</c>.
       /// </summary>
       public class LuceneQuery : ILuceneQuery
        {
            #region Identity
            private Logger logger = new Logger(typeof(LuceneQuery));

            /// <summary>Upper bound on the number of hits fetched from Lucene per search.</summary>
            private const int MaxHits = 10000;
            #endregion Identity

            #region QueryIndex
            /// <summary>
            /// Runs <paramref name="queryString"/> against the index and returns the matching commodities
            /// (at most <c>MaxHits</c> of them).
            /// </summary>
            /// <param name="queryString">Lucene query-parser syntax; the default field is "title".</param>
            /// <returns>The matching commodities in the order Lucene returned them; never null.</returns>
            public List<Commodity> QueryIndex(string queryString)
            {
                List<Commodity> ciList = new List<Commodity>();

                // The searcher is disposed before the directory it reads from.
                using (FSDirectory dir = FSDirectory.Open(StaticConstant.IndexPath))
                using (IndexSearcher searcher = new IndexSearcher(dir))
                {
                    Query query = ParseTitleQuery(queryString);
                    Console.WriteLine(query.ToString()); // show the parsed search expression
                    TopDocs docs = searcher.Search(query, (Filter)null, MaxHits);

                    foreach (ScoreDoc sd in docs.ScoreDocs)
                    {
                        ciList.Add(DocumentToCommodityInfo(searcher.Doc(sd.Doc)));
                    }
                }

                return ciList;
            }

            /// <summary>
            /// Runs <paramref name="queryString"/> against the index and returns one page of results,
            /// optionally restricted to a price range and ordered by price.
            /// </summary>
            /// <param name="queryString">Lucene query-parser syntax; the default field is "title".</param>
            /// <param name="pageIndex">1-based page number; values below 1 are treated as 1.</param>
            /// <param name="pageSize">Number of items per page.</param>
            /// <param name="totalCount">Receives the total number of hits reported by Lucene
            /// (this may be larger than the <c>MaxHits</c> documents that can actually be paged through).</param>
            /// <param name="priceFilter">Optional range such as "[10,20]" or "{10,20}"; a leading '[' makes the
            /// lower bound inclusive and a trailing ']' makes the upper bound inclusive. Null/blank disables the filter.</param>
            /// <param name="priceOrderBy">Optional ordering; a value ending in "desc" sorts by price descending,
            /// any other non-blank value sorts by price ascending. Null/blank keeps relevance order.</param>
            /// <returns>The commodities on the requested page; empty when the page is past the last hit.</returns>
            /// <exception cref="ArgumentException"><paramref name="priceFilter"/> is not two comma-separated numbers.</exception>
            public List<Commodity> QueryIndexPage(string queryString, int pageIndex, int pageSize, out int totalCount, string priceFilter, string priceOrderBy)
            {
                totalCount = 0;
                List<Commodity> ciList = new List<Commodity>();

                // Validate and translate the arguments before touching the index.
                Query query = ParseTitleQuery(queryString);
                NumericRangeFilter<float> numPriceFilter = ParsePriceFilter(priceFilter);
                Sort sort = BuildPriceSort(priceOrderBy);

                pageIndex = Math.Max(1, pageIndex); // pages are 1-based
                // long arithmetic so a large page number cannot overflow into a negative index
                long startIndex = (long)(pageIndex - 1) * pageSize;
                long endIndex = (long)pageIndex * pageSize;

                using (FSDirectory dir = FSDirectory.Open(StaticConstant.IndexPath))
                using (IndexSearcher searcher = new IndexSearcher(dir))
                {
                    TopDocs docs = searcher.Search(query, numPriceFilter, MaxHits, sort);
                    totalCount = docs.TotalHits;

                    // Bound by the hits actually retrieved: TotalHits can exceed MaxHits,
                    // in which case indexing ScoreDocs by TotalHits would run off the array.
                    ScoreDoc[] hits = docs.ScoreDocs;
                    for (long i = startIndex; i < endIndex && i < hits.Length; i++)
                    {
                        ciList.Add(DocumentToCommodityInfo(searcher.Doc(hits[i].Doc)));
                    }
                }

                return ciList;
            }

            /// <summary>
            /// Logs the product id and score of every hit in the half-open range
            /// [<paramref name="startIndex"/>, <paramref name="endIndex"/>). Debugging aid.
            /// </summary>
            private void PrintScores(TopDocs docs, int startIndex, int endIndex, Searcher searcher)
            {
                ScoreDoc[] scoreDocs = docs.ScoreDocs;
                for (int i = startIndex; i < endIndex && i < scoreDocs.Length; i++)
                {
                    Document doc = searcher.Doc(scoreDocs[i].Doc);
                    logger.Info(string.Format("{0}的分值为{1}", doc.Get("productid"), scoreDocs[i].Score));
                }
            }

            #endregion QueryIndex

            #region private
            /// <summary>Parses a query string with the PanGu analyzer, using "title" as the default field.</summary>
            private static Query ParseTitleQuery(string queryString)
            {
                Analyzer analyzer = new PanGuAnalyzer();
                QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
                return parser.Parse(queryString);
            }

            /// <summary>
            /// Converts a textual price range into a numeric filter on the "price" field,
            /// or returns null when no filter was requested.
            /// </summary>
            private static NumericRangeFilter<float> ParsePriceFilter(string priceFilter)
            {
                if (string.IsNullOrWhiteSpace(priceFilter))
                {
                    return null;
                }

                string range = priceFilter.Trim();
                bool isContainStart = range.StartsWith("[", StringComparison.Ordinal);
                bool isContainEnd = range.EndsWith("]", StringComparison.Ordinal);
                string[] bounds = range.Replace("[", "").Replace("]", "").Replace("{", "").Replace("}", "").Split(',');

                float start = 0;
                float end = 0;
                // The two bounds are comma-separated, so the decimal separator has to be '.':
                // parse with the invariant culture instead of whatever the host machine uses.
                if (bounds.Length != 2
                    || !float.TryParse(bounds[0], System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out start)
                    || !float.TryParse(bounds[1], System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out end))
                {
                    throw new ArgumentException("Wrong priceFilter");
                }

                return NumericRangeFilter.NewFloatRange("price", start, end, isContainStart, isContainEnd);
            }

            /// <summary>
            /// Builds the sort for a search: relevance by default, or by the "price" field
            /// when <paramref name="priceOrderBy"/> is supplied.
            /// </summary>
            private static Sort BuildPriceSort(string priceOrderBy)
            {
                Sort sort = new Sort();
                if (!string.IsNullOrWhiteSpace(priceOrderBy))
                {
                    // SortField's third argument is "reverse": it must be true for descending order,
                    // not for ascending order.
                    bool descending = priceOrderBy.Trim().EndsWith("desc", StringComparison.OrdinalIgnoreCase);
                    sort.SetSort(new SortField("price", SortField.FLOAT, descending));
                }
                return sort;
            }

            /// <summary>Maps the stored fields of an index document back to a <c>Commodity</c>.</summary>
            private Commodity DocumentToCommodityInfo(Document doc)
            {
                return new Commodity()
                           {
                               Id = int.Parse(doc.Get("id")),
                               Title = doc.Get("title"),
                               ProductId = long.Parse(doc.Get("productid")),
                               CategoryId = int.Parse(doc.Get("categoryid")),
                               // The field is written as "imageurl" when the document is indexed.
                               ImageUrl = doc.Get("imageurl"),
                               Price = decimal.Parse(doc.Get("price")),
                               Url = doc.Get("url")
                           };
            }

            #endregion private
        }
    View Code

    批量/单个索引的增删改

        /// <summary>
        /// Write-side wrapper around the Lucene index located at <c>StaticConstant.IndexPath</c>:
        /// bulk build, merge of child indexes, and single/batch insert, delete and update.
        /// Design notes from the author: for multi-threaded builds, write to several child
        /// index folders and merge them afterwards; for deferred updates, use an asynchronous queue.
        /// </summary>
        public class LuceneBulid : ILuceneBulid
        {
            #region Identity
            private Logger logger = new Logger(typeof(LuceneBulid));
            #endregion Identity

            #region Bulk BuildIndex / index merge
            /// <summary>
            /// Adds every commodity in <paramref name="ciList"/> to a single index folder.
            /// </summary>
            /// <param name="ciList">Commodities to index; all of them go to the same folder. Null/empty is a no-op.</param>
            /// <param name="pathSuffix">Child folder below the root index path (e.g. "sa1"); blank means the root folder.</param>
            /// <param name="isCreate">False (default) appends to the existing index; true replaces it.
            /// When no index exists yet in the folder, one is created regardless of this flag.</param>
            public void BuildIndex(List<Commodity> ciList, string pathSuffix = "", bool isCreate = false)
            {
                if (ciList == null || ciList.Count == 0)
                {
                    return;
                }

                string rootIndexPath = StaticConstant.IndexPath;
                string indexPath = string.IsNullOrWhiteSpace(pathSuffix) ? rootIndexPath : CombineIndexPath(rootIndexPath, pathSuffix);

                DirectoryInfo dirInfo = Directory.CreateDirectory(indexPath);
                using (LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo))
                {
                    // Opening a writer in append mode on a folder without an index throws,
                    // so fall back to create mode for the very first build.
                    bool create = isCreate || !IndexReader.IndexExists(directory);
                    using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), create, IndexWriter.MaxFieldLength.LIMITED))
                    {
                        writer.SetMaxBufferedDocs(100); // documents buffered in memory before a new segment is flushed
                        writer.MergeFactor = 100;       // how often segments are merged
                        writer.UseCompoundFile = true;  // compound files keep the number of index files down

                        foreach (Commodity ci in ciList)
                        {
                            CreateCIIndex(writer, ci);
                        }
                        // No Optimize() here: merging is left to MergeIndex.
                    }
                }
            }

            /// <summary>
            /// Rebuilds the root index from the given child index folders and optimizes the result.
            /// Any existing root index is discarded.
            /// </summary>
            /// <param name="childDirs">Names of the child folders below the root index path. Null/empty is a no-op.</param>
            public void MergeIndex(string[] childDirs)
            {
                Console.WriteLine("MergeIndex Start");
                List<LuceneIO.Directory> sources = new List<LuceneIO.Directory>();
                try
                {
                    if (childDirs == null || childDirs.Length == 0) return;

                    string rootPath = StaticConstant.IndexPath;
                    foreach (string dir in childDirs)
                    {
                        sources.Add(LuceneIO.FSDirectory.Open(Directory.CreateDirectory(CombineIndexPath(rootPath, dir))));
                    }

                    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
                    DirectoryInfo dirInfo = Directory.CreateDirectory(rootPath);
                    using (LuceneIO.Directory directory = LuceneIO.FSDirectory.Open(dirInfo))
                    using (IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED)) // true: drop the existing root index
                    {
                        writer.MergeFactor = 100;      // how often segments are merged
                        writer.UseCompoundFile = true; // compound files keep the number of index files down
                        writer.AddIndexesNoOptimize(sources.ToArray());
                        // Optimize only after a successful merge. Doing it in a finally block would
                        // hide the original exception and could prevent the writer from being closed.
                        writer.Optimize();
                    }
                }
                finally
                {
                    foreach (LuceneIO.Directory source in sources)
                    {
                        source.Dispose();
                    }
                    Console.WriteLine("MergeIndex End");
                }
            }

            // Field option cheat sheet:
            // Field.Store.YES: store the original (un-tokenized) field value
            // Field.Store.NO: do not store the value; storing is independent of indexing
            // Field.Store.COMPRESS: store compressed; for long text or binary, at a performance cost
            // Field.Index.ANALYZED: tokenize and index
            // Field.Index.ANALYZED_NO_NORMS: tokenize and index without norms, which saves space
            // Field.Index.NOT_ANALYZED: index the value as a single term, without tokenizing
            // Field.Index.NOT_ANALYZED_NO_NORMS: index as a single term without norms
            // A TermVector describes the terms of one field of one document and how often each occurs
            // Field.TermVector.YES: store the term vector of this field for every document
            // Field.TermVector.NO: do not store term vectors
            // Field.TermVector.WITH_POSITIONS: also store positions
            // Field.TermVector.WITH_OFFSETS: also store offsets
            // Field.TermVector.WITH_POSITIONS_OFFSETS: store both positions and offsets
            #endregion Bulk BuildIndex / index merge

            #region Single/batch insert, delete, update
            /// <summary>
            /// Adds one commodity to the root index, creating the index when it does not exist yet.
            /// </summary>
            /// <param name="ci">Commodity to index; null is a no-op.</param>
            public void InsertIndex(Commodity ci)
            {
                if (ci == null) return;
                try
                {
                    using (LuceneIO.Directory directory = OpenRootDirectory())
                    using (IndexWriter writer = OpenRootWriter(directory, 100))
                    {
                        CreateCIIndex(writer, ci);
                    }
                }
                catch (Exception ex)
                {
                    logger.Error("InsertIndex异常", ex);
                    throw; // rethrow without resetting the stack trace
                }
            }

            /// <summary>
            /// Adds several commodities to the root index.
            /// </summary>
            /// <param name="ciList">Commodities to index; null/empty is a no-op.</param>
            public void InsertIndexMuti(List<Commodity> ciList)
            {
                BuildIndex(ciList, "", false);
            }

            /// <summary>
            /// Removes the documents of several commodities from the root index, matched by product id.
            /// </summary>
            /// <param name="ciList">Commodities whose documents are removed; null/empty is a no-op.</param>
            public void DeleteIndexMuti(List<Commodity> ciList)
            {
                if (ciList == null || ciList.Count == 0) return;
                DeleteByProductId(ciList);
            }

            /// <summary>
            /// Removes the documents of one commodity from the root index, matched by product id.
            /// </summary>
            /// <param name="ci">Commodity whose documents are removed; null is a no-op.</param>
            public void DeleteIndex(Commodity ci)
            {
                if (ci == null) return;
                DeleteByProductId(new Commodity[] { ci });
            }

            /// <summary>
            /// Replaces the document of one commodity (matched by product id) in the root index,
            /// adding it when it is not indexed yet.
            /// </summary>
            /// <param name="ci">Commodity to re-index; null is a no-op.</param>
            public void UpdateIndex(Commodity ci)
            {
                if (ci == null) return;
                UpdateByProductId(new Commodity[] { ci }, 100, "UpdateIndex异常");
            }

            /// <summary>
            /// Replaces the documents of several commodities (matched by product id) in the root index.
            /// </summary>
            /// <param name="ciList">Commodities to re-index; null/empty is a no-op.</param>
            public void UpdateIndexMuti(List<Commodity> ciList)
            {
                if (ciList == null || ciList.Count == 0) return;
                UpdateByProductId(ciList, 50, "UpdateIndexMuti异常");
            }
            #endregion Single/batch insert, delete, update

            #region PrivateMethod
            /// <summary>
            /// Joins the root index path and a child folder name with the platform's separator.
            /// Leading separators on the child are dropped so the result always stays below the root.
            /// </summary>
            private static string CombineIndexPath(string rootPath, string child)
            {
                return Path.Combine(rootPath, child.TrimStart('\\', '/'));
            }

            /// <summary>Opens the root index folder, creating it on disk when missing.</summary>
            private static LuceneIO.Directory OpenRootDirectory()
            {
                DirectoryInfo dirInfo = Directory.CreateDirectory(StaticConstant.IndexPath);
                return LuceneIO.FSDirectory.Open(dirInfo);
            }

            /// <summary>
            /// Opens a writer on <paramref name="directory"/> using the per-field analyzer, in create mode
            /// when the folder holds no index yet and in append mode otherwise.
            /// </summary>
            private IndexWriter OpenRootWriter(LuceneIO.Directory directory, int mergeFactor)
            {
                // Ask Lucene whether an index exists rather than counting files in the folder:
                // a stray file would otherwise force append mode on a folder that has no index.
                bool isCreate = !IndexReader.IndexExists(directory);
                IndexWriter writer = new IndexWriter(directory, CreateAnalyzerWrapper(), isCreate, IndexWriter.MaxFieldLength.LIMITED);
                try
                {
                    writer.MergeFactor = mergeFactor; // how often segments are merged
                    writer.UseCompoundFile = true;    // compound files keep the number of index files down
                    return writer;
                }
                catch
                {
                    writer.Dispose(); // do not leave the write lock behind
                    throw;
                }
            }

            /// <summary>Deletes, from the root index, every document whose "productid" matches one of the items.</summary>
            private void DeleteByProductId(IEnumerable<Commodity> items)
            {
                try
                {
                    using (LuceneIO.Directory directory = OpenRootDirectory())
                    using (IndexReader reader = IndexReader.Open(directory, false)) // false: not read-only, deletions allowed
                    {
                        foreach (Commodity ci in items)
                        {
                            reader.DeleteDocuments(new Term("productid", ci.ProductId.ToString()));
                        }
                    }
                }
                catch (Exception ex)
                {
                    logger.Error("DeleteIndex异常", ex);
                    throw; // rethrow without resetting the stack trace
                }
            }

            /// <summary>Replaces, in the root index, the document of each item (matched by "productid").</summary>
            private void UpdateByProductId(IEnumerable<Commodity> items, int mergeFactor, string errorMessage)
            {
                try
                {
                    using (LuceneIO.Directory directory = OpenRootDirectory())
                    using (IndexWriter writer = OpenRootWriter(directory, mergeFactor))
                    {
                        foreach (Commodity ci in items)
                        {
                            writer.UpdateDocument(new Term("productid", ci.ProductId.ToString()), ParseCItoDoc(ci));
                        }
                    }
                }
                catch (Exception ex)
                {
                    logger.Error(errorMessage, ex);
                    throw; // rethrow without resetting the stack trace
                }
            }

            /// <summary>
            /// Builds the analyzer used for root-index writes: PanGu for "title",
            /// the standard analyzer for "categoryid" and for every other field.
            /// </summary>
            /// <returns>A new per-field analyzer wrapper.</returns>
            private PerFieldAnalyzerWrapper CreateAnalyzerWrapper()
            {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

                PerFieldAnalyzerWrapper analyzerWrapper = new PerFieldAnalyzerWrapper(analyzer);
                analyzerWrapper.AddAnalyzer("title", new PanGuAnalyzer());
                analyzerWrapper.AddAnalyzer("categoryid", new StandardAnalyzer(Version.LUCENE_30));
                return analyzerWrapper;
            }

            /// <summary>
            /// Adds one commodity to the index through <paramref name="writer"/>; failures are logged and rethrown.
            /// </summary>
            /// <param name="writer">Open index writer.</param>
            /// <param name="ci">Commodity to index.</param>
            private void CreateCIIndex(IndexWriter writer, Commodity ci)
            {
                try
                {
                    writer.AddDocument(ParseCItoDoc(ci));
                }
                catch (Exception ex)
                {
                    logger.Error("CreateCIIndex异常", ex);
                    throw; // rethrow without resetting the stack trace
                }
            }

            /// <summary>
            /// Converts a commodity into an index document. Only "title" is tokenized; the other text
            /// fields are indexed as single terms and "price" is indexed as a numeric float.
            /// </summary>
            /// <param name="ci">Commodity to convert.</param>
            /// <returns>The document holding all stored fields of the commodity.</returns>
            private Document ParseCItoDoc(Commodity ci)
            {
                Document doc = new Document();

                // Field rejects null values, so missing text is stored as an empty string.
                doc.Add(new Field("id", ci.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("title", ci.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED)); // tokenized by PanGu
                doc.Add(new Field("productid", ci.ProductId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("categoryid", ci.CategoryId.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("imageurl", ci.ImageUrl ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new Field("url", ci.Url ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.Add(new NumericField("price", Field.Store.YES, true).SetFloatValue((float)ci.Price));
                return doc;
            }

            #endregion PrivateMethod
        }
    View Code

    分词器封装

        public class LuceneAnalyze : ILuceneAnalyze
        {
            private Logger logger = new Logger(typeof(LuceneAnalyze));
    
    
            //
            #region AnalyzerKey
            /// <summary>
            /// Splits a search keyword into terms with the PanGu analyzer, so that callers can
            /// join the terms with OR and match more documents ("greedy" search).
            /// </summary>
            /// <param name="keyword">Raw user keyword.</param>
            /// <returns>
            /// The analyzed terms; an empty array for a null/blank keyword; the keyword itself when the
            /// parsed query is of a kind this method does not break down.
            /// </returns>
            public string[] AnalyzerKey(string keyword)
            {
                if (string.IsNullOrWhiteSpace(keyword))
                {
                    // Nothing to analyze; the query parser cannot be handed null or blank input.
                    return new string[0];
                }

                Analyzer analyzer = new PanGuAnalyzer();
                QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
                Query query = parser.Parse(this.CleanKeyword(keyword));

                TermQuery termQuery = query as TermQuery;
                if (termQuery != null)
                {
                    return new string[] { termQuery.Term.Text };
                }

                PhraseQuery phraseQuery = query as PhraseQuery;
                if (phraseQuery != null)
                {
                    return phraseQuery.GetTerms().Select(t => t.Text).ToArray();
                }

                BooleanQuery booleanQuery = query as BooleanQuery; // and / or
                if (booleanQuery != null)
                {
                    List<string> analyzerWords = new List<string>();
                    foreach (BooleanClause clause in booleanQuery.GetClauses())
                    {
                        Query childQuery = clause.Query;

                        TermQuery childTerm = childQuery as TermQuery;
                        if (childTerm != null)
                        {
                            analyzerWords.Add(childTerm.Term.Text);
                            continue;
                        }

                        PhraseQuery childPhrase = childQuery as PhraseQuery;
                        if (childPhrase != null)
                        {
                            analyzerWords.AddRange(childPhrase.GetTerms().Select(t => t.Text));
                        }
                        // Other clause kinds are ignored.
                    }
                    return analyzerWords.ToArray();
                }

                // The literal braces must be doubled: an unescaped "{ keyword }" is not a valid
                // format item and makes string.Format throw FormatException.
                logger.Debug(string.Format("AnalyzerKey在解析keyword={0}的结果为new string[] {{ keyword }} ", keyword));
                return new string[] { keyword };
            }
    
            /// <summary>
            /// Neutralizes the query-parser operators AND / OR in a user keyword by lower-casing them
            /// wherever they stand as a whole whitespace-delimited word (start, middle or end), then
            /// escapes the remaining query-syntax characters.
            /// </summary>
            /// <param name="keyword">Raw user keyword; may be null.</param>
            /// <returns>The trimmed, escaped keyword; an empty string for null input.</returns>
            private string CleanKeyword(string keyword)
            {
                if (keyword == null)
                {
                    // QueryParser.Escape must not be handed null.
                    return string.Empty;
                }

                // (?<!\S) and (?!\S) anchor the match to whitespace or the string boundary, so only
                // stand-alone AND / OR words are changed; words that merely contain them
                // ("ANDROID", "ORDER") are left alone. One pass replaces every occurrence.
                string cleaned = System.Text.RegularExpressions.Regex.Replace(
                    keyword.Trim(),
                    @"(?<!\S)(?:AND|OR)(?!\S)",
                    m => m.Value.ToLowerInvariant());

                return QueryParser.Escape(cleaned);
            }
            #endregion AnalyzerKey
    View Code

     

    !

    • 作       者 : 明志德道
    • 希     望: 如果内容对您有用,动动小手点个赞吧,您的支持就是我持续写作的动力!
    • 声     明1 : 如有错误,欢迎讨论,请勿谩骂^_^。
    • 声     明2 : 原创博客请在转载时保留原文链接或在文章开头加上本人博客地址,否则保留追究法律责任的权利。
  • 相关阅读:
    ubuntu 制做samba
    《Programming WPF》翻译 第4章 前言
    《Programming WPF》翻译 第4章 3.绑定到数据列表
    《Programming WPF》翻译 第4章 4.数据源
    《Programming WPF》翻译 第5章 6.触发器
    《Programming WPF》翻译 第4章 2.数据绑定
    《Programming WPF》翻译 第4章 1.不使用数据绑定
    《Programming WPF》翻译 第5章 7.控件模板
    《Programming WPF》翻译 第5章 8.我们进行到哪里了?
    《Programming WPF》翻译 第5章 5.数据模板和样式
  • 原文地址:https://www.cnblogs.com/for-easy-fast/p/14319028.html
Copyright © 2011-2022 走看看