zoukankan      html  css  js  c++  java
  • LuceneNet的使用

    先建立索引,再查询,速度很快。

    建立索引花费的时间比较多,但还可以接受:200M 的文本需要 1 分多钟,40G 的文本需要 4 个多小时。

    刚开始我用的是 2.9 版本,并选择将文本也保存在索引中,结果索引占据的空间是原先文本的 2 倍多。

    而且发现,查询邮箱地址时如果关键字不带 @ 及其后面的部分(只输入 @ 前面的用户名),就无法查询出来。也许是所用分词器的关系,我也不知道怎样才可以自定义分隔符。

    后来改用了4.8版本,索引的空间只比原先的文本大一点点,而且不带@的关键字也可以查询出来。

    但还是有个问题,查询出来的内容中文是乱码,也无法用中文查询。

    无论是 NLuke 还是 Luke , 都没法成功打开索引文件。

    IndexWriter 构造函数有另一个重载,其第三个参数为 bool:为 true 表示不存在就创建、存在就覆盖;为 false 表示不存在就出错、存在就追加。这个不方便,因为我们需要的是:不存在就创建、存在就追加。怎样才能实现这个功能呢?省略掉这个 bool 参数就可以了(下面 4.8 版本的代码使用 IndexWriterConfig,其默认打开模式 OpenMode.CREATE_OR_APPEND 正是“不存在就创建、存在就追加”)。
     
    // LINQPad entry point: runs one keyword query against an existing index and dumps the hits.
    void Main()
    {
        // NOTE(review): the published listing had lost the backslashes in these two paths
        // (a "\t" had even turned into whitespace). They are restored here on the assumption
        // that the originals were ordinary Windows paths - confirm against your own layout.
        string idxpath = @"D:\data\DB\txt\index";
        string dir = @"D:\data\DB\txt\search\tianya";

        // TODO: work out how to tokenize on '@' so e-mail addresses can be matched by part.
        string keyword = "zhaoshu0997";

        Utils.FullSearch.FileHelper filehelper = new Utils.FullSearch.FileHelper(idxpath);

        // Uncomment to (re)build the index from the text files under `dir` before searching.
        //filehelper.BuildIndex(dir);

        Utils.FullSearch.SearchResults results = filehelper.Search(keyword);
        Console.WriteLine(keyword);
        results.Dump();
    }

    namespace Utils.FullSearch
    {
        /// <summary>
        /// Outcome of one query: the total number of matching documents and the
        /// (at most one page of) hits that were actually loaded.
        /// </summary>
        public class SearchResults
        {
            /// <summary>Total number of documents matching the query, as reported by the searcher.</summary>
            public int TotalHits { get; set; }

            /// <summary>The loaded hits, in the order the searcher returned them.</summary>
            public List<Hit> SearchContents { get; set; }
        }

        /// <summary>A single matching document.</summary>
        public class Hit
        {
            /// <summary>Relevance score assigned by the searcher.</summary>
            public float Score { get; set; }

            /// <summary>Stored value of the content field, or null if the document has no such field.</summary>
            public string Content { get; set; }
        }

        /// <summary>
        /// Builds a Lucene index in which every line of every file under a folder is one
        /// document, and runs keyword queries against that index.
        /// </summary>
        public class FileHelper
        {
            private const LuceneVersion MatchVersion = LuceneVersion.LUCENE_48;
            private const string FieldName = "content";
            private const int ResultsPerPage = 10;

            private readonly StandardAnalyzer analyzer;
            private readonly QueryParser queryParser;
            private readonly string _indexPath;

            private StandardAnalyzer SetupAnalyzer() => new StandardAnalyzer(MatchVersion);

            private QueryParser SetupQueryParser(StandardAnalyzer analyzer) =>
                new QueryParser(MatchVersion, FieldName, analyzer);

            /// <summary>Creates a helper bound to the index stored at <paramref name="indexPath"/>.</summary>
            /// <param name="indexPath">File-system folder that holds (or will hold) the index.</param>
            public FileHelper(string indexPath)
            {
                analyzer = SetupAnalyzer();
                queryParser = SetupQueryParser(analyzer);
                _indexPath = indexPath;
            }

            /// <summary>
            /// Indexes every line of every file found by <see cref="FindFile"/> under
            /// <paramref name="dir"/> as a separate stored document, then commits.
            /// The index is opened with the default IndexWriterConfig, i.e. no explicit open mode.
            /// </summary>
            /// <param name="dir">Root folder of the text files to index.</param>
            /// <param name="encoding">
            /// Encoding of the source files; UTF-8 when omitted (the previous fixed behavior).
            /// NOTE(review): garbled non-ASCII results may mean the files are not UTF-8 - confirm
            /// and pass the real encoding here.
            /// </param>
            public void BuildIndex(string dir, Encoding encoding = null)
            {
                Encoding sourceEncoding = encoding ?? Encoding.UTF8;
                var watch = Stopwatch.StartNew();
                List<string> fpaths = FindFile(dir);

                // `using` guarantees the directory and the writer are released even when
                // reading a file or adding a document throws.
                using (FSDirectory indexDirectory = FSDirectory.Open(_indexPath))
                using (var writer = new IndexWriter(indexDirectory, new IndexWriterConfig(MatchVersion, analyzer)))
                {
                    foreach (string fpath in fpaths)
                    {
                        // Stream the file line by line instead of loading it whole; the input
                        // files can be very large.
                        foreach (string content in File.ReadLines(fpath, sourceEncoding))
                        {
                            var doc = new Document
                            {
                                new TextField(FieldName, content, Field.Store.YES)
                            };
                            writer.AddDocument(doc);
                        }

                        // The stopwatch is never reset, so this is the cumulative time so far.
                        ($"index time for {fpath}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
                    }

                    writer.Commit();
                }

                watch.Stop();
                ($"index time for {dir}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
            }

            /// <summary>
            /// Lists the full paths of all files under <paramref name="sSourcePath"/>: first the
            /// files directly in the folder, then, for each immediate sub-folder, all files
            /// below it at any depth.
            /// </summary>
            /// <param name="sSourcePath">Folder to scan.</param>
            /// <returns>Full path of every file found.</returns>
            public static List<string> FindFile(string sSourcePath)
            {
                var list = new List<string>();
                var theFolder = new DirectoryInfo(sSourcePath);

                // Files directly inside the root folder.
                foreach (FileInfo nextFile in theFolder.GetFiles("*.*", SearchOption.TopDirectoryOnly))
                {
                    list.Add(nextFile.FullName);
                }

                // Files anywhere below each immediate sub-folder.
                foreach (DirectoryInfo nextFolder in theFolder.GetDirectories())
                {
                    foreach (FileInfo nextFile in nextFolder.GetFiles("*.*", SearchOption.AllDirectories))
                    {
                        list.Add(nextFile.FullName);
                    }
                }

                return list;
            }

            /// <summary>
            /// Parses <paramref name="queryString"/> with the query parser and returns up to one
            /// page of the best-scoring documents.
            /// </summary>
            /// <param name="queryString">Query in the syntax accepted by the query parser.</param>
            /// <returns>
            /// The total hit count and the loaded hits; an empty result when the index folder
            /// does not contain an index yet.
            /// </returns>
            public SearchResults Search(string queryString)
            {
                var watch = Stopwatch.StartNew();
                Query query = queryParser.Parse(queryString);

                var searchResults = new SearchResults
                {
                    TotalHits = 0,
                    SearchContents = new List<Hit>()
                };

                // Searching only needs read access. Opening a reader (instead of an IndexWriter)
                // avoids taking the index write lock, and `using` releases every handle so that
                // repeated searches and later indexing runs are not blocked.
                using (FSDirectory indexDirectory = FSDirectory.Open(_indexPath))
                {
                    if (DirectoryReader.IndexExists(indexDirectory))
                    {
                        using (DirectoryReader reader = DirectoryReader.Open(indexDirectory))
                        {
                            var searcher = new IndexSearcher(reader);
                            TopDocs topDocs = searcher.Search(query, ResultsPerPage);
                            searchResults.TotalHits = topDocs.TotalHits;

                            foreach (var result in topDocs.ScoreDocs)
                            {
                                Document document = searcher.Doc(result.Doc);
                                searchResults.SearchContents.Add(new Hit
                                {
                                    Score = result.Score,
                                    Content = document.GetField(FieldName)?.GetStringValue()
                                });
                            }
                        }
                    }
                }

                ($"search time for {queryString}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
                return searchResults;
            }
        }
    }

      

  • 相关阅读:
    最大并发连接数和最大会话数的区别
    Redis Sentinel 情况下bind地址设置
    ZooKeeper 授权验证
    推荐一个zookeeper信息查看工具
    WebForm-博客园-6.0-空间(Space)-短信息(Msg)
    ylbtech-cnblogs(博客园)-数据库设计-6.0-Msg(短消息)
    WebForm-博客园-1.0-账户模块(Passport)-登录与注册
    WebForm+Web.config: 超时时间已到。在操作完成之前超时时间已过或服务器未响应。
    ylbtech-cnblogs(博客园)-数据库设计-1,Passport(账户)
    IIS 配置
  • 原文地址:https://www.cnblogs.com/sui84/p/12594735.html
Copyright © 2011-2022 走看看