zoukankan html css js c++ java

LuceneNet的使用

先建立索引，再查询，速度很快。

建立索引花费的时间比较多，但还可以接受：200M 的文本需要 1 分多钟，40G 的文本需要 4 个多小时。

刚开始我用的是 2.9 版本，并选择将文本也保存在索引中，结果索引占据的空间是原先文本的 2 倍多。

而且发现，查询文本里的邮箱地址时，如果关键字不带 @ 及其后面的部分，就无法查询出来；也许是所用分词器的关系，我也不知道怎样才可以自定义分隔符。

后来改用了4.8版本，索引的空间只比原先的文本大一点点，而且不带@的关键字也可以查询出来。

但还是有个问题，查询出来的内容中文是乱码，也无法用中文查询。

无论是 NLuke 还是 Luke ，都没法成功打开索引文件。

IndexWriter 构造函数使用了另一个重载，即第三个参数为 bool：如果为 true，表示不存在就创建、存在就覆盖；如果为 false，表示不存在就出错、存在就追加。这个不方便，因为我们需要的是：不存在就创建、存在就追加。怎样才能实现这个功能呢？省略掉这个 bool 参数（下面的代码就是不带该参数的写法），就实现这个功能了。

// LINQPad entry point: opens the index at idxpath through FileHelper, runs a
// single keyword search against it and dumps the keyword and the results.
void Main()
{
 // NOTE(review): the published listing had lost the backslashes of these
 // verbatim strings ("\t" had even turned into a literal TAB character), so
 // the paths were not valid. They are restored here as a best guess; adjust
 // them to the real locations on your machine.
 string idxpath = @"D:\dataDB\txtindex";
 string dir = @"D:\dataDB\txtsearch\tianya";
 // TODO: how can the analyzer be made to split tokens on '@'?
 string keyword = "zhaoshu0997";
 Utils.FullSearch.FileHelper filehelper = new Utils.FullSearch.FileHelper(idxpath);
 // Uncomment to index every text file under dir before searching.
 //filehelper.BuildIndex(dir);
 Utils.FullSearch.SearchResults results = filehelper.Search(keyword);
 Console.WriteLine(keyword);
 results.Dump();
}

namespace Utils.FullSearch
{
	/// <summary>
	/// Result of a single search: the total hit count together with the hits
	/// that were actually retrieved.
	/// </summary>
	public class SearchResults
	{
		/// <summary>Total number of hits reported for the query.</summary>
		public int TotalHits { get; set; }

		/// <summary>The retrieved hits; not initialised by this class, so it is null until assigned.</summary>
		public List<Hit> SearchContents { get; set; }
	}
	
	/// <summary>
	/// One search hit: the relevance score and the stored text of the matching document.
	/// </summary>
	public class Hit
	{
		/// <summary>Score assigned to the hit by the search.</summary>
		public float Score { get; set; }

		/// <summary>Stored text of the matching document; may be null.</summary>
		public string Content { get; set; }
	}
	
	/// <summary>
	/// Builds and queries a Lucene.Net full-text index kept in a file-system directory.
	/// Each line of each indexed text file becomes one document with a single stored
	/// "content" field, analysed with <c>StandardAnalyzer</c>.
	/// </summary>
	public class FileHelper
	{
		private const LuceneVersion MATCH_LUCENE_VERSION = LuceneVersion.LUCENE_48;
		private const string Field_Name = "content";
		private const int Results_Per_Page = 10;

		// Assigned once in the constructor and shared by indexing and searching.
		private readonly StandardAnalyzer analyzer;
		private readonly QueryParser queryParser;
		private readonly string _indexPath;

		private StandardAnalyzer SetupAnalyzer() => new StandardAnalyzer(MATCH_LUCENE_VERSION);
		private QueryParser SetupQueryParser(StandardAnalyzer analyzer) => new QueryParser(MATCH_LUCENE_VERSION, Field_Name, analyzer);

		/// <summary>Creates a helper bound to the index directory <paramref name="indexPath"/>.</summary>
		/// <param name="indexPath">Directory that holds (or will hold) the index files.</param>
		/// <exception cref="ArgumentException"><paramref name="indexPath"/> is null, empty or whitespace.</exception>
		public FileHelper(string indexPath)
		{
			// Fail here rather than later, deep inside FSDirectory.Open.
			if (string.IsNullOrWhiteSpace(indexPath))
			{
				throw new ArgumentException("Index path must not be null or empty.", nameof(indexPath));
			}

			analyzer = SetupAnalyzer();
			queryParser = SetupQueryParser(analyzer);
			_indexPath = indexPath;
		}

		/// <summary>
		/// Indexes every file found under <paramref name="dir"/> (see <see cref="FindFile"/>),
		/// adding one document per text line, and dumps the elapsed time after each file and
		/// at the end.
		/// </summary>
		/// <param name="dir">Root directory of the text files to index.</param>
		public void BuildIndex(string dir)
		{
			var watch = Stopwatch.StartNew();
			List<string> fpaths = FindFile(dir);

			// The using blocks guarantee that the writer (and with it the index write lock)
			// and the directory are released even when reading or adding a file throws.
			using (FSDirectory indexDir = FSDirectory.Open(_indexPath))
			using (IndexWriter writer = new IndexWriter(indexDir, new IndexWriterConfig(MATCH_LUCENE_VERSION, analyzer)))
			{
				foreach (string fpath in fpaths)
				{
					// Stream the lines instead of loading the whole file: inputs can be many GB.
					// NOTE(review): files are decoded as UTF-8; text saved in another encoding
					// (e.g. GBK) would be indexed garbled - confirm the encoding of the inputs.
					foreach (string content in File.ReadLines(fpath, Encoding.UTF8))
					{
						Document doc = new Document
						{
							new TextField(Field_Name, content, Field.Store.YES)
						};
						writer.AddDocument(doc);
					}
					// The stopwatch is never restarted, so this is the cumulative time so far.
					($"index time for {fpath}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
				}

				// Commit flushes pending documents itself; no separate Flush call is needed.
				writer.Commit();
			}

			watch.Stop();
			($"index time for {dir}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
		}

		/// <summary>
		/// Lists the full paths of the files directly inside <paramref name="sSourcePath"/>,
		/// followed by the files found recursively inside each of its sub-directories.
		/// </summary>
		/// <param name="sSourcePath">Directory to scan.</param>
		/// <returns>Full path of every file found.</returns>
		public static List<string> FindFile(string sSourcePath)
		{
			List<string> list = new List<string>();
			DirectoryInfo theFolder = new DirectoryInfo(sSourcePath);

			// Files located directly in the root folder.
			foreach (FileInfo nextFile in theFolder.GetFiles("*.*", SearchOption.TopDirectoryOnly))
			{
				list.Add(nextFile.FullName);
			}

			// Files at any depth below each immediate sub-folder.
			foreach (DirectoryInfo nextFolder in theFolder.GetDirectories())
			{
				foreach (FileInfo nextFile in nextFolder.GetFiles("*.*", SearchOption.AllDirectories))
				{
					list.Add(nextFile.FullName);
				}
			}

			return list;
		}

		/// <summary>
		/// Parses <paramref name="queryString"/> with the query parser, searches the index and
		/// returns the total hit count plus the stored content of at most
		/// <c>Results_Per_Page</c> (10) top-scoring documents. Dumps the elapsed search time.
		/// </summary>
		/// <param name="queryString">Query text handed to <c>QueryParser.Parse</c>.</param>
		/// <returns>Total hit count and the retrieved hits.</returns>
		public SearchResults Search(string queryString)
		{
			var watch = Stopwatch.StartNew();
			Query query = queryParser.Parse(queryString);

			// The directory, writer and searcher manager were previously never disposed, which
			// kept the index write lock and file handles open after every search.
			// NOTE(review): a read-only DirectoryReader would avoid opening a writer for
			// searching at all; kept writer-based here to preserve the existing behaviour.
			using (FSDirectory indexDir = FSDirectory.Open(_indexPath))
			using (IndexWriter writer = new IndexWriter(indexDir, new IndexWriterConfig(MATCH_LUCENE_VERSION, analyzer)))
			using (SearcherManager searchManager = new SearcherManager(writer, true, null))
			{
				searchManager.MaybeRefreshBlocking();
				IndexSearcher searcher = searchManager.Acquire();

				try
				{
					TopDocs topDocs = searcher.Search(query, Results_Per_Page);
					SearchResults searchResults = new SearchResults() { TotalHits = topDocs.TotalHits, SearchContents = new List<Hit>() };
					foreach (var result in topDocs.ScoreDocs)
					{
						Document document = searcher.Doc(result.Doc);
						Hit searchResult = new Hit
						{
							Score = result.Score,
							Content = document.GetField(Field_Name)?.GetStringValue()
						};
						searchResults.SearchContents.Add(searchResult);
					}
					($"search time for {queryString}:{watch.ElapsedMilliseconds/1000.0}second").Dump();
					return searchResults;
				}
				finally
				{
					// Every Acquire must be paired with a Release on the same manager.
					searchManager.Release(searcher);
				}
			}
		}
	}
}

查看全文

相关阅读:
如何组建开发团队-建立畅通的沟通渠道
 如何组建开发团队-激励团队气势
 文件夹名与类名一致造成的命名空间无法识别的问题
 SQL Server 还原错误“restore database正在异常终止错误 3154”
SQL Server 服务无法启动，错误1069解决办法
 无法升级数据库....因为此版本的 SQL Server 不支持该数据库的非发布版本(539) 解决方案
 图解Microsoft SQL Server——“远程过程调用失败 [0x800706be] 错误“。
Eclipse: Android Device Chooser
部署网站时的错误“one of its dependencies.试图加载格式不正确的程序。”解决方案。
把char赋值到string里面

原文地址：https://www.cnblogs.com/sui84/p/12594735.html