最近在做一个在线阅读网站的文本内容全文搜索功能(ASP.NET + SQL Server),当时有两个方案:1. SQL Server 2005 Full Text Search service;2. Lucene.Net。
Sql server的全文搜索索引管理比较方便,sqlserver 会自己管理索引的更新工作,也支持并发查询,但是遗憾的是没有HitHighlight功能,而且中文分词也做的比较瞎。
Lucene.NET的功能就比较全了,可惜自带的SimpleAnalyzer中文分词也比较瞎,就是把每个字拆开,还好有IkAnalyzer,分词效果还不错,中科院也有个分词系统(SharpICTCLAS),听说做得不错,可是索引更新以及并发查询就是问题了。
综合一下,采用折中方案,索引以及查询用sql server做,查出结果后,用Lucene.NET里的HitHighlight对结果做命中高亮显示。
-- SP to search on the full text index with paging.
代码
-- Searches the full-text index on Page.PageContent and returns one page of hits.
--
-- Parameters:
--   @SearchTerm  full-text search condition handed to CONTAINSTABLE.
--   @startIndex  1-based page number (despite its name it is multiplied by @pageSize).
--   @pageSize    number of rows per page.
--
-- Result sets:
--   1. One row holding the total number of hits with rank > 0.
--   2. The requested page of hits: id, pagenumber, pagecontent, rank, seq,
--      ordered by seq (best rank first).
CREATE PROCEDURE [dbo].[SearchOnPageContent](@SearchTerm nvarchar(200),@startIndex INT,@pageSize INT)
AS
    -- Result set 1: total hit count.
    -- 2052 is the LCID for Simplified Chinese. It must be introduced by the LANGUAGE
    -- keyword; a bare integer in this position is interpreted as top_n_by_rank and
    -- would silently cap the result at 2052 rows without selecting a language.
    SELECT
        COUNT([key])
    FROM
        CONTAINSTABLE(
            Page,
            PageContent,
            @SearchTerm,
            LANGUAGE 2052)
    WHERE [RANK] > 0;

    -- Result set 2: the requested page.
    -- The key is used as a tie-breaker so that rows with equal rank keep the same
    -- sequence number from one call to the next and paging stays stable.
    WITH tmp(id,pagenumber,pagecontent,[rank],seq) AS
    (
        SELECT
            T.[key] AS id,
            P.PageNumber,
            P.PageContent,
            T.[rank],
            ROW_NUMBER() OVER (ORDER BY T.[rank] DESC, T.[key])
        FROM
            CONTAINSTABLE(
                Page,
                PageContent,
                @SearchTerm,
                LANGUAGE 2052) AS T
            JOIN dbo.Page AS P ON P.id = T.[key]
        WHERE T.[rank] > 0
    )
    SELECT id, pagenumber, pagecontent, [rank], seq
    FROM tmp
    -- ROW_NUMBER() starts at 1, so page p covers rows (p-1)*size+1 .. p*size.
    -- Without the "+ 1" the last row of a page is repeated as the first row of the next.
    WHERE seq BETWEEN (@startIndex - 1) * @pageSize + 1 AND @startIndex * @pageSize
    ORDER BY seq;
GO
// Class to handle hithighlight
代码
using IKAnalyzerNet;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
namespace OnlineReader.HitHighlight
{
    /// <summary>
    /// Uses the Lucene.Net highlighter component to mark the terms of a search query
    /// that occur in a piece of text with <c>&lt;span class='hit'&gt;</c> markup.
    /// </summary>
    public class HitHighlighter
    {
        // Target size handed to SimpleFragmenter for the returned fragment.
        private const int FragmentSize = 400;

        // Hard upper bound on the length of the returned string.
        private const int MaxResultLength = 3000;

        // Shared analyzer, used both for parsing the query and for tokenizing the text.
        // NOTE(review): assumes IKAnalyzer can be used from concurrent requests - confirm.
        private static readonly IKAnalyzer analyzer = new IKAnalyzer();

        /// <summary>
        /// Returns the best-matching fragment of <paramref name="hitString"/> with every
        /// term of <paramref name="searchTerm"/> wrapped in a highlighting span.
        /// </summary>
        /// <param name="hitString">The text that was found by the search.</param>
        /// <param name="searchTerm">The query text, parsed with the Lucene query parser.</param>
        /// <returns>
        /// The highlighted fragment; the unmodified <paramref name="hitString"/> when there
        /// is no search term or the highlighter produces no output; an empty string when
        /// <paramref name="hitString"/> is null or empty. The result is never longer than
        /// 3000 characters.
        /// </returns>
        /// <remarks>
        /// The query parser is expected to throw for search terms it cannot parse;
        /// that exception is not caught here.
        /// </remarks>
        public string HitHighlight(string hitString, string searchTerm)
        {
            if (string.IsNullOrEmpty(hitString))
            {
                return string.Empty;
            }

            string highlightedText = string.Empty;

            if (searchTerm != null && searchTerm.Trim().Length > 0)
            {
                // QueryParser is documented as not thread-safe, so a fresh instance is
                // created per call instead of sharing a static one between requests.
                // The parsed query is kept in a local for the same reason.
                QueryParser parser = new QueryParser("text", analyzer);
                Query query = parser.Parse(searchTerm);

                Lucene.Net.Highlight.Formatter formatter =
                    new Lucene.Net.Highlight.SimpleHTMLFormatter("<span class='hit'> ", "</span>");
                Lucene.Net.Highlight.QueryScorer scorer = new Lucene.Net.Highlight.QueryScorer(query);
                Lucene.Net.Highlight.Highlighter highlighter =
                    new Lucene.Net.Highlight.Highlighter(formatter, scorer);
                highlighter.SetTextFragmenter(new Lucene.Net.Highlight.SimpleFragmenter(FragmentSize));

                Lucene.Net.Analysis.TokenStream stream =
                    analyzer.TokenStream("", new System.IO.StringReader(hitString));
                highlightedText = highlighter.GetBestFragments(stream, hitString, 1, "...");
            }

            // Sometimes the highlighter fails to emit any text; fall back to the plain
            // input. The text is returned as-is: doubling apostrophes (SQL-style escaping)
            // would corrupt text that is meant for display and would be inconsistent with
            // the highlighted path, which does not escape them either.
            if (string.IsNullOrEmpty(highlightedText))
            {
                highlightedText = hitString;
            }

            if (highlightedText.Length > MaxResultLength)
            {
                highlightedText = highlightedText.Substring(0, MaxResultLength);
            }

            return highlightedText;
        }
    }
}