Lucene.Net.dll:用作全文索引
PanGu.dll(盘古分词):为 Lucene 提供中文分词功能
大致原理:
1.Lucene先根据PanGu将需要搜索的内容分隔、分词,然后根据分词的结果,做一个索引页。
2.搜索的时候,直接从索引页里面进行查找。
直接上代码:
分词演示代码:
/// <summary>
/// Tokenization demo: runs the text typed into <c>txtString</c> through the PanGu
/// analyzer and lists every resulting token in <c>ListBox1</c>.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // Start from an empty list on every click.
    ListBox1.Items.Clear();

    // Per the original author's note, StandardAnalyzer only copes with English text,
    // so the PanGu analyzer is used here to segment Chinese.
    Analyzer segmenter = new PanGuAnalyzer();
    StringReader input = new StringReader(txtString.Text);
    TokenStream stream = segmenter.TokenStream("", input);

    // Next() hands back the following token, or null once the stream is exhausted.
    for (Lucene.Net.Analysis.Token current = stream.Next(); current != null; current = stream.Next())
    {
        // TermText() is the text of the token that was split off.
        ListBox1.Items.Add(current.TermText());
    }
}
新建索引代码:演示了两种读取数据的方式
一:文本文件的查找
/// <summary>
/// Builds the full-text index from numbered text files (1000.txt through 1099.txt):
/// each file becomes one document with a stored, un-analyzed <c>number</c> field and a
/// stored, analyzed <c>body</c> field. Documents are appended when an index already
/// exists; otherwise a new index is created.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    // Original author's note: the path must match the casing of the folder on disk,
    // otherwise an error is raised.
    // NOTE(review): this literal looks like it lost a backslash ("C:\index") when the
    // article was extracted - confirm before running.
    string indexPath = @"C:index";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        bool isUpdate = IndexReader.IndexExists(directory);

        // Working rule from the original: only one piece of code touches the index at a
        // time. A lock left behind (e.g. by a crash while indexing) is therefore cleared
        // before a writer is opened.
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // IndexWriter writes documents into the index; the third argument asks for a
        // brand-new index only when none exists yet.
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            for (int i = 1000; i < 1100; i++)
            {
                // NOTE(review): this path also appears to have lost its backslashes during
                // extraction - confirm the real folder before running.
                string txt = System.IO.File.ReadAllText(@"D: et et代码搜索及分词文章" + i + ".txt");

                // A Document plays the role of one row in a table.
                Document document = new Document();
                document.Add(new Field("number", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("body", txt, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.AddDocument(document);
            }
        }
        finally
        {
            // Close even when a file read or AddDocument throws, so the writer does not
            // leave its lock on the index directory behind.
            writer.Close();
        }
    }
    finally
    {
        // Original author's note: without Close the indexed content cannot be found by searches.
        directory.Close();
    }

    this.ClientScript.RegisterStartupScript(typeof(indexPage), "alert", "alert('创建索引完成')", true);
}
二:数据库里面查找数据
/// <summary>
/// Builds the full-text index from the rows returned by <see cref="GetData"/>: each row
/// becomes one document with a stored, un-analyzed <c>ID</c> field and stored, analyzed
/// <c>Title</c> and <c>Contents</c> fields. Documents are appended when an index already
/// exists; otherwise a new index is created.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button3_Click(object sender, EventArgs e)
{
    // Original author's note: the path must match the casing of the folder on disk,
    // otherwise an error is raised.
    // NOTE(review): this literal appears to have lost its backslashes when the article
    // was extracted - confirm the real folder before running.
    string indexPath = @"D: et et代码搜索及分词index1";
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    try
    {
        bool isUpdate = IndexReader.IndexExists(directory);

        // Working rule from the original: only one piece of code touches the index at a
        // time. A lock left behind (e.g. by a crash while indexing) is therefore cleared
        // before a writer is opened.
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory);
        }

        // IndexWriter writes documents into the index; the third argument asks for a
        // brand-new index only when none exists yet.
        IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            List<Writings> list = GetData();
            foreach (Writings item in list)
            {
                // A Document plays the role of one row in a table.
                Document document = new Document();
                document.Add(new Field("ID", item.ID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));

                // Database NULLs arrive here as null strings; index them as empty text
                // because Field does not accept a null value.
                document.Add(new Field("Title", item.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                document.Add(new Field("Contents", item.Contents ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                writer.AddDocument(document);
            }
        }
        finally
        {
            // Close even when loading data or AddDocument throws, so the writer does not
            // leave its lock on the index directory behind.
            writer.Close();
        }
    }
    finally
    {
        // Original author's note: without Close the indexed content cannot be found by searches.
        directory.Close();
    }

    this.ClientScript.RegisterStartupScript(typeof(indexPage), "alert", "alert('创建索引完成')", true);
}

/// <summary>
/// Loads every row of <c>dbo.Writings</c> and maps it to a <see cref="Writings"/> object
/// by serializing the filled <see cref="DataTable"/> to JSON and deserializing it again.
/// </summary>
/// <returns>One <see cref="Writings"/> instance per row of the table.</returns>
private List<Writings> GetData()
{
    // NOTE(review): the connection string (including the sa password) is hard-coded;
    // it should come from configuration rather than source code.
    string conn = "server=.;user id=sa; pwd=123; database=SharesTradeNew";
    string sql = "SELECT * FROM dbo.Writings";

    // Both the adapter and the table are disposable; release them once the rows
    // have been converted.
    using (SqlDataAdapter da = new SqlDataAdapter(sql, conn))
    using (DataTable dt = new DataTable())
    {
        da.Fill(dt);
        string json = Newtonsoft.Json.JsonConvert.SerializeObject(dt);
        return Newtonsoft.Json.JsonConvert.DeserializeObject<List<Writings>>(json);
    }
}
} // closes the enclosing page class, whose header lies outside this snippet

/// <summary>
/// One row of the <c>dbo.Writings</c> table as used for indexing and for search results.
/// </summary>
public class Writings
{
    /// <summary>Row identifier; stored in the index as the un-analyzed <c>ID</c> field.</summary>
    public int ID { get; set; }

    /// <summary>Title text; stored in the index as the analyzed <c>Title</c> field.</summary>
    public string Title { get; set; }

    /// <summary>Body text; stored in the index as the analyzed <c>Contents</c> field.</summary>
    public string Contents { get; set; }
}
通过索引查找数据:
对应一:
/// <summary>
/// Searches the <c>body</c> field of the text-file index for the space-separated words
/// typed into <c>TextBox1</c> and binds the matching documents to <c>Repeater1</c>.
/// </summary>
/// <remarks>
/// The user does the word splitting: each space-separated piece (e.g. "计算机 专业")
/// becomes one term of a phrase query with a slop of 100.
/// NOTE(review): the words are not run through the analyzer, so they presumably have to
/// equal the tokens that were indexed - confirm.
/// </remarks>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button1_Click(object sender, EventArgs e)
{
    string kw = TextBox1.Text;
    List<SearchResult> list = new List<SearchResult>();

    // Drop the empty pieces produced by leading, trailing or repeated spaces; an empty
    // term can never match and would make the whole phrase fail.
    string[] words = kw.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // With nothing to search for, skip the index entirely and bind an empty result.
    if (words.Length > 0)
    {
        // NOTE(review): this literal looks like it lost a backslash ("c:\index") when the
        // article was extracted - confirm before running.
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"c:index"), new NoLockFactory());
        try
        {
            IndexReader reader = IndexReader.Open(directory, true);
            try
            {
                IndexSearcher searcher = new IndexSearcher(reader);
                try
                {
                    // Query condition: one term per word, all against the "body" field.
                    PhraseQuery query = new PhraseQuery();
                    foreach (string word in words)
                    {
                        query.Add(new Term("body", word));
                    }
                    query.SetSlop(100);

                    // Container that receives the search results.
                    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                    searcher.Search(query, null, collector);

                    // GetTotalHits() is the total number of hits for the query.
                    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                    for (int i = 0; i < docs.Length; i++)
                    {
                        // Document number assigned internally by Lucene.Net; unrelated to "number".
                        int docId = docs[i].doc;
                        Document doc = searcher.Doc(docId);

                        // Only fields indexed with Field.Store.YES can be read back.
                        SearchResult sr = new SearchResult();
                        sr.Body = doc.Get("body");
                        sr.Number = doc.Get("number");
                        list.Add(sr);
                    }
                }
                finally
                {
                    searcher.Close();
                }
            }
            finally
            {
                // Every field value has been copied into the list, so the reader can go.
                reader.Close();
            }
        }
        finally
        {
            directory.Close();
        }
    }

    Repeater1.DataSource = list;
    Repeater1.DataBind();
}
对应二:
/// <summary>
/// Searches the <c>Contents</c> field of the database-backed index for the space-separated
/// words typed into <c>TextBox3</c> and binds the matching rows to <c>Repeater3</c>.
/// </summary>
/// <remarks>
/// The user does the word splitting: each space-separated piece (e.g. "计算机 专业")
/// becomes one term of a phrase query with a slop of 100. Only <c>Contents</c> is
/// searched; the original left a <c>Title</c> term commented out.
/// NOTE(review): the words are not run through the analyzer, so they presumably have to
/// equal the tokens that were indexed - confirm.
/// </remarks>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Button3_Click(object sender, EventArgs e)
{
    string kw = TextBox3.Text;
    List<Writings> list = new List<Writings>();

    // Drop the empty pieces produced by leading, trailing or repeated spaces; an empty
    // term can never match and would make the whole phrase fail.
    string[] words = kw.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // With nothing to search for, skip the index entirely and bind an empty result.
    if (words.Length > 0)
    {
        // NOTE(review): this literal appears to have lost its backslashes when the article
        // was extracted - confirm the real folder before running.
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"D: et et代码搜索及分词index1"), new NoLockFactory());
        try
        {
            IndexReader reader = IndexReader.Open(directory, true);
            try
            {
                IndexSearcher searcher = new IndexSearcher(reader);
                try
                {
                    // Query condition: one term per word, all against the "Contents" field.
                    PhraseQuery query = new PhraseQuery();
                    foreach (string word in words)
                    {
                        query.Add(new Term("Contents", word));
                    }
                    query.SetSlop(100);

                    // Container that receives the search results.
                    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                    searcher.Search(query, null, collector);

                    // GetTotalHits() is the total number of hits for the query.
                    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                    for (int i = 0; i < docs.Length; i++)
                    {
                        // Document number assigned internally by Lucene.Net; unrelated to "ID".
                        int docId = docs[i].doc;
                        Document doc = searcher.Doc(docId);

                        // Only fields indexed with Field.Store.YES can be read back.
                        Writings sr = new Writings();
                        sr.ID = int.Parse(doc.Get("ID"));
                        sr.Title = doc.Get("Title");
                        sr.Contents = doc.Get("Contents");
                        list.Add(sr);
                    }
                }
                finally
                {
                    searcher.Close();
                }
            }
            finally
            {
                // Every field value has been copied into the list, so the reader can go.
                reader.Close();
            }
        }
        finally
        {
            directory.Close();
        }
    }

    Repeater3.DataSource = list;
    Repeater3.DataBind();
}