zoukankan html css js c++ java

Lucene.Net和盘古分词应用

Lucene.Net.dll：用做全文索引

PanGu.dll(盘古分词)：作为中文分词的条件

大致原理：

1.Lucene先根据PanGu将需要搜索的内容分隔、分词，然后根据分词的结果，做一个索引页。

2.搜索的时候，直接从索引页里面进行查找。

直接上代码：

分词演示代码：

 /// <summary>
 /// Tokenization demo: segments the text entered in <c>txtString</c> with the
 /// PanGu analyzer and lists every resulting token in <c>ListBox1</c>.
 /// </summary>
 protected void Button1_Click(object sender, EventArgs e)
        {
            ListBox1.Items.Clear();

            // Author's note: the standard analyzer only handles English, not Chinese.
            //Analyzer analyzer = new StandardAnalyzer();

            // PanGu word segmentation.
            Analyzer segmenter = new PanGuAnalyzer();
            StringReader input = new StringReader(txtString.Text);
            TokenStream stream = segmenter.TokenStream("", input);

            // Next() hands back the following token; the loop stops when it returns null.
            for (Lucene.Net.Analysis.Token current = stream.Next(); current != null; current = stream.Next())
            {
                // TermText() is the text of the token that was split out.
                ListBox1.Items.Add(current.TermText());
            }
        }

View Code

新建索引代码：演示了两种读取数据的方式

一：文本文件的查找

/// <summary>
/// Creates the index (or appends to an existing one) from the numbered text
/// files 1000..1099, storing each file as a document with a "number" field and
/// an analyzed "body" field, then emits a client-side alert.
/// </summary>
/// <remarks>
/// The writer and the directory are closed in <c>finally</c> blocks so that a
/// failure while reading a file or adding a document does not leave the index
/// write lock held. Documents added before a failure are still committed by
/// <c>writer.Close()</c>.
/// </remarks>
protected void Button1_Click(object sender, EventArgs e)
        {
            // Author's note: must match the casing of the folder on disk, otherwise an error is raised.
            // NOTE(review): this literal looks like it lost a backslash during extraction — confirm the
            // intended path; it must stay identical to the path used by the matching search handler.
            string indexPath = @"C:index";
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
            try
            {
                bool isUpdate = IndexReader.IndexExists(directory);

                // Working rule for now: only one piece of code operates on the index at a time.
                // If the index directory is still locked (e.g. the process died while indexing), unlock it first.
                if (isUpdate && IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }

                // IndexWriter writes data into the index; "create" is true only when no index exists yet.
                IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
                try
                {
                    for (int i = 1000; i < 1100; i++)
                    {
                        // NOTE(review): the path literal below contains embedded line breaks and appears to
                        // have lost its backslashes during extraction — confirm the real folder before running.
                        string txt =System.IO.File.ReadAllText(@"D:
et
et代码搜索及分词文章" + i + ".txt");
                        Document document = new Document();// One document is comparable to one table row.
                        document.Add(new Field("number", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                        document.Add(new Field("body", txt, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                        writer.AddDocument(document);
                    }
                }
                finally
                {
                    // Always close the writer so the write lock is released even on failure.
                    writer.Close();
                }
            }
            finally
            {
                // Author's note: do not forget Close, otherwise the indexed data cannot be searched.
                directory.Close();
            }

            this.ClientScript.RegisterStartupScript(typeof(indexPage),
                "alert", "alert('创建索引完成')", true);
        }

View Code

二：数据库里面查找数据

 /// <summary>
 /// Creates the index (or appends to an existing one) from the rows returned by
 /// <c>GetData()</c>, storing "ID" (not analyzed), "Title" and "Contents"
 /// (both analyzed), then emits a client-side alert.
 /// </summary>
 /// <remarks>
 /// The writer and the directory are closed in <c>finally</c> blocks so a failure
 /// part-way through does not leave the index write lock held. Null titles or
 /// contents are indexed as empty strings because <c>Field</c> rejects null values.
 /// </remarks>
 protected void Button3_Click(object sender, EventArgs e)
        {
            // Author's note: must match the casing of the folder on disk, otherwise an error is raised.
            // NOTE(review): the path literal below contains embedded line breaks and appears to have lost
            // its backslashes during extraction — confirm the real folder; it must stay identical to the
            // path used by the matching search handler.
            string indexPath = @"D:
et
et代码搜索及分词index1";
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
            try
            {
                bool isUpdate = IndexReader.IndexExists(directory);

                // Working rule for now: only one piece of code operates on the index at a time.
                // If the index directory is still locked (e.g. the process died while indexing), unlock it first.
                if (isUpdate && IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }

                // IndexWriter writes data into the index; "create" is true only when no index exists yet.
                IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
                try
                {
                    List<Writings> list = GetData();
                    foreach (Writings item in list)
                    {
                        Document document = new Document();// One document is comparable to one table row.
                        document.Add(new Field("ID",item.ID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                        // Coalesce nulls: a null column value would otherwise make the Field constructor throw.
                        document.Add(new Field("Title", item.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                        document.Add(new Field("Contents", item.Contents ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                        writer.AddDocument(document);
                    }
                }
                finally
                {
                    // Always close the writer so the write lock is released even on failure.
                    writer.Close();
                }
            }
            finally
            {
                // Author's note: do not forget Close, otherwise the indexed data cannot be searched.
                directory.Close();
            }

            this.ClientScript.RegisterStartupScript(typeof(indexPage),
                "alert", "alert('创建索引完成')", true);
        }

        /// <summary>
        /// Reads every row of <c>dbo.Writings</c> into a <see cref="DataTable"/> and converts it
        /// to a list of <see cref="Writings"/> by serializing the table to JSON and deserializing it.
        /// </summary>
        /// <returns>The rows of the table mapped onto <see cref="Writings"/> objects.</returns>
        private List<Writings> GetData()
        {
            // NOTE(review): credentials are hard-coded in source; consider moving the connection
            // string to configuration. The literal is left unchanged here.
            string conn = "server=.;user id=sa; pwd=123; database=SharesTradeNew";
            string sql = "SELECT * FROM dbo.Writings";

            // Dispose the adapter and the table deterministically instead of leaving them to the GC.
            using (SqlDataAdapter da = new SqlDataAdapter(sql, conn))
            using (DataTable dt = new DataTable())
            {
                da.Fill(dt);
                string json = Newtonsoft.Json.JsonConvert.SerializeObject(dt);
                return Newtonsoft.Json.JsonConvert.DeserializeObject<List<Writings>>(json);
            }
        }
    }

    /// <summary>
    /// Plain data holder for one article row; its properties carry the values
    /// that are indexed into and read back from the "ID", "Title" and
    /// "Contents" index fields.
    /// </summary>
    public class Writings
    {
        /// <summary>Numeric identifier of the article.</summary>
        public int ID { get; set; }

        /// <summary>Article title.</summary>
        public string Title { get; set; }

        /// <summary>Article body text.</summary>
        public string Contents { get; set; }
    }

View Code

通过索引查找数据：

对应一：

/// <summary>
/// Searches the "body" field of the text-file index for the space-separated
/// keywords typed into <c>TextBox1</c> and binds the matching documents
/// (number + body) to <c>Repeater1</c>.
/// </summary>
/// <remarks>
/// Consecutive spaces between keywords are ignored, so they no longer add empty
/// terms to the phrase query. The reader, searcher and directory are closed in
/// a <c>finally</c> block.
/// </remarks>
protected void Button1_Click(object sender, EventArgs e)
        {
            // The user separates keywords with spaces; several spaces in a row are allowed.
            string kw = TextBox1.Text;
            List<SearchResult> list = new List<SearchResult>();

            // NOTE(review): this literal looks like it lost a backslash during extraction — confirm the
            // intended path; it must stay identical to the path used by the matching indexing handler.
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"c:index"), new NoLockFactory());
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try
            {
                reader = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                PhraseQuery query = new PhraseQuery();// The search condition.
                // For now the user does the word splitting: every space-separated piece is one term.
                // RemoveEmptyEntries drops the empty strings produced by repeated spaces.
                foreach (string word in kw.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    query.Add(new Term("body", word));
                }
                // Author's analogy: where Contains("body", word1) and Contains("body", word2)

                query.SetSlop(100);
                // Container for the hits; it was created to keep at most 1000 of them.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);// Run the query and put the hits into the collector.

                // collector.GetTotalHits() is the total number of hits.
                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                for (int i = 0; i < docs.Length; i++)
                {
                    int docId = docs[i].doc;// Document number assigned internally by Lucene.Net; unrelated to "number".
                    Document doc = searcher.Doc(docId);// Fetch the document by its document number.

                    // Author's note: only fields indexed with Field.Store.YES can be read back.
                    SearchResult sr = new SearchResult();
                    sr.Body = doc.Get("body");
                    sr.Number = doc.Get("number");

                    list.Add(sr);
                }
            }
            finally
            {
                // Release the index files whether or not the search succeeded.
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                directory.Close();
            }

            Repeater1.DataSource = list;
            Repeater1.DataBind();
        }

View Code

对应二：

/// <summary>
/// Searches the "Contents" field of the database-backed index for the
/// space-separated keywords typed into <c>TextBox3</c> and binds the matching
/// documents (ID, Title, Contents) to <c>Repeater3</c>.
/// </summary>
/// <remarks>
/// Consecutive spaces between keywords are ignored, so they no longer add empty
/// terms to the phrase query. The reader, searcher and directory are closed in
/// a <c>finally</c> block.
/// </remarks>
protected void Button3_Click(object sender, EventArgs e)
        {
            // The user separates keywords with spaces; several spaces in a row are allowed.
            string kw = TextBox3.Text;
            List<Writings> list = new List<Writings>();

            // NOTE(review): the path literal below contains embedded line breaks and appears to have lost
            // its backslashes during extraction — confirm the real folder; it must stay identical to the
            // path used by the matching indexing handler.
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"D:
et
et代码搜索及分词index1"), new NoLockFactory());
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try
            {
                reader = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                PhraseQuery query = new PhraseQuery();// The search condition.
                // For now the user does the word splitting: every space-separated piece is one term.
                // RemoveEmptyEntries drops the empty strings produced by repeated spaces.
                foreach (string word in kw.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    query.Add(new Term("Contents", word));
                    //query.Add(new Term("Title", word));
                }
                // Author's analogy: where Contains("Contents", word1) and Contains("Contents", word2)

                query.SetSlop(100);
                // Container for the hits; it was created to keep at most 1000 of them.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);// Run the query and put the hits into the collector.

                // collector.GetTotalHits() is the total number of hits.
                ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                for (int i = 0; i < docs.Length; i++)
                {
                    int docId = docs[i].doc;// Document number assigned internally by Lucene.Net; unrelated to "ID".
                    Document doc = searcher.Doc(docId);// Fetch the document by its document number.

                    // Author's note: only fields indexed with Field.Store.YES can be read back.
                    Writings sr = new Writings();
                    sr.ID = int.Parse(doc.Get("ID"));
                    sr.Title = doc.Get("Title");
                    sr.Contents = doc.Get("Contents");

                    list.Add(sr);
                }
            }
            finally
            {
                // Release the index files whether or not the search succeeded.
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                directory.Close();
            }

            Repeater3.DataSource = list;
            Repeater3.DataBind();
        }

View Code

查看全文

相关阅读:
设计模式读书笔记-----适配器模式
 设计模式读书笔记-----命令模式
 一种另类的解决URL中文乱码问题--对中文进行加密、解密处理
 设计模式读书笔记-----单例模式
 Mysql的一些小知识点
 2-逻辑题二
 1-逻辑题一
 12-1054. 求平均值
 11-1048.数字加密
 10-string类的length()返回值一起的问题

原文地址：https://www.cnblogs.com/zhuyapeng/p/6691532.html

热门文章
hdoj 1879 继续畅通工程
 poj2217
bzoj4196
poj3581
学习笔记：后缀数组
 bzoj2243
bzoj1500
bzoj1901
树状数组区间修改
 bzoj1251