zoukankan      html  css  js  c++  java
  • Lucene.Net和盘古分词应用

    Lucene.Net.dll:用作全文索引

    PanGu.dll(盘古分词):作为中文分词的条件

    大致原理:

    1.Lucene先根据PanGu将需要搜索的内容分隔、分词,然后根据分词的结果,做一个索引页。

    2.搜索的时候,直接从索引页里面进行查找。

    直接上代码:

    分词演示代码:

     /// <summary>
     /// Tokenizer demo: segments the text typed into <c>txtString</c> with the PanGu
     /// analyzer and lists every resulting token in <c>ListBox1</c>.
     /// </summary>
     protected void Button1_Click(object sender, EventArgs e)
            {
                // Start from an empty list so tokens from a previous click do not accumulate.
                ListBox1.Items.Clear();

                // PanGu is used for segmentation. Per the original author's note, the
                // StandardAnalyzer alternative only handles English, not Chinese.
                Analyzer panGu = new PanGuAnalyzer();
                TokenStream stream = panGu.TokenStream("", new StringReader(txtString.Text));

                // Next() hands back one token per call; the loop stops on the first null.
                for (Lucene.Net.Analysis.Token current = stream.Next(); current != null; current = stream.Next())
                {
                    // TermText() is the text of the segmented word.
                    ListBox1.Items.Add(current.TermText());
                }
            }
    View Code

    新建索引代码:演示了两种读取数据的方式

    一:文本文件的查找

    /// <summary>
    /// Builds the full-text index from the numbered text files 1000.txt .. 1099.txt.
    /// Each file becomes one document with a stored, non-analyzed "number" field and a
    /// stored, analyzed "body" field. When an index already exists the documents are
    /// appended to it; otherwise a new index is created.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
            {
                // Per the original author: the casing must match the folder on disk, otherwise an error occurs.
                // NOTE(review): this literal looks like it lost a backslash in publishing ("C:\index");
                // it is left untouched so it keeps matching the path used by the search handler - confirm.
                string indexPath = @"C:index";
                FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
                try
                {
                    bool isUpdate = IndexReader.IndexExists(directory);
                    if (isUpdate)
                    {
                        // Working rule from the original author: only one piece of code touches the
                        // index at a time. A lock that is still present (for example after a crash
                        // during indexing) is therefore treated as stale and removed first.
                        if (IndexWriter.IsLocked(directory))
                        {
                            IndexWriter.Unlock(directory);
                        }
                    }

                    // IndexWriter writes documents into the index; "create" is true only when no index exists yet.
                    IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
                    try
                    {
                        for (int i = 1000; i < 1100; i++)
                        {
                            // The published listing contained raw line breaks where the "\n" sequences of
                            // this path used to be, which is not a valid path; they are restored here.
                            // NOTE(review): further "\" separators may have been lost as well - confirm
                            // the real folder layout before running.
                            string txt = System.IO.File.ReadAllText(@"D:\net\net代码搜索及分词文章" + i + ".txt");

                            // A Document plays the role of one table row.
                            Document document = new Document();
                            document.Add(new Field("number", i.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            document.Add(new Field("body", txt, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                            writer.AddDocument(document);
                        }
                    }
                    finally
                    {
                        // Close even when a file cannot be read, so the write lock is released.
                        // The original author notes that without Close the indexed data cannot be searched.
                        writer.Close();
                    }
                }
                finally
                {
                    directory.Close();
                }

                this.ClientScript.RegisterStartupScript(typeof(indexPage),
                    "alert", "alert('创建索引完成')", true);
            }
    View Code

    二:数据库里面查找数据

     /// <summary>
     /// Builds the full-text index from the rows returned by <c>GetData()</c>.
     /// Each row becomes one document with a stored, non-analyzed "ID" field and stored,
     /// analyzed "Title" and "Contents" fields. When an index already exists the documents
     /// are appended to it; otherwise a new index is created.
     /// </summary>
     protected void Button3_Click(object sender, EventArgs e)
            {
                // Per the original author: the casing must match the folder on disk, otherwise an error occurs.
                // The published listing contained raw line breaks where the "\n" sequences of this path
                // used to be, which is not a valid path; they are restored here.
                // NOTE(review): further "\" separators may have been lost as well (e.g. before "index1") -
                // confirm the real folder layout; the search handler must use the same path.
                string indexPath = @"D:\net\net代码搜索及分词index1";
                FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
                try
                {
                    bool isUpdate = IndexReader.IndexExists(directory);
                    if (isUpdate)
                    {
                        // Working rule from the original author: only one piece of code touches the
                        // index at a time. A lock that is still present (for example after a crash
                        // during indexing) is therefore treated as stale and removed first.
                        if (IndexWriter.IsLocked(directory))
                        {
                            IndexWriter.Unlock(directory);
                        }
                    }

                    // IndexWriter writes documents into the index; "create" is true only when no index exists yet.
                    IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
                    try
                    {
                        List<Writings> list = GetData();
                        foreach (Writings item in list)
                        {
                            // A Document plays the role of one table row.
                            Document document = new Document();
                            document.Add(new Field("ID", item.ID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            // Title/Contents are strings that may come back null from the database;
                            // index an empty string in that case instead of passing null to Field.
                            document.Add(new Field("Title", item.Title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                            document.Add(new Field("Contents", item.Contents ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
                            writer.AddDocument(document);
                        }
                    }
                    finally
                    {
                        // Close even when loading or indexing fails, so the write lock is released.
                        // The original author notes that without Close the indexed data cannot be searched.
                        writer.Close();
                    }
                }
                finally
                {
                    directory.Close();
                }

                this.ClientScript.RegisterStartupScript(typeof(indexPage),
                    "alert", "alert('创建索引完成')", true);
            }
    
            /// <summary>
            /// Reads all rows of <c>dbo.Writings</c> into a DataTable and converts them to
            /// <c>Writings</c> objects by serializing the table to JSON and deserializing it again.
            /// </summary>
            /// <returns>The rows of the table as a list of <c>Writings</c>.</returns>
            private List<Writings> GetData()
            {
                // NOTE(review): the connection string, including the sa password, is hard-coded.
                // It is left unchanged here; move it to configuration before any real use.
                string conn = "server=.;user id=sa; pwd=123; database=SharesTradeNew";
                string sql = "SELECT * FROM dbo.Writings";

                DataTable dt = new DataTable();
                // Dispose the adapter (and the command/connection objects it created) once the table is filled.
                using (SqlDataAdapter da = new SqlDataAdapter(sql, conn))
                {
                    da.Fill(dt);
                }

                // JSON round trip: table columns are matched to Writings properties by name.
                string json = Newtonsoft.Json.JsonConvert.SerializeObject(dt);
                return Newtonsoft.Json.JsonConvert.DeserializeObject<List<Writings>>(json);
            }
        }
    
        /// <summary>
        /// Plain data object with the three values that are indexed and shown in search
        /// results: ID, Title and Contents.
        /// </summary>
        public class Writings
        {
            /// <summary>Numeric identifier of the record.</summary>
            public int ID { get; set; }
            /// <summary>Title text of the record.</summary>
            public string Title { get; set; }
            /// <summary>Body text of the record.</summary>
            public string Contents { get; set; }
        }
    View Code

    通过索引查找数据:

    对应一:

    /// <summary>
    /// Searches the "body" field of the text-file index for the space-separated words
    /// typed into <c>TextBox1</c> and binds the matching documents to <c>Repeater1</c>.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
            {
                // Example input: "计算机   专业". The user does the word splitting with spaces.
                string kw = TextBox1.Text;

                // Drop empty entries: with several spaces between words a plain Split(' ') yields
                // empty strings, which would become empty terms and prevent the phrase from matching.
                string[] words = kw.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                List<SearchResult> list = new List<SearchResult>();
                if (words.Length == 0)
                {
                    // Nothing to search for: show an empty result without opening the index.
                    Repeater1.DataSource = list;
                    Repeater1.DataBind();
                    return;
                }

                // NOTE(review): this literal looks like it lost a backslash in publishing ("c:\index");
                // it is left untouched so it keeps matching the path used by the indexing handler - confirm.
                FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"c:index"), new NoLockFactory());
                IndexReader reader = null;
                IndexSearcher searcher = null;
                try
                {
                    reader = IndexReader.Open(directory, true);
                    searcher = new IndexSearcher(reader);

                    // Query condition. The original author's analogy:
                    // where Contains("body","计算机") and Contains("body","专业")
                    PhraseQuery query = new PhraseQuery();
                    foreach (string word in words)
                    {
                        query.Add(new Term("body", word));
                    }
                    // Slop: how far apart the words may be and still count as a phrase match.
                    query.SetSlop(100);

                    // Container for the search results (created for up to 1000 hits).
                    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                    searcher.Search(query, null, collector);

                    // GetTotalHits() is the total number of matches.
                    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                    for (int i = 0; i < docs.Length; i++)
                    {
                        // Document number assigned internally by Lucene.Net; unrelated to the "number" field.
                        int docId = docs[i].doc;
                        Document doc = searcher.Doc(docId);

                        // Field values can only be read back because they were indexed with Field.Store.YES.
                        SearchResult sr = new SearchResult();
                        sr.Body = doc.Get("body");
                        sr.Number = doc.Get("number");
                        list.Add(sr);
                    }
                }
                finally
                {
                    // Release the index files even when the search throws.
                    if (searcher != null)
                    {
                        searcher.Close();
                    }
                    if (reader != null)
                    {
                        reader.Close();
                    }
                    directory.Close();
                }

                Repeater1.DataSource = list;
                Repeater1.DataBind();
            }
    View Code

    对应二:

    /// <summary>
    /// Searches the "Contents" field of the database index for the space-separated words
    /// typed into <c>TextBox3</c> and binds the matching records to <c>Repeater3</c>.
    /// </summary>
    protected void Button3_Click(object sender, EventArgs e)
            {
                // Example input: "计算机   专业". The user does the word splitting with spaces.
                string kw = TextBox3.Text;

                // Drop empty entries: with several spaces between words a plain Split(' ') yields
                // empty strings, which would become empty terms and prevent the phrase from matching.
                string[] words = kw.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                List<Writings> list = new List<Writings>();
                if (words.Length == 0)
                {
                    // Nothing to search for: show an empty result without opening the index.
                    Repeater3.DataSource = list;
                    Repeater3.DataBind();
                    return;
                }

                // The published listing contained raw line breaks where the "\n" sequences of this path
                // used to be, which is not a valid path; they are restored here.
                // NOTE(review): further "\" separators may have been lost as well (e.g. before "index1") -
                // confirm the real folder layout; it must be the same path the indexing handler writes to.
                FSDirectory directory = FSDirectory.Open(new DirectoryInfo(@"D:\net\net代码搜索及分词index1"), new NoLockFactory());
                IndexReader reader = null;
                IndexSearcher searcher = null;
                try
                {
                    reader = IndexReader.Open(directory, true);
                    searcher = new IndexSearcher(reader);

                    // Query condition. The original author's analogy:
                    // where Contains("Contents","计算机") and Contains("Contents","专业")
                    // Only "Contents" is searched; the "Title" field is not part of the query.
                    PhraseQuery query = new PhraseQuery();
                    foreach (string word in words)
                    {
                        query.Add(new Term("Contents", word));
                    }
                    // Slop: how far apart the words may be and still count as a phrase match.
                    query.SetSlop(100);

                    // Container for the search results (created for up to 1000 hits).
                    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                    searcher.Search(query, null, collector);

                    // GetTotalHits() is the total number of matches.
                    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;
                    for (int i = 0; i < docs.Length; i++)
                    {
                        // Document number assigned internally by Lucene.Net; unrelated to the "ID" field.
                        int docId = docs[i].doc;
                        Document doc = searcher.Doc(docId);

                        // Field values can only be read back because they were indexed with Field.Store.YES.
                        Writings sr = new Writings();
                        sr.ID = int.Parse(doc.Get("ID"));
                        sr.Title = doc.Get("Title");
                        sr.Contents = doc.Get("Contents");
                        list.Add(sr);
                    }
                }
                finally
                {
                    // Release the index files even when the search throws.
                    if (searcher != null)
                    {
                        searcher.Close();
                    }
                    if (reader != null)
                    {
                        reader.Close();
                    }
                    directory.Close();
                }

                Repeater3.DataSource = list;
                Repeater3.DataBind();
            }
    View Code
  • 相关阅读:
    React antd如何实现<Upload>组件上传附件再次上传已清除附件缓存问题。
    spring项目logback日志与logstash和Elasticsearch整合
    Java后端面试经验总结分享(一)
    【设计模式】访问者模式
    【设计模式】命令模式
    【设计模式】模板方法模式
    【设计模式】代理模式
    【设计模式】享元模式
    【设计模式】外观模式
    【设计模式】组合模式
  • 原文地址:https://www.cnblogs.com/zhuyapeng/p/6691532.html
Copyright © 2011-2022 走看看