zoukankan      html  css  js  c++  java
  • 基于lucene.net 和ICTCLAS2014的站内搜索的实现1

    版权声明:本文为博主原创文章,未经博主同意不得转载。

    https://blog.csdn.net/zhuhuangtianzi/article/details/26967225

    Lucene.net是一个搜索引擎的框架,它自身并不能直接实现搜索,需要我们自己在其中实现索引的建立和索引的查找,这些都要依据它自身提供的API来实现。Lucene本身是基于Java的,Lucene.net是它移植到.NET平台上的版本,因此可以在ASP.NET中使用它来实现站内搜索。

             要实现基于汉语的搜索引擎,首先的要实现汉语的分词。眼下网上大部分都是利用已经有的盘古分词来实现的分词系统。可是盘古分词效果不太好。

    在这里我把最新的ICTCLAS2014嵌入到Lucene.net中。Lucene.net中所有的分词系统都是通过继承Analyzer类来实现的,所以如果要把ICTCLAS2014嵌入到Lucene.net中,就必须继承Analyzer类来实现自己的分词类。

    1 ICTCLAS的引入

             首先我们要把ICTCLAS的dll引入到C#文件里 ,由于这个dll不是在C#中建立的类库,所以无法直接将其加入到C#的引用中。

    我们考虑使用以下的方法来实现,为了方便,我们把引入的函数以及结构体放入一个类中。

    如下所示:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Runtime.InteropServices;
    using Lucene.Net.Analysis;
    
    namespace Lucene.Net.Analysis.DChinese
    {
        /// <summary>
        /// Element type of the array passed to <c>SplitWord.NLPIR_ParagraphProcessAW</c>.
        /// Every field is pinned to a fixed byte offset: fifteen 4-byte integers at
        /// offsets 0..56 followed by an 8-byte <c>double</c> at offset 60.
        /// </summary>
        /// <remarks>
        /// NOTE(review): presumably mirrors the native NLPIR <c>result_t</c>; the ten
        /// <c>sPos*</c> slots (offsets 8..47) look like a 40-byte native buffer split
        /// into ints - confirm against NLPIR.h. Also confirm that the native struct
        /// really places <c>weight</c> at offset 60 (not 8-byte aligned) and that it
        /// is a double.
        /// </remarks>
        [StructLayout(LayoutKind.Explicit)]
        public struct result_t
        {
            [FieldOffset(0)] public int start;
            [FieldOffset(4)] public int length;

            // Ten consecutive 4-byte slots, offsets 8 through 44.
            [FieldOffset(8)] public int sPos1;
            [FieldOffset(12)] public int sPos2;
            [FieldOffset(16)] public int sPos3;
            [FieldOffset(20)] public int sPos4;
            [FieldOffset(24)] public int sPos5;
            [FieldOffset(28)] public int sPos6;
            [FieldOffset(32)] public int sPos7;
            [FieldOffset(36)] public int sPos8;
            [FieldOffset(40)] public int sPos9;
            [FieldOffset(44)] public int sPos10;

            [FieldOffset(48)] public int POS_id;
            [FieldOffset(52)] public int word_ID;
            [FieldOffset(56)] public int word_type;
            [FieldOffset(60)] public double weight;
        }
        /// <summary>
        /// Static P/Invoke declarations for the functions exported by the native
        /// <c>NLPIR.dll</c> (ICTCLAS). All entry points use the cdecl calling
        /// convention and ANSI string marshalling.
        /// </summary>
        /// <remarks>
        /// Every binding that returns <c>bool</c> carries
        /// <c>[return: MarshalAs(UnmanagedType.I1)]</c>. Without it the runtime
        /// marshals the return value as a 4-byte Win32 BOOL, whereas a C++
        /// <c>bool</c> is one byte; the unused upper bytes of the return register
        /// could then make a native <c>false</c> read as <c>true</c>.
        /// NOTE(review): this assumes the native functions are declared with C++
        /// <c>bool</c> - confirm against the NLPIR.h shipped with the DLL.
        /// </remarks>
        public class SplitWord
        {
            // Name of the native library; resolved through the normal DLL search path.
            const string path = @"NLPIR.dll";

            // ---- Native function declarations ----

            /// <summary>Binding for native <c>NLPIR_Init</c>; returns the native success flag.</summary>
            /// <param name="sInitDirPath">Directory handed to the native initializer.</param>
            /// <param name="encoding">Encoding selector passed through unchanged (meaning of 0 is defined by NLPIR.h - confirm).</param>
            /// <param name="sLicenceCode">Optional licence code; may be null.</param>
            [DllImport(path, EntryPoint = "NLPIR_Init", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_Init(String sInitDirPath, int encoding = 0, String sLicenceCode = null);

            /// <summary>
            /// Binding for native <c>NLPIR_ParagraphProcess</c>. The C declaration is
            /// <c>NLPIR_API const char * NLPIR_ParagraphProcess(const char *sParagraph, int bPOStagged = 1);</c>
            /// so the result must be declared as <see cref="IntPtr"/> (not <c>string</c>)
            /// and converted by the caller, e.g. with <c>Marshal.PtrToStringAnsi</c>.
            /// </summary>
            [DllImport(path, EntryPoint = "NLPIR_ParagraphProcess", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern IntPtr NLPIR_ParagraphProcess(String sParagraph, int bPOStagged = 1);

            /// <summary>Binding for native <c>NLPIR_Exit</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_Exit", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_Exit();

            /// <summary>Binding for native <c>NLPIR_ImportUserDict</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_ImportUserDict", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern int NLPIR_ImportUserDict(String sFilename);

            /// <summary>Binding for native <c>NLPIR_FileProcess</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_FileProcess", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged = 1);

            /// <summary>Binding for native <c>NLPIR_FileProcessEx</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_FileProcessEx", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_FileProcessEx(String sSrcFilename, String sDestFilename);

            /// <summary>Binding for native <c>NLPIR_GetParagraphProcessAWordCount</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_GetParagraphProcessAWordCount", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern int NLPIR_GetParagraphProcessAWordCount(String sParagraph);

            /// <summary>
            /// Binding for native <c>NLPIR_ParagraphProcessAW</c>; the native side fills
            /// <paramref name="result"/>, which is marshalled as an out array.
            /// NOTE(review): presumably <paramref name="nCount"/> is the value returned by
            /// <see cref="NLPIR_GetParagraphProcessAWordCount"/> - confirm.
            /// </summary>
            [DllImport(path, EntryPoint = "NLPIR_ParagraphProcessAW", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern void NLPIR_ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

            /// <summary>Binding for native <c>NLPIR_AddUserWord</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_AddUserWord", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern int NLPIR_AddUserWord(String sWord);

            /// <summary>Binding for native <c>NLPIR_SaveTheUsrDic</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_SaveTheUsrDic", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern int NLPIR_SaveTheUsrDic();

            /// <summary>Binding for native <c>NLPIR_DelUsrWord</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_DelUsrWord", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern int NLPIR_DelUsrWord(String sWord);

            /// <summary>Binding for native <c>NLPIR_NWI_Start</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_NWI_Start", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_NWI_Start();

            /// <summary>Binding for native <c>NLPIR_NWI_Complete</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_NWI_Complete", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_NWI_Complete();

            /// <summary>Binding for native <c>NLPIR_NWI_AddFile</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_NWI_AddFile", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_NWI_AddFile(String sText);

            /// <summary>Binding for native <c>NLPIR_NWI_AddMem</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_NWI_AddMem", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            [return: MarshalAs(UnmanagedType.I1)]
            public static extern bool NLPIR_NWI_AddMem(String sText);

            /// <summary>Binding for native <c>NLPIR_NWI_GetResult</c>; returns a native pointer the caller must convert.</summary>
            [DllImport(path, EntryPoint = "NLPIR_NWI_GetResult", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern IntPtr NLPIR_NWI_GetResult(bool bWeightOut = false);

            /// <summary>Binding for native <c>NLPIR_NWI_Result2UserDict</c>.</summary>
            [DllImport(path, EntryPoint = "NLPIR_NWI_Result2UserDict", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern uint NLPIR_NWI_Result2UserDict();

            /// <summary>Binding for native <c>NLPIR_GetKeyWords</c>; returns a native pointer the caller must convert.</summary>
            [DllImport(path, EntryPoint = "NLPIR_GetKeyWords", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern IntPtr NLPIR_GetKeyWords(String sText, int nMaxKeyLimit = 50, bool bWeightOut = false);

            /// <summary>Binding for native <c>NLPIR_GetFileKeyWords</c>; returns a native pointer the caller must convert.</summary>
            [DllImport(path, EntryPoint = "NLPIR_GetFileKeyWords", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)]
            public static extern IntPtr NLPIR_GetFileKeyWords(String sFilename, int nMaxKeyLimit = 50, bool bWeightOut = false);
        }
    }
    

    这个类里面包含了ICTCLAS的全部API函数,包括初始化、添加词语、导入词典、保存词典、分词等各种API,而且都是静态(static)函数。

    2 分词类DChineseAnalyzer的建立

    分词类的建立我们参考了StandardAnalyzer的实现,在此基础上实现了DChineseAnalyzer类。在分词类中需要实现必要的构造函数,以及

    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)

    public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)

    这两个函数。它们的作用是在函数中调用分词器Tokenizer的派生类来实现分词。在现有的版本中,使用分词类的时候通常是直接调用ReusableTokenStream函数,而不是调用TokenStream函数,这样同一个分词类对象建立之后可以供多个待分词文本重复使用,从而减少内存的浪费,提高效率。

    此外还有一些字段,利用这些字段我们可以添加停用词以及用户自己的词典。


    3 分词器DChineseTokenizer的建立

    这个类是分词的核心关键所在,我们要在其中调用ICTCLAS的分词功能。这里要注意的一个函数是 public override bool IncrementToken()

    它是我们获取下一个分词结果要用到的函数,假设想要遍历分词结果,就要建立一个循环,不断的调用IncrementToken函数。

    整个分词系统的代码如下所示:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Runtime.InteropServices;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Standard;
    using Lucene.Net.Util;
    using Lucene.Net.Documents;
    using Lucene.Net.Analysis.Tokenattributes;
    using Version = Lucene.Net.Util.Version;
    
    namespace Lucene.Net.Analysis.DChinese
    {
        /// <summary>
        /// Analyzer that tokenizes with <see cref="DChineseTokenizer"/> and then
        /// applies <c>LowerCaseFilter</c>, <c>StopFilter</c> and
        /// <c>PorterStemFilter</c>, in that order.
        /// </summary>
        public class DChineseAnalyzer : Analyzer
        {
            /// <summary>Stop words handed to the <c>StopFilter</c> of every stream.</summary>
            private readonly ISet<string> stopSet;

            /// <summary>Default stop-word set; initialised to <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.</summary>
            public static readonly ISet<string> STOP_WORDS_SET;

            private readonly Version matchVersion;
            private readonly bool replaceInvalidAcronym;
            private readonly bool enableStopPositionIncrements;

            /// <summary>Creates an analyzer using the given stop words.</summary>
            /// <param name="version">Lucene compatibility version.</param>
            /// <param name="stopWords">Stop-word set passed to the stop filter.</param>
            public DChineseAnalyzer(Version version, ISet<string> stopWords)
            {
                stopSet = stopWords;
                // Store the version first and derive every version-dependent flag from
                // the constructor argument. Previously replaceInvalidAcronym was
                // computed from the still-unassigned matchVersion field, so it reflected
                // the field's default value instead of the requested version.
                matchVersion = version;
                enableStopPositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(version);
                replaceInvalidAcronym = version.OnOrAfter(Version.LUCENE_24);
            }

            /// <summary>Creates an analyzer using <see cref="STOP_WORDS_SET"/>.</summary>
            public DChineseAnalyzer(Version version)
                : this(version, STOP_WORDS_SET)
            {
            }

            /// <summary>Creates an analyzer whose stop words are loaded from a file via <c>WordlistLoader.GetWordSet</c>.</summary>
            public DChineseAnalyzer(Version version, System.IO.FileInfo stopWords)
                : this(version, WordlistLoader.GetWordSet(stopWords))
            {
            }

            static DChineseAnalyzer()
            {
                STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
            }

            /// <summary>Builds a brand-new tokenizer + filter chain for <paramref name="reader"/>.</summary>
            public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
            {
                TokenStream result = new DChineseTokenizer(matchVersion, reader);
                result = new LowerCaseFilter(result);
                result = new StopFilter(enableStopPositionIncrements, result, stopSet);
                result = new PorterStemFilter(result);
                return result;
            }

            /// <summary>Tokenizer and the outermost filter of a chain cached in <c>PreviousTokenStream</c>.</summary>
            private class SavedStreams
            {
                protected internal DChineseTokenizer source;
                protected internal TokenStream result;
            }

            /// <summary>
            /// Returns the chain cached in <c>PreviousTokenStream</c>, re-pointing its
            /// tokenizer at <paramref name="reader"/>; builds and caches the chain on
            /// first use.
            /// </summary>
            public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
            {
                SavedStreams streams = (SavedStreams)PreviousTokenStream;
                if (streams == null)
                {
                    streams = new SavedStreams();
                    streams.source = new DChineseTokenizer(matchVersion, reader);
                    streams.result = new LowerCaseFilter(streams.source);
                    streams.result = new StopFilter(enableStopPositionIncrements, streams.result, stopSet);
                    streams.result = new PorterStemFilter(streams.result);
                    PreviousTokenStream = streams;
                }
                else
                {
                    streams.source.Reset(reader);
                }

                streams.source.SetReplaceInvalidAcronym(replaceInvalidAcronym);
                return streams.result;
            }
        }
    
        /// <summary>
        /// Tokenizer that sends the whole input text through the native
        /// <c>NLPIR_ParagraphProcess</c> segmenter and then emits one token per
        /// whitespace-delimited run of the segmenter's output.
        /// </summary>
        /// <remarks>
        /// Token offsets are computed over the segmented text that
        /// <see cref="IncrementToken"/> reads, not over the reader originally
        /// supplied by the caller.
        /// NOTE(review): if the segmenter inserts separators, offsets will not line
        /// up with the original text (relevant for highlighting) - confirm.
        /// </remarks>
        public sealed class DChineseTokenizer : Tokenizer
        {
            private bool m_replaceInvalidAcronym;
            private int offset = 0;       // characters consumed from earlier buffer fills
            private int bufferIndex = 0;  // next unread position in ioBuffer
            private int dataLen = 0;      // number of valid characters in ioBuffer
            private const int MAX_WORD_LEN = 255;
            private const int IO_BUFFER_SIZE = 4096;
            private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];

            private ITermAttribute termAtt;
            private IOffsetAttribute offsetAtt;
            private IPositionIncrementAttribute posIncrAtt;

            /// <summary>Shared constructor logic: segments the input and registers the attributes.</summary>
            private void Init(System.IO.TextReader input, Version matchVersion)
            {
                m_replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
                this.input = ChangeInput(input);
                termAtt = AddAttribute<ITermAttribute>();
                offsetAtt = AddAttribute<IOffsetAttribute>();
                posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            }

            /// <summary>Creates a tokenizer over <paramref name="input"/>.</summary>
            /// <exception cref="InvalidOperationException">The native segmenter could not be initialised.</exception>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public DChineseTokenizer(Version matchVersion, System.IO.TextReader input)
                : base()
            {
                Init(input, matchVersion);
            }

            /// <summary>Creates a tokenizer over <paramref name="input"/> sharing the attributes of <paramref name="source"/>.</summary>
            public DChineseTokenizer(Version matchVersion, System.IO.TextReader input, AttributeSource source)
                : base(source)
            {
                Init(input, matchVersion);
            }

            /// <summary>Creates a tokenizer over <paramref name="input"/> using the given attribute factory.</summary>
            public DChineseTokenizer(Version matchVersion, System.IO.TextReader input, AttributeFactory factory)
                : base(factory)
            {
                Init(input, matchVersion);
            }

            /// <summary>
            /// Advances to the next token: a run of non-whitespace characters, cut at
            /// <c>MAX_WORD_LEN</c> characters.
            /// </summary>
            /// <returns><c>true</c> if a token was produced; <c>false</c> at end of input.</returns>
            public override bool IncrementToken()
            {
                ClearAttributes();
                int length = 0;
                int start = bufferIndex;
                char[] buffer = termAtt.TermBuffer();
                while (true)
                {
                    if (bufferIndex >= dataLen)
                    {
                        // Current buffer exhausted: account for it and refill.
                        offset += dataLen;
                        dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                        if (dataLen <= 0)
                        {
                            dataLen = 0;
                            if (length > 0)
                            {
                                break; // emit the token collected before end of input
                            }
                            return false;
                        }
                        bufferIndex = 0;
                    }

                    char c = ioBuffer[bufferIndex++];

                    if (!System.Char.IsWhiteSpace(c))
                    {
                        if (length == 0)
                        {
                            // First character of a new token.
                            start = offset + bufferIndex - 1;
                        }
                        else if (length == buffer.Length)
                        {
                            buffer = termAtt.ResizeTermBuffer(1 + length);
                        }

                        buffer[length++] = c;
                        if (length == MAX_WORD_LEN)
                        {
                            break;
                        }
                    }
                    else if (length > 0)
                    {
                        break; // whitespace terminates the current token
                    }
                }

                termAtt.SetTermLength(length);
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
                posIncrAtt.PositionIncrement = 1;
                return true;
            }

            /// <summary>Resets the buffer bookkeeping while keeping the current input reader.</summary>
            public override void Reset()
            {
                base.Reset(input);
                ResetBufferState();
            }

            /// <summary>Segments the text of <paramref name="input"/> and restarts tokenization on the result.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
            public override void Reset(TextReader input)
            {
                this.input = Segment(input);
                ResetBufferState();
            }

            /// <summary>Sets the final offset to the total number of characters consumed.</summary>
            public override void End()
            {
                int finalOffset = CorrectOffset(offset);
                offsetAtt.SetOffset(finalOffset, finalOffset);
            }

            /// <summary>Stores the flag; it is not read anywhere else in this class.</summary>
            public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
            {
                this.m_replaceInvalidAcronym = replaceInvalidAcronym;
            }

            /// <summary>
            /// Initialises the native segmenter from the application base directory and
            /// returns a reader over the segmented text of <paramref name="input"/>.
            /// </summary>
            /// <exception cref="InvalidOperationException">
            /// <c>NLPIR_Init</c> reported failure. Previously this case returned null,
            /// which only surfaced later as a NullReferenceException in
            /// <see cref="IncrementToken"/>.
            /// </exception>
            private TextReader ChangeInput(TextReader input)
            {
                string dataDirectory = System.AppDomain.CurrentDomain.BaseDirectory;

                if (!SplitWord.NLPIR_Init(dataDirectory, 0, null))
                {
                    throw new InvalidOperationException(
                        "NLPIR_Init failed for directory '" + dataDirectory + "'.");
                }

                return Segment(input);
            }

            /// <summary>
            /// Reads <paramref name="input"/> to the end, passes the text to
            /// <c>NLPIR_ParagraphProcess</c> with <c>bPOStagged = 0</c>, and wraps the
            /// result in a <see cref="StringReader"/>.
            /// </summary>
            private static TextReader Segment(TextReader input)
            {
                if (input == null)
                {
                    throw new ArgumentNullException("input");
                }

                string text = input.ReadToEnd();
                IntPtr nativeResult = SplitWord.NLPIR_ParagraphProcess(text, 0);

                // Marshal.PtrToStringAnsi yields null for a null pointer, and
                // StringReader rejects null, so fall back to an empty text.
                string segmented = Marshal.PtrToStringAnsi(nativeResult);
                if (segmented == null)
                {
                    segmented = string.Empty;
                }

                return new StringReader(segmented);
            }

            /// <summary>Clears the read-buffer counters so the next read starts from scratch.</summary>
            private void ResetBufferState()
            {
                bufferIndex = 0;
                offset = 0;
                dataLen = 0;
            }
        }
    }
    




  • 相关阅读:
    B树、B-树、B+树、B*树介绍,和B+树更适合做文件索引的原因
    异步请求数据加载到表格后根据不同状态改变表格背景颜色【表格背景色】
    Linux/windows查看设置环境变量指令
    【周期性执行事件】MySQL事件(Event)&任务调度
    DEDE列表页调用TAG标签
    poj2488 A Knight's Journey
    [置顶] Codeforces Round #190 (Div. 2)(完全)
    SharePoint 2010 用Event Receiver将文件夹自动变成approved状态 (2)
    .NET领域驱动设计—初尝(三:穿过迷雾走向光明)
    Android解决异常apk on device '0292bea1': Unable to open sync connection!
  • 原文地址:https://www.cnblogs.com/ldxsuanfa/p/10046047.html
Copyright © 2011-2022 走看看