本文转载自:http://blog.csdn.net/hehui21/article/details/2873933
这几天完成了一个 Lucene.Net 索引的相关程序,特此写上来供大家评论!
建立全文索引,主要有两步,一,建立索引,二,查找,那么我先来讲一下怎么新建索引!
引用类:
- Imports System.Data
- Imports System.IO
- Imports System.Net
- Imports System.Text
- Imports System.Data.SqlClient
- Imports System.Data.Sql
- Imports System.Text.RegularExpressions
- Imports System.Web
- Imports System.Threading
- Imports Microsoft.Office
- Imports Lucene.Net.Documents
- Imports Lucene.Net.Index
- Imports Lucene.Net.Search
- Imports Lucene.Net.QueryParsers
- Imports Lucene.Net.Analysis.Cn
- Imports Lucene.Net
- Imports System.Data.OleDb
我这里把它做成了一个类,它是一个 Form 程序,里面有一个按钮、一个 RichTextBox 和一个 Timer。然后在这个类里定义一些私有变量:
-
- ' Delegate type for the parameterless indexing routine.
- Private Delegate Sub indexfile_delegate()
- ' Directory in which the index files are stored.
- Private _dir As String = "D:/idx"
- ' Name printed for the index in the status display.
- Private _display_index_Name As String = "XYPart_index"
- 'Private display_index_directory As String = _dir
- ' Chinese analyzer handed to the index writer.
- Private _Analysis As Lucene.Net.Analysis.Cn.ChineseAnalyzer = New Lucene.Net.Analysis.Cn.ChineseAnalyzer()
- ' Writer used to build the index.
- Private _writer As IndexWriter
- ' Time at which the program started running; mainly used to record how long indexing takes.
- Private startTime As DateTime
- ' Shared database connection. The server, user and password parts of the string are placeholders to fill in.
- Private _conn As System.Data.SqlClient.SqlConnection = New System.Data.SqlClient.SqlConnection("server=你的数据库地址;database=databasename;User Id = 用户名称;Password =密码;")
- ' Reads the rows to index from the database.
- ' Opens the shared connection when it is not open yet, runs the query and returns the
- ' reader positioned before the first row. The caller must close the reader; the
- ' connection itself stays open until it is closed elsewhere.
- Public Function F_getDataFromTable() As SqlDataReader
-     ' SqlConnection.Open throws InvalidOperationException on a connection that is already open.
-     If _conn.State = System.Data.ConnectionState.Closed Then
-         _conn.Open()
-     End If
-     ' Placeholder text: replace with the real SQL statement.
-     Dim query As String = ”sql语句“
-     Dim mycom As New SqlCommand(query, _conn)
-     Try
-         Return mycom.ExecuteReader()
-     Catch
-         ' No reader was produced, so nobody else would release the connection.
-         _conn.Close()
-         Throw
-     Finally
-         ' Disposing the command does not close a reader it has already returned.
-         mycom.Dispose()
-     End Try
- End Function
解读出附件里的文件内容:
- ' Extracts the text of an attachment by dispatching on its lower-cased file extension.
- ' file: path of the attachment.
- ' Returns the text produced by the matching parser, or "" when the extension is not handled.
- Public Function FileParse(ByVal file As String) As String
-     Select Case Path.GetExtension(file).ToLower
-         Case ".txt"
-             Return TxtParse(file)
-         Case ".htm", ".html"
-             Return htmParse(file)
-         Case ".pdf"
-             Return PDFParse(file)
-         Case ".doc", ".docx"
-             Return WordParse(file)
-         Case ".ppt", ".pptx"
-             Return PPTParse(file)
-         Case ".xls", ".xlsx"
-             Return ExcelParse(file)
-         Case Else
-             Return ""
-     End Select
- End Function
- ' Reads an HTML file and returns its text with markup removed, for indexing.
- ' path: path of the .htm/.html file; it is decoded with the system default ANSI code page.
- ' Returns the text with script blocks, tags and comments removed, a fixed set of
- ' entities decoded, remaining numeric entities dropped and stray angle brackets removed.
- Public Function htmParse(ByVal path As String) As String
-     Dim file_html As String
-     ' Using releases the file handle even when reading fails.
-     Using sr As New StreamReader(path, System.Text.Encoding.Default)
-         file_html = sr.ReadToEnd()
-     End Using
-     ' Singleline lets ".*?" span line breaks, so multi-line scripts and comments are matched.
-     Dim spanLines As RegexOptions = RegexOptions.IgnoreCase Or RegexOptions.Singleline
-     ' Scripts have to go before the generic tag stripping; otherwise only the
-     ' <script> tags disappear and the script source stays in the text.
-     Dim Htmlstring As String = Regex.Replace(file_html, "<script[^>]*?>.*?</script>", "", spanLines)
-     ' Complete comments, including ones that contain ">".
-     Htmlstring = Regex.Replace(Htmlstring, "<!--.*?-->", "", spanLines)
-     ' Every remaining tag.
-     Htmlstring = Regex.Replace(Htmlstring, "<[^>]*>", "")
-     ' A line-break character together with the whitespace that follows it.
-     Htmlstring = Regex.Replace(Htmlstring, "([\r\n])[\s]+", "", RegexOptions.IgnoreCase)
-     ' Leftovers of unbalanced comment delimiters.
-     Htmlstring = Regex.Replace(Htmlstring, "-->", "", RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "<!--.*", "", RegexOptions.IgnoreCase)
-     ' Entities that are decoded to their character.
-     Htmlstring = Regex.Replace(Htmlstring, "&(quot|#34);", """", RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(amp|#38);", "&", RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(lt|#60);", "<", RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(gt|#62);", ">", RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(nbsp|#160);", " ", RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(iexcl|#161);", ChrW(&HA1).ToString(), RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(cent|#162);", ChrW(&HA2).ToString(), RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(pound|#163);", ChrW(&HA3).ToString(), RegexOptions.IgnoreCase)
-     Htmlstring = Regex.Replace(Htmlstring, "&(copy|#169);", ChrW(&HA9).ToString(), RegexOptions.IgnoreCase)
-     ' Any other numeric entity is dropped.
-     Htmlstring = Regex.Replace(Htmlstring, "&#(\d+);", "", RegexOptions.IgnoreCase)
-     ' Strings are immutable: the result of Replace has to be assigned to take effect.
-     Htmlstring = Htmlstring.Replace("<", "")
-     Htmlstring = Htmlstring.Replace(">", "")
-     Htmlstring = Htmlstring.Replace(vbCrLf, "")
-     'Htmlstring = System.Web.HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim()
-     ' NOTE(review): the final Replace(" ", " ") was a no-op as printed; presumably it was
-     ' meant to turn non-breaking spaces into ordinary spaces - confirm.
-     Return Htmlstring.Replace(ChrW(&HA0), " "c)
- End Function
- ' Returns the complete content of a text file, decoded with the system default ANSI code page.
- ' Path_str: path of the file to read.
- Public Function TxtParse(ByVal Path_str As String) As String
-     ' Using closes the reader as soon as the text is read. An open handle would keep the
-     ' file locked, and PDFParse deletes its temporary text file right after reading it.
-     Using sr As New StreamReader(Path_str, System.Text.Encoding.Default)
-         Return sr.ReadToEnd()
-     End Using
- End Function
- ' Extracts the text of a PDF by running the external minetext.exe converter, which writes
- ' a .txt file next to the PDF; that file is read and then deleted.
- ' path_str: path of the PDF file.
- ' Returns the extracted text, or "" when conversion or reading fails (a message box is shown).
- ' NOTE(review): _datadir is declared outside the code shown here; it is assumed to end
- ' with a path separator - confirm.
- Public Function PDFParse(ByVal path_str As String) As String
-     Dim Path_str_totxt As String = System.IO.Path.ChangeExtension(path_str, ".txt")
-     Dim str As String = ""
-     Try
-         ' Paths are quoted so that spaces do not split them into several arguments.
-         Dim command As String = """" + _datadir + "TextMiningTool/minetext.exe" + """ """ + path_str + """ """ + Path_str_totxt + """"
-         ' Wait:=True blocks until the converter has exited; without it the text file
-         ' would be read before it has been written.
-         Shell(command, AppWinStyle.Hide, True)
-         str = TxtParse(Path_str_totxt)
-     Catch ex As Exception
-         MsgBox("this pdf " + path_str + " is not parsed ,the programme is continuing")
-     Finally
-         ' "del" is a cmd.exe built-in and cannot be started through Shell; delete directly.
-         Try
-             File.Delete(Path_str_totxt)
-         Catch ex As IOException
-             ' Best-effort cleanup: a leftover temporary file must not abort indexing.
-         Catch ex As UnauthorizedAccessException
-             ' Same as above.
-         End Try
-     End Try
-     Return str
- End Function
- ' Extracts the text of a Word document through Word automation.
- ' path_str: path of the .doc/.docx file.
- ' Returns the text of the document's Content range.
- ' Exceptions raised by Word propagate to the caller after the document and the
- ' Word instance have been closed.
- Public Function WordParse(ByVal path_str As String) As String
-     Dim missing = System.Reflection.Missing.Value
-     Dim isVisible As Boolean = True
-     Dim str As String
-     Dim openword As New Microsoft.Office.Interop.Word.Application
-     Dim word_doc As Microsoft.Office.Interop.Word.Document = Nothing
-     Try
-         ' The 12th argument of Documents.Open is Visible; all others keep their defaults.
-         word_doc = openword.Documents.Open(path_str, missing, missing, missing, missing, missing, missing, missing, missing, missing, missing, isVisible, missing, missing, missing, missing)
-         word_doc.Activate()
-         openword.Visible = False
-         str = word_doc.Content.Text
-     Finally
-         Try
-             ' The document has to be closed while Word is still running; closing it after
-             ' Quit fails. wdDoNotSaveChanges avoids a save prompt in the hidden instance.
-             If word_doc IsNot Nothing Then
-                 word_doc.Close(Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges)
-             End If
-         Finally
-             openword.Quit(missing, missing, missing)
-         End Try
-     End Try
-     Return str
- End Function
- ' Extracts the cell text of an Excel workbook through OLE DB.
- ' path_str: path of the workbook; ".xlsx" uses the ACE 12.0 provider, every other
- ' extension uses the Jet 4.0 provider.
- ' Returns every cell value of every table listed in the schema, each preceded by one space.
- Public Function ExcelParse(ByVal path_str As String) As String
-     Dim conn_excel As String
-     Dim FileExname As String = System.IO.Path.GetExtension(path_str).ToUpper
-     If FileExname <> ".XLSX" Then
-         conn_excel = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" + path_str + ";Extended Properties='Excel 8.0;HDR=NO;IMEX=1';"
-     Else
-         conn_excel = "Provider=Microsoft.ACE.OLEDB.12.0;Data Source=" + path_str + ";Extended Properties='Excel 12.0;HDR=NO;IMEX=1';"
-     End If
-     ' StringBuilder avoids re-copying the whole result for every appended cell.
-     Dim buffer As New System.Text.StringBuilder()
-     ' Using closes the connection even when a sheet cannot be read.
-     Using dbfconn As New OleDb.OleDbConnection(conn_excel)
-         dbfconn.Open()
-         Dim dt As DataTable = dbfconn.GetOleDbSchemaTable(OleDbSchemaGuid.Tables, Nothing)
-         For i As Integer = 0 To dt.Rows.Count - 1
-             ' The third schema column is read as the table name.
-             Dim tablename As String = dt.Rows(i)(2).ToString().Trim()
-             Dim strSQL As String = "Select * from [" + tablename + "]"
-             Dim objDS As New DataSet()
-             ' The adapter reuses the open connection instead of opening one per sheet.
-             Using objDA As New System.Data.OleDb.OleDbDataAdapter(strSQL, dbfconn)
-                 objDA.Fill(objDS)
-             End Using
-             Dim sheet As DataTable = objDS.Tables(0)
-             For j As Integer = 0 To sheet.Rows.Count - 1
-                 For x As Integer = 0 To sheet.Columns.Count - 1
-                     buffer.Append(" ").Append(sheet.Rows(j)(x).ToString)
-                 Next
-             Next
-         Next
-     End Using
-     Return buffer.ToString()
- End Function
- ' Extracts the text of a PowerPoint presentation through PowerPoint automation.
- ' file: path of the .ppt/.pptx file.
- ' Returns the text of every shape that has text, each followed by one space so that
- ' words of neighbouring shapes are not fused into a single token.
- ' Exceptions raised by PowerPoint propagate to the caller after the presentation and
- ' the PowerPoint instance have been closed.
- Public Function PPTParse(ByVal file As String) As String
-     Dim buffer As New System.Text.StringBuilder()
-     Dim ppApp As New Microsoft.Office.Interop.PowerPoint.Application
-     Dim prsPres As Microsoft.Office.Interop.PowerPoint.Presentation = Nothing
-     Try
-         ' Arguments after the file name: ReadOnly, Untitled, WithWindow.
-         prsPres = ppApp.Presentations.Open(file, True, False, False)
-         For Each slide As Microsoft.Office.Interop.PowerPoint.Slide In prsPres.Slides
-             For Each shape As Microsoft.Office.Interop.PowerPoint.Shape In slide.Shapes
-                 ' HasTextFrame is checked first: TextFrame is not available on every shape type.
-                 If shape.HasTextFrame = Microsoft.Office.Core.MsoTriState.msoTrue AndAlso shape.TextFrame.HasText = Microsoft.Office.Core.MsoTriState.msoTrue Then
-                     buffer.Append(shape.TextFrame.TextRange.Text).Append(" ")
-                 End If
-             Next
-         Next
-     Finally
-         Try
-             If prsPres IsNot Nothing Then
-                 prsPres.Close()
-             End If
-         Finally
-             ppApp.Quit()
-         End Try
-     End Try
-     Return buffer.ToString()
- End Function
建立索引:
- ' Builds one index document per database row and adds it to _writer.
- ' Each document holds the concatenated table columns, the text of the attached file
- ' (when the file exists), the requisition number, both texts combined, and the file category.
- ' Exceptions propagate to the caller; the reader is closed in every case.
- Private Sub indexfile()
-     Dim myrea As SqlDataReader = F_getDataFromTable()
-     Try
-         display_indexinfo("start")
-         While myrea.Read()
-             Dim doc As New Lucene.Net.Documents.Document
-             ' Text taken from the table columns.
-             Dim Field_contents_str As String = myrea("Customer_Name").ToString + " " + myrea("Customer_Id").ToString + " " + myrea("Reject_Reason").ToString + " " + myrea("Remark").ToString + " " + myrea("Category").ToString
-             Field_contents_str = Field_contents_str + " " + myrea("AACATI_SO_No").ToString + " " + myrea("Vendor_Part_No").ToString + " " + myrea("Description").ToString + " " + myrea("AASC_Part_No").ToString + " " + myrea("Brand_Name").ToString
-             Dim Field_id As String = myrea("requisition_no").ToString
-             ' Text taken from the attached file. ToString keeps a NULL file name from
-             ' failing the String + Object concatenation.
-             Dim fullpath As String = "../Utilities/Files/" + myrea("filefullname").ToString
-             Dim Field_Datafile As String
-             If File.Exists(fullpath) Then
-                 Field_Datafile = FileParse(fullpath)
-             Else
-                 Field_Datafile = ""
-             End If
-             Dim Field_file_Catagory As String = myrea("file_category").ToString
-             doc.Add(New Lucene.Net.Documents.Field("Table_contents", Field_contents_str, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.TOKENIZED))
-             doc.Add(New Lucene.Net.Documents.Field("File_contents", Field_Datafile, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.TOKENIZED))
-             doc.Add(New Lucene.Net.Documents.Field("Requisition_no_id", Field_id, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.UN_TOKENIZED))
-             ' The space keeps the last table word and the first file word from being fused into one token.
-             doc.Add(New Lucene.Net.Documents.Field("FullContents", Field_contents_str + " " + Field_Datafile, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.TOKENIZED))
-             doc.Add(New Lucene.Net.Documents.Field("FileCatagory", Field_file_Catagory, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.TOKENIZED))
-             _writer.AddDocument(doc)
-         End While
-         display_indexinfo("finished")
-     Finally
-         ' Close the reader even when a row or an attachment fails.
-         myrea.Close()
-     End Try
- End Sub
显示索引中的相关信息,比如文件名称、路径等:
- ' Writes index information to the DataList text control.
- ' status: "start" clears the control, records the start time and prints the header;
- '         "finished" prints the end time and the elapsed time;
- '         any other value prints the name, directory, time and status of the index.
- ' Comparison of status is case-insensitive. When the "segments" file is missing from
- ' the index directory, only a "not built" message is printed, whatever the status.
- Private Sub display_indexinfo(ByVal status As String)
-     ' indexfile calls this from a worker thread; re-run on the thread that owns the control.
-     If DataList.InvokeRequired Then
-         DataList.Invoke(New Action(Of String)(AddressOf display_indexinfo), status)
-         Return
-     End If
-     ' Derived from _dir so that the check follows the configured index directory.
-     Dim segmentsPath As String = Path.Combine(_dir, "segments")
-     Dim display_index_status As String = "Finished"
-     If Not File.Exists(segmentsPath) Then
-         DataList.AppendText("Sorry ,it is not built index!!")
-     ElseIf status.ToLower = "start" Then
-         DataList.Clear()
-         startTime = Date.Now
-         DataList.AppendText("Index Name : " & _display_index_Name & vbCrLf & vbCrLf)
-         DataList.AppendText("Index Directory: " & _dir & vbCrLf & vbCrLf)
-         display_index_status = "in process"
-         DataList.AppendText("index Status : " & display_index_status & vbCrLf)
-         DataList.AppendText("Start Time : " & startTime.ToString & vbCrLf & vbCrLf)
-     ElseIf status.ToLower = "finished" Then
-         Dim Endtime As DateTime = Date.Now
-         Dim timespan As TimeSpan = Endtime - startTime
-         DataList.AppendText("End Time : " & Endtime.ToString & vbCrLf & vbCrLf)
-         DataList.AppendText("Span Time : " & timespan.ToString & vbCrLf & vbCrLf)
-         DataList.AppendText(" Finished :")
-     Else
-         ' The last access time of the segments file is shown as the index time.
-         Dim display_index_Time As Date = New FileInfo(segmentsPath).LastAccessTime
-         DataList.AppendText("Index Name : " & _display_index_Name & vbCrLf & vbCrLf)
-         DataList.AppendText("Index Directory: " & _dir & vbCrLf & vbCrLf)
-         DataList.AppendText("Index Time: " & display_index_Time & vbCrLf & vbCrLf)
-         DataList.AppendText("index Status : " & display_index_status & vbCrLf)
-     End If
-     DataList.Refresh()
- End Sub
优化索引:
- ' Completion callback of the asynchronous indexing run: optimizes the index, then closes
- ' the writer and the database connection.
- ' ar: async result whose AsyncState is the IndexWriter to finish.
- ' Any failure, including one raised by the indexing run itself, is reported in a message box.
- Public Sub optimize_index(ByVal ar As IAsyncResult)
-     Dim writer As IndexWriter = DirectCast(ar.AsyncState, IndexWriter)
-     Try
-         Try
-             ' EndInvoke re-raises an exception thrown by the asynchronous call; without it
-             ' a failed run would go unnoticed. TryCast keeps other IAsyncResult kinds working.
-             Dim completed As System.Runtime.Remoting.Messaging.AsyncResult = TryCast(ar, System.Runtime.Remoting.Messaging.AsyncResult)
-             If completed IsNot Nothing Then
-                 Dim worker As indexfile_delegate = TryCast(completed.AsyncDelegate, indexfile_delegate)
-                 If worker IsNot Nothing Then
-                     worker.EndInvoke(ar)
-                 End If
-             End If
-             ' Merge factor: the default is 10; a larger value (e.g. 100 or 1000) needs more memory.
-             ' A larger value is not always better: very large segments lower efficiency.
-             writer.SetMergeFactor(100)
-             writer.SetMaxMergeDocs(999999999)
-             ' Field length limit: the default is 10 000.
-             ' NOTE(review): this is set after the documents were added, so it presumably does
-             ' not affect them - confirm against the IndexWriter documentation.
-             writer.SetMaxFieldLength(100000)
-             writer.Optimize()
-         Finally
-             ' Release the writer and the connection even when indexing or optimizing failed.
-             Try
-                 writer.Close()
-             Finally
-                 _conn.Close()
-             End Try
-         End Try
-     Catch ex As Exception
-         MsgBox("index is not success: " + ex.Message)
-     End Try
- End Sub
单击button,开始建立索引:
- ' Click handler of the build button: stops the timer, creates the index writer and
- ' starts indexfile asynchronously; optimize_index runs as the completion callback and
- ' receives the writer as its state object.
- Private Sub btn_Builtindex_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles btn_Builtindex.Click
-     Timer1.Stop()
-     ' NOTE(review): the third argument is presumably the "create" flag, i.e. an existing
-     ' index in _dir is replaced - confirm.
-     _writer = New IndexWriter(_dir, _Analysis, True)
-     ' Raise the field length limit before any document is added; raising it only in the
-     ' completion callback comes after all documents have already been processed.
-     _writer.SetMaxFieldLength(100000)
-     ' Turns off the cross-thread access check for all controls, because the indexing
-     ' routine started below runs on another thread and writes to the form.
-     Control.CheckForIllegalCrossThreadCalls = False
-     Dim CallbackOptimize As AsyncCallback = AddressOf optimize_index
-     Dim indexfile_delegate_btn As New indexfile_delegate(AddressOf indexfile)
-     indexfile_delegate_btn.BeginInvoke(CallbackOptimize, _writer)
- End Sub