zoukankan      html  css  js  c++  java
  • Lucene.Net实现pdf,doc,xls,ppt,htm,html等格式文件的检索

    代码如下,代码没有优化,仅实现功能
    该代码复制到程序中不能直接使用,需要先下载文章最后的例子,取得其中的dll并在项目中引用后才可以使用

    using System;
    using System.Configuration;
    using System.Data;
    using System.Linq;
    using System.Web;
    using System.Web.Security;
    using System.Web.UI;
    using System.Web.UI.HtmlControls;
    using System.Web.UI.WebControls;
    using System.Web.UI.WebControls.WebParts;
    using System.Xml.Linq;
    using System.Text;
    using System.IO;

    using Lucene.Net.Documents;
    using Lucene.Net.Index;
    using Lucene.Net.Search;
    using Lucene.Net.QueryParsers;
    using Lucene.Net.Analysis.Standard;

    using Lucene.Net.Analysis.Cn;


    using org.pdfbox.pdmodel;
    using org.pdfbox.util;

    using System.Text.RegularExpressions;

    public partial class _Default : System.Web.UI.Page
    {
        
    // Time at which the current indexing run began; assigned in Button2_Click
    // and read back there to report the elapsed time.
    public DateTime start = default(DateTime);

    // Delegate with the same parameter list as IndexDirectory.
    // NOTE(review): nothing in the visible code instantiates it - confirm before removing.
    private delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);

    // Searcher opened (and closed) by Button1_Click for each search request.
    private IndexSearcher searcher;

        
    /// <summary>
    /// On the first (non-postback) request, pre-fills TextBox3 with the
    /// physical path that the virtual folder "doc" maps to.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (IsPostBack)
        {
            // Keep whatever the user typed on subsequent round-trips.
            return;
        }

        string defaultFolder = Server.MapPath("doc");
        TextBox3.Text = defaultFolder;
    }


        
    #region 建立索引
        
    /// <summary>
    /// Rebuilds the index: walks the folder entered in TextBox3, indexes every
    /// supported file into the "index" folder and reports the elapsed time in
    /// TextBox1. Any failure is reported in TextBox4.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string INDEX_STORE_PATH = Server.MapPath("index");  // folder the index is written to
        string INDEX_PATH = TextBox3.Text;                  // folder whose files are indexed

        try
        {
            // "true" = create a fresh index, replacing any existing one.
            IndexWriter writer = new IndexWriter(INDEX_STORE_PATH, new ChineseAnalyzer(), true);
            try
            {
                start = DateTime.Now;

                IndexDirectory(writer, new FileInfo(INDEX_PATH));
                writer.Optimize();
            }
            finally
            {
                // Always close the writer, even when indexing fails part-way.
                // The original skipped Close() on any exception, leaving the
                // writer (and the index it holds open) dangling for later requests.
                writer.Close();
            }

            TimeSpan s = DateTime.Now - start;

            TextBox1.Text = "提示:索引完成,共用时 " + s.TotalSeconds + " 秒\n";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
    }

        
    /// <summary>
    /// Recursively walks <paramref name="file"/>: a directory is expanded into
    /// its entries (files and sub-directories), while a file is passed to
    /// IndexFile when its extension is one of the supported types.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">Path to a directory or a file, wrapped in a FileInfo.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        string path = file.FullName;

        if (Directory.Exists(path))
        {
            // Directory: recurse into every child entry.
            foreach (string entry in Directory.GetFileSystemEntries(path))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }

            return;
        }

        // File: only the extensions below are indexed; everything else is skipped.
        switch (file.Extension.ToLower())
        {
            case ".txt":
            case ".htm":
            case ".html":
            case ".pdf":
            case ".doc":
            case ".rtf":
            case ".ppt":
            case ".xls":
                IndexFile(file, writer);
                break;
        }
    }

        
    /// <summary>
    /// Extracts the text of one file (chosen by extension) and adds it to the
    /// index as a document with a stored "filename" field and a tokenized,
    /// unstored "contents" field.
    /// </summary>
    /// <remarks>
    /// Changes relative to the earlier version:
    /// - HTML files are indexed from the tag-stripped text; previously the
    ///   stripped text was computed and then discarded in favour of the raw markup.
    /// - PDF documents, Office applications and the text reader are released in
    ///   finally/using blocks, so a failing file no longer leaks them.
    /// - Excel cells are read relative to the sheet's used range (which need not
    ///   start at A1), and cell/shape texts are separated by whitespace so that
    ///   neighbouring values do not merge into a single token.
    /// </remarks>
    /// <param name="file">File to index.</param>
    /// <param name="writer">Open index writer that receives the document.</param>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            switch (file.Extension.ToLower())
            {
                case ".pdf":
                    AddTextDocument(writer, file, ExtractPdfText(file.FullName));
                    break;

                case ".doc":
                case ".rtf":    // Word can open RTF files too, so both share one path
                    AddTextDocument(writer, file, ExtractWordText(file.FullName));
                    break;

                case ".ppt":
                    AddTextDocument(writer, file, ExtractPowerPointText(file.FullName));
                    break;

                case ".xls":
                    AddTextDocument(writer, file, ExtractExcelText(file.FullName));
                    break;

                case ".htm":
                case ".html":
                    // Encoding.Default matches the encoding the file was previously
                    // indexed with (it was read through a StreamReader using it).
                    string html = File.ReadAllText(file.FullName, System.Text.Encoding.Default);
                    AddTextDocument(writer, file, NoHTML(html));
                    break;

                default:        // anything else is treated as plain text
                    AddPlainTextFile(writer, file);
                    break;
            }
        }
        catch (FileNotFoundException fnfe)
        {
            // A missing file is reported and skipped; other errors propagate to the caller.
            TextBox4.Text = TextBox4.Text + fnfe.Message + "\n";
        }
    }

    // Adds one document whose "contents" field is the already-extracted text.
    private static void AddTextDocument(IndexWriter writer, FileInfo file, string contents)
    {
        Document doc = new Document();
        doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("contents", contents ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
    }

    // Adds one document whose "contents" field is streamed straight from the file.
    private static void AddPlainTextFile(IndexWriter writer, FileInfo file)
    {
        // The reader must stay open until AddDocument has consumed it, then be disposed.
        using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.Default))
        {
            Document doc = new Document();
            doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("contents", reader));
            writer.AddDocument(doc);
        }
    }

    // Returns the text of a PDF file via PDFBox, closing the document afterwards.
    private static string ExtractPdfText(string path)
    {
        PDDocument pddoc = PDDocument.load(path);
        try
        {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pddoc);
        }
        finally
        {
            pddoc.close();
        }
    }

    // Returns the full text of a Word-readable file (.doc/.rtf) via Word automation.
    private static string ExtractWordText(string path)
    {
        object filePath = path;
        object nullobj = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Word.Document docdoc = wordApp.Documents.Open(
                ref filePath, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj);
            try
            {
                // Select the whole document and read the selection's text.
                docdoc.ActiveWindow.Selection.WholeStory();
                return docdoc.ActiveWindow.Selection.Text ?? string.Empty;
            }
            finally
            {
                docdoc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
        }
        finally
        {
            // Quit even on failure so no Word process is left running.
            wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
        }
    }

    // Returns the text of every shape on every slide of a PowerPoint file.
    private static string ExtractPowerPointText(string path)
    {
        StringBuilder text = new StringBuilder();

        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        try
        {
            PowerPoint.Presentation pptPre = pptApp.Presentations.Open(path,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);
            try
            {
                foreach (PowerPoint.Slide slide in pptPre.Slides)
                {
                    foreach (PowerPoint.Shape shape in slide.Shapes)
                    {
                        try
                        {
                            string shapeText = shape.TextFrame.TextRange.Text;
                            text.Append(shapeText).Append('\n');
                        }
                        catch
                        {
                            // Deliberate best-effort: reading the text of a shape
                            // can fail, and such shapes are simply skipped.
                        }
                    }
                }
            }
            finally
            {
                pptPre.Close();
            }
        }
        finally
        {
            pptApp.Quit();
        }

        return text.ToString();
    }

    // Returns the values of all used cells on all worksheets of an Excel file.
    private static string ExtractExcelText(string path)
    {
        StringBuilder text = new StringBuilder();
        object nullobj = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
                nullobj, nullobj, nullobj, nullobj,
                nullobj, nullobj, nullobj, nullobj,
                nullobj, nullobj, nullobj, nullobj);
            try
            {
                int sheetCount = xBook.Sheets.Count;
                for (int i = 1; i <= sheetCount; i++)   // Office collections are 1-based
                {
                    Microsoft.Office.Interop.Excel.Worksheet xSheet =
                        xBook.Sheets[i] as Microsoft.Office.Interop.Excel.Worksheet;
                    if (xSheet == null)
                    {
                        continue;   // not a worksheet (e.g. a chart sheet)
                    }

                    // Index cells relative to the used range itself: it does not
                    // necessarily start at A1, so sheet-level Cells[1..rows, 1..cols]
                    // would read the wrong region.
                    Microsoft.Office.Interop.Excel.Range used = xSheet.UsedRange;
                    int rcount = used.Rows.Count;
                    int ccount = used.Columns.Count;

                    for (int m = 1; m <= rcount; m++)
                    {
                        for (int n = 1; n <= ccount; n++)
                        {
                            object value = ((Microsoft.Office.Interop.Excel.Range)used.Cells[m, n]).Value2;
                            if (value != null)
                            {
                                text.Append(value).Append(' ');
                            }
                        }
                    }
                }
            }
            finally
            {
                xBook.Close(nullobj, nullobj, nullobj);
            }
        }
        finally
        {
            xApp.Quit();
        }

        return text.ToString();
    }

        
    /// <summary>
    /// Strips scripts, comments and tags from an HTML string, decodes a handful
    /// of common entities, removes stray angle brackets and line breaks, and
    /// returns the result HTML-encoded and trimmed.
    /// </summary>
    /// <remarks>
    /// Changes relative to the earlier version:
    /// - The final "&lt;", "&gt;" and CR/LF removals now take effect; their results
    ///   used to be discarded because strings are immutable.
    /// - Script blocks spanning several lines are removed (Singleline), and
    ///   complete comments are removed before tag stripping so that a "&gt;"
    ///   inside a comment no longer leaks comment text.
    /// - Encoding uses HttpUtility.HtmlEncode directly, so the method no longer
    ///   requires a current HttpContext; null/empty input yields an empty string.
    /// </remarks>
    /// <param name="Htmlstring">HTML markup; may be null or empty.</param>
    /// <returns>The text content, HTML-encoded and trimmed.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
        const RegexOptions acrossLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        string text = Htmlstring;

        // Remove script blocks, including ones that span multiple lines.
        text = Regex.Replace(text, @"<script[^>]*?>.*?</script>", "", acrossLines);

        // Remove complete comments first; the generic tag pattern below stops
        // at the first '>' and would otherwise leave part of the comment behind.
        text = Regex.Replace(text, @"<!--.*?-->", "", acrossLines);

        // Remove HTML tags.
        text = Regex.Replace(text, @"<(.[^>]*)>", "", ignoreCase);
        text = Regex.Replace(text, @"([\r\n])[\s]+", "", ignoreCase);

        // Clean up leftovers of unbalanced comment markers.
        text = Regex.Replace(text, @"-->", "", ignoreCase);
        text = Regex.Replace(text, @"<!--.*", "", ignoreCase);

        // Decode common entities (named and numeric forms).
        text = Regex.Replace(text, @"&(quot|#34);", "\"", ignoreCase);
        text = Regex.Replace(text, @"&(amp|#38);", "&", ignoreCase);
        text = Regex.Replace(text, @"&(lt|#60);", "<", ignoreCase);
        text = Regex.Replace(text, @"&(gt|#62);", ">", ignoreCase);
        text = Regex.Replace(text, @"&(nbsp|#160);", " ", ignoreCase);
        text = Regex.Replace(text, @"&(iexcl|#161);", "\u00A1", ignoreCase);
        text = Regex.Replace(text, @"&(cent|#162);", "\u00A2", ignoreCase);
        text = Regex.Replace(text, @"&(pound|#163);", "\u00A3", ignoreCase);
        text = Regex.Replace(text, @"&(copy|#169);", "\u00A9", ignoreCase);

        // Drop any remaining numeric character references.
        text = Regex.Replace(text, @"&#(\d+);", "", ignoreCase);

        // Remove stray angle brackets and line breaks.
        text = text.Replace("<", "");
        text = text.Replace(">", "");
        text = text.Replace("\r\n", "");

        return HttpUtility.HtmlEncode(text).Trim();
    }
        
    #endregion

        
    #region 搜索
        
    /// <summary>
    /// Searches the "contents" field of the index for the query typed into
    /// TextBox2 and shows the matching file paths via printResult. Any failure
    /// is appended to TextBox4.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string INDEX_STORE_PATH = Server.MapPath("index");  // folder the index is stored in
        string KEYWORD = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(INDEX_STORE_PATH);
            try
            {
                QueryParser q = new QueryParser("contents", new ChineseAnalyzer());
                Query query = q.Parse(KEYWORD);

                Hits hits = searcher.Search(query);
                printResult(hits);
            }
            finally
            {
                // Close the searcher even when parsing or searching throws
                // (for example on a malformed query); the original leaked it.
                searcher.Close();
            }
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
    }

        
    /// <summary>
    /// Writes the search hits to TextBox1: one line per hit with its 1-based
    /// rank and stored file path, or a "nothing found" message when there are
    /// no hits, followed by a separator line. A hit that cannot be read is
    /// skipped and its error message is appended to TextBox4.
    /// </summary>
    /// <param name="h">Hits returned by the searcher.</param>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int count = h.Length();

        if (count == 0)
        {
            output.Append("对不起,没有搜索到你要的结果。\n");
        }
        else
        {
            for (int i = 0; i < count; i++)
            {
                try
                {
                    // Resolve the path before appending so that a failing hit
                    // contributes nothing to the output.
                    Document doc = h.Doc(i);
                    string path = doc.Get("filename");

                    output.Append("这是第").Append(i + 1)
                          .Append("个搜索结果,文件路径为: ").Append(path).Append("\n");
                }
                catch (Exception ex)
                {
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }

        output.Append("---------------------------\n");
        TextBox1.Text = output.ToString();
    }

        
    #endregion

    }


    完整demo下载,点击下载

  • 相关阅读:
    git 常用命令
    centos 7 mini 安装
    python打印杨辉三角
    python 求100内的素数/质数
    字符串与bytes
    format
    Python字符串格式化
    数据结构
    ARM工作模式
    C语言实现字符串逆序输出
  • 原文地址:https://www.cnblogs.com/weekzero/p/1217521.html
Copyright © 2011-2022 走看看