调用方法:
// Usage sample: validate the two form inputs, run the search, and build a summary text.
FSearch searcher = new FSearch();
List<string> o_Rtn = new List<string>();

// Folder to scan, taken from the first text box.
string folderInput = textBox1.Text;
// Keyword to look for, taken from the second text box.
string keywordInput = textBox2.Text;

if (string.IsNullOrEmpty(folderInput))
{
    MessageBox.Show("请选择文件路径!");
    return;
}

if (string.IsNullOrEmpty(keywordInput))
{
    MessageBox.Show("请输入搜索关键字!");
    return;
}

// The searcher replaces o_Rtn with the list of matching file paths.
searcher.SelectFile(folderInput, keywordInput, ref o_Rtn);

// Summary text containing the number of matching documents.
string Msg = string.Format("查询到符合条件的文档份数:{0}", o_Rtn.Count);
代码
/// <summary>
/// Recursively searches a folder tree for files whose name or content contains a keyword.
/// Content is inspected for Excel workbooks (HSSFWorkbook / XSSFWorkbook), PDF documents
/// (PdfDocument), Word documents (through the MSWord interop alias) and plain-text files.
/// PowerPoint files are matched by file name only.
/// </summary>
public class FSearch
{
    /// <summary>
    /// Progress text describing the file currently being inspected.
    /// It is only written by this class; nothing in this class reads it.
    /// </summary>
    string _statusMessage = string.Empty;

    /// <summary>
    /// Paths of the files matched by the search that is currently running (or ran last).
    /// </summary>
    List<string> _matches = new List<string>();

    /// <summary>
    /// Default list of file extensions that are searched.
    /// </summary>
    string _defaultExtensions = ".DOC.DOCX.XLS.XLSX.PPT.PPTX.PDF.HTML.HTM.TXT";

    /// <summary>
    /// Characters that may separate the entries of an extension list.
    /// </summary>
    private static readonly char[] ExtensionSeparators = { '.', ',', ';', '|', ' ', '*' };

    /// <summary>
    /// Logger for this class.
    /// </summary>
    private static readonly ILog log = LogManager.GetLogger(typeof(FSearch));

    /// <summary>
    /// Searches <paramref name="i_foldPath"/> and its non-hidden sub-folders for files whose
    /// name or content contains <paramref name="i_searckkey"/>.
    /// </summary>
    /// <param name="i_foldPath">Root folder to search.</param>
    /// <param name="i_searckkey">Keyword to look for (case-sensitive).</param>
    /// <param name="o_Rtn">
    /// Receives a new list with the paths of all matching files. It is left untouched when an
    /// argument is empty or when the search fails with an exception (the failure is logged).
    /// </param>
    public void SelectFile(string i_foldPath, string i_searckkey, ref List<string> o_Rtn)
    {
        try
        {
            if (string.IsNullOrEmpty(i_foldPath))
            {
                log.Error("请选择文件路径!");
                return;
            }

            if (string.IsNullOrEmpty(i_searckkey))
            {
                log.Error("请输入搜索关键字!");
                return;
            }

            _matches.Clear();
            listDirectory(i_foldPath, _defaultExtensions, string.Empty, i_searckkey);

            // Hand out a copy: returning the internal list would let the next search
            // clear a result the caller is still holding.
            o_Rtn = new List<string>(_matches);
            _statusMessage = "";
            log.Info("搜索完毕");
        }
        catch (Exception err)
        {
            // Pass the exception object as well so the stack trace reaches the log.
            log.Error(err.Message, err);
        }
    }

    /// <summary>
    /// Inspects the files directly inside <paramref name="path"/> and then recurses into every
    /// sub-folder that is not hidden. Matching file paths are collected inside this instance.
    /// A folder that cannot be read because of missing permissions is logged and skipped.
    /// </summary>
    /// <param name="path">Folder to search.</param>
    /// <param name="extension">
    /// List of extensions to inspect, e.g. ".DOC.DOCX.TXT". Null or empty means every file.
    /// </param>
    /// <param name="coding">Not used by the search; only passed on to recursive calls.</param>
    /// <param name="searckkey">Keyword to look for (case-sensitive).</param>
    public void listDirectory(string path, string extension, string coding, string searckkey)
    {
        DirectoryInfo theFolder = new DirectoryInfo(path);

        FileInfo[] files;
        DirectoryInfo[] subFolders;
        try
        {
            files = theFolder.GetFiles();
            subFolders = theFolder.GetDirectories();
        }
        catch (UnauthorizedAccessException ex)
        {
            // Reading the folder is the only reliable accessibility test; an unreadable
            // folder must not abort the search of its siblings.
            log.WarnFormat("【{0}】目录无法访问:{1}", path, ex.Message);
            return;
        }

        foreach (FileInfo file in files)
        {
            if (!IsExtensionAllowed(extension, file.Extension))
            {
                continue;
            }

            string fullPath = Path.Combine(path, file.Name);
            _statusMessage = "正在搜索文件:" + fullPath;

            // Pumps pending window messages; presumably keeps a UI responsive when the
            // search runs on the UI thread - TODO confirm with the caller.
            Application.DoEvents();

            // A hit in the file name makes opening the file unnecessary.
            if (file.Name.Contains(searckkey))
            {
                _matches.Add(fullPath);
                continue;
            }

            try
            {
                if (FileContentContains(file.Extension, fullPath, searckkey))
                {
                    _matches.Add(fullPath);
                }
            }
            catch (Exception ex)
            {
                // One unreadable or corrupt document must not stop the whole search.
                log.ErrorFormat("【{0}】文档解析异常:{1}", fullPath, ex.Message);
            }
        }

        foreach (DirectoryInfo nextFolder in subFolders)
        {
            if ((nextFolder.Attributes & FileAttributes.Hidden) != FileAttributes.Hidden)
            {
                listDirectory(nextFolder.FullName, extension, coding, searckkey);
            }
        }
    }

    /// <summary>
    /// Tells whether <paramref name="fileExtension"/> is one of the entries of
    /// <paramref name="extensionList"/>. Entries are compared as whole tokens and without
    /// regard to case, so ".D" does not match ".DOC" and files without an extension only
    /// pass when the list is empty.
    /// </summary>
    private static bool IsExtensionAllowed(string extensionList, string fileExtension)
    {
        if (string.IsNullOrEmpty(extensionList))
        {
            return true;
        }

        if (string.IsNullOrEmpty(fileExtension))
        {
            return false;
        }

        string wanted = fileExtension.TrimStart('.');
        string[] allowed = extensionList.Split(ExtensionSeparators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string candidate in allowed)
        {
            if (string.Equals(candidate, wanted, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Chooses the content reader by file extension and reports whether the content
    /// contains the keyword.
    /// </summary>
    private bool FileContentContains(string fileExtension, string fullPath, string searckkey)
    {
        string upperExtension = fileExtension.ToUpperInvariant();

        if (upperExtension.StartsWith(".XLS", StringComparison.Ordinal))
        {
            return ExcelContains(fullPath, upperExtension, searckkey);
        }

        if (upperExtension.StartsWith(".PPT", StringComparison.Ordinal))
        {
            // Content search is not implemented for PowerPoint; only the file name is matched.
            return false;
        }

        if (upperExtension.StartsWith(".PDF", StringComparison.Ordinal))
        {
            return PdfContains(fullPath, searckkey);
        }

        if (upperExtension.StartsWith(".DOC", StringComparison.Ordinal))
        {
            return WordContains(fullPath, searckkey);
        }

        return TextContains(fullPath, searckkey);
    }

    /// <summary>
    /// Looks for the keyword in the cells of every sheet of an Excel workbook.
    /// Only ".XLS" and ".XLSX" are opened; other ".XLS*" variants are not inspected.
    /// </summary>
    private static bool ExcelContains(string fullPath, string upperExtension, string searckkey)
    {
        IWorkbook workbook;
        using (FileStream fs = new FileStream(fullPath, FileMode.Open, FileAccess.Read))
        {
            // The extension was upper-cased by the caller, so the comparison works for
            // "report.XLS" as well as "report.xls".
            if (upperExtension == ".XLS")
            {
                workbook = new HSSFWorkbook(fs);
            }
            else if (upperExtension == ".XLSX")
            {
                workbook = new XSSFWorkbook(fs);
            }
            else
            {
                return false;
            }
        }

        for (int sheetIndex = 0; sheetIndex < workbook.NumberOfSheets; sheetIndex++)
        {
            ISheet sheet = workbook.GetSheetAt(sheetIndex);
            for (int rowIndex = sheet.FirstRowNum; rowIndex <= sheet.LastRowNum; rowIndex++)
            {
                IRow row = sheet.GetRow(rowIndex);
                if (row == null)
                {
                    continue;
                }

                for (int cellIndex = row.FirstCellNum; cellIndex < row.LastCellNum; cellIndex++)
                {
                    ICell cell = row.GetCell(cellIndex);
                    if (cell != null && cell.ToString().Contains(searckkey))
                    {
                        return true;
                    }
                }
            }
        }

        return false;
    }

    /// <summary>
    /// Looks for the keyword in the extracted text of each page of a PDF document and
    /// stops at the first page that contains it.
    /// </summary>
    private static bool PdfContains(string fullPath, string searckkey)
    {
        PdfDocument pdf = new PdfDocument();
        try
        {
            pdf.LoadFromFile(fullPath);
            foreach (PdfPageBase page in pdf.Pages)
            {
                string content = page.ExtractText();
                if (content != null && content.Contains(searckkey))
                {
                    return true;
                }
            }

            return false;
        }
        finally
        {
            // Setting the reference to null does not release the file; close explicitly.
            pdf.Close();
        }
    }

    /// <summary>
    /// Opens a document with Word and looks for the keyword in its text.
    /// All whitespace is stripped from the text before comparing, so a keyword that itself
    /// contains whitespace will not match.
    /// </summary>
    private static bool WordContains(string fullPath, string searckkey)
    {
        MSWord.Application app = new MSWord.Application();
        MSWord.Document doc = null;
        try
        {
            object target = fullPath;
            doc = app.Documents.Open(ref target);
            string text = Regex.Replace(doc.Content.Text, @"(\a|\t|\n|\s+)", "");
            return text.Contains(searckkey);
        }
        finally
        {
            // Always shut Word down, otherwise each failed document leaves a Word
            // process running in the background.
            try
            {
                if (doc != null)
                {
                    doc.Close();
                }
            }
            finally
            {
                app.Quit();
            }
        }
    }

    /// <summary>
    /// Reads a file as text, line by line, using the encoding reported by
    /// <see cref="GetType(string)"/>, and looks for the keyword.
    /// </summary>
    private bool TextContains(string fullPath, string searckkey)
    {
        Encoding encoding = GetType(fullPath);
        using (FileStream fs = new FileStream(fullPath, FileMode.Open, FileAccess.Read))
        using (StreamReader reader = new StreamReader(fs, encoding))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Contains(searckkey))
                {
                    return true;
                }
            }
        }

        return false;
    }

    #region Shared helpers

    /// <summary>
    /// Guesses the text encoding of the file at <paramref name="FILE_NAME"/>.
    /// </summary>
    /// <param name="FILE_NAME">Path of the file to inspect.</param>
    /// <returns>See <see cref="GetType(FileStream)"/>.</returns>
    public System.Text.Encoding GetType(string FILE_NAME)
    {
        using (FileStream fs = new FileStream(FILE_NAME, FileMode.Open, FileAccess.Read))
        {
            return GetType(fs);
        }
    }

    /// <summary>
    /// Guesses the text encoding of a file stream. A byte order mark decides first; without
    /// one, content that is entirely well-formed UTF-8 is reported as UTF-8; anything else is
    /// reported as the system default encoding. The stream is read in chunks from its current
    /// position and is closed before this method returns.
    /// </summary>
    /// <param name="fs">Stream to inspect; closed by this method.</param>
    /// <returns>
    /// <see cref="Encoding.UTF8"/>, <see cref="Encoding.Unicode"/>,
    /// <see cref="Encoding.BigEndianUnicode"/> or <see cref="Encoding.Default"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    public System.Text.Encoding GetType(FileStream fs)
    {
        if (fs == null)
        {
            throw new ArgumentNullException("fs");
        }

        try
        {
            // Read at most three bytes for the byte order mark; short files yield fewer.
            byte[] head = new byte[3];
            int headLength = 0;
            int read;
            while (headLength < head.Length && (read = fs.Read(head, headLength, head.Length - headLength)) > 0)
            {
                headLength += read;
            }

            if (headLength >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
            {
                return Encoding.UTF8;
            }

            if (headLength >= 2 && head[0] == 0xFE && head[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode;
            }

            if (headLength >= 2 && head[0] == 0xFF && head[1] == 0xFE)
            {
                return Encoding.Unicode;
            }

            // No byte order mark: validate the whole content as UTF-8, chunk by chunk,
            // carrying the number of outstanding continuation bytes across chunks.
            int pendingContinuationBytes = 0;
            if (!ScanUtf8(head, headLength, ref pendingContinuationBytes))
            {
                return Encoding.Default;
            }

            byte[] chunk = new byte[8192];
            while ((read = fs.Read(chunk, 0, chunk.Length)) > 0)
            {
                if (!ScanUtf8(chunk, read, ref pendingContinuationBytes))
                {
                    return Encoding.Default;
                }
            }

            // Content that ends in the middle of a multi-byte sequence is not valid UTF-8.
            return pendingContinuationBytes == 0 ? Encoding.UTF8 : Encoding.Default;
        }
        finally
        {
            // Existing callers rely on the stream being closed here.
            fs.Close();
        }
    }

    /// <summary>
    /// Feeds <paramref name="count"/> bytes of <paramref name="data"/> to a UTF-8 shape check.
    /// </summary>
    /// <param name="data">Buffer holding the bytes.</param>
    /// <param name="count">Number of valid bytes at the start of the buffer.</param>
    /// <param name="pendingContinuationBytes">
    /// Continuation bytes still expected for the sequence in progress; 0 when the next byte
    /// starts a new character. Carried from one call to the next.
    /// </param>
    /// <returns>False as soon as a byte cannot be part of a UTF-8 sequence.</returns>
    private static bool ScanUtf8(byte[] data, int count, ref int pendingContinuationBytes)
    {
        for (int i = 0; i < count; i++)
        {
            byte current = data[i];

            if (pendingContinuationBytes > 0)
            {
                // Continuation bytes have the form 10xxxxxx.
                if ((current & 0xC0) != 0x80)
                {
                    return false;
                }

                pendingContinuationBytes--;
                continue;
            }

            if (current < 0x80)
            {
                continue;
            }

            // The number of leading 1 bits of a lead byte is the length of its sequence.
            int leadingOnes = 0;
            for (int mask = 0x80; mask != 0 && (current & mask) != 0; mask >>= 1)
            {
                leadingOnes++;
            }

            // One leading 1 is a stray continuation byte; more than six is never valid.
            if (leadingOnes < 2 || leadingOnes > 6)
            {
                return false;
            }

            pendingContinuationBytes = leadingOnes - 1;
        }

        return true;
    }

    #endregion
}