1.引用读取PDF文件组件
FontBox-0.1.0-dev.dll
IKVM.GNU.Classpath.dll
IKVM.Runtime.dll
PDFBox-0.7.3.dll
2.添加 Office 组件引用(Microsoft.Office.Interop.Word、Microsoft.Office.Interop.Excel,用于读取 Word/Excel 文件内容),添加方法比较常规,此处略过
3.添加盘古分词
PanGu.dll
PanGu.HighLight.dll
PanGu.Lucene.Analyzer.dll
4.添加Lucene.net 引用
Lucene.Net.dll
5.创建索引库
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
#region 同步资料到索引库
// Resources waiting to be written to the index.
// Queue<T> is not thread-safe: the timer callback enqueues while the background
// worker dequeues, so EVERY access below happens inside lock (TaskQueue).
static Queue<ResourcesModel> TaskQueue = new Queue<ResourcesModel>();

/// <summary>
/// Timer callback: loads the not-yet-indexed resources into <c>TaskQueue</c>
/// and logs the outcome. Exceptions are logged, never propagated to the timer.
/// </summary>
private void tmResSync_Tick(object sender, EventArgs e)
{
    try
    {
        InitTaskQueue();
        LogHelper.writeLog("WinFrom【同步数据索引库读取数据到队列】成功!!");
    }
    catch (Exception ex)
    {
        LogHelper.writeErrLog("WinFrom【同步数据索引库读取数据到队列】:" + ex.Message);
    }
}

/// <summary>
/// Starts the background worker thread that drains <c>TaskQueue</c>.
/// </summary>
public void ServiceStart()
{
    Thread TaskThread = new Thread(new ThreadStart(ThreadInvoke));
    TaskThread.IsBackground = true;
    TaskThread.Start();
}

/// <summary>
/// Worker loop: takes one queued resource at a time and indexes it; sleeps for
/// one second whenever the queue is empty. Never returns.
/// </summary>
public void ThreadInvoke()
{
    while (true)
    {
        try
        {
            ResourcesModel res = null;
            bool hasItem = false;

            // Check-and-dequeue must be one atomic step; reading Count outside the
            // lock races with Enqueue running on the timer thread.
            lock (TaskQueue)
            {
                if (TaskQueue.Count > 0)
                {
                    res = TaskQueue.Dequeue();
                    hasItem = true;
                }
            }

            if (hasItem)
            {
                // Indexing is slow (file parsing + index write), so it runs outside the lock.
                new CreateResIndex().CreateIndex(res);
            }
            else
            {
                Thread.Sleep(1000);
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("WinFrom【同步数据索引库出错】:" + ex.ToString());
        }
    }
}

/// <summary>
/// Queries the rows that have an upload time and are flagged IsIndex=0, converts each
/// row into a <c>ResourcesModel</c> and enqueues it.
/// </summary>
/// <remarks>
/// NOTE(review): a row stays IsIndex=0 until the worker has indexed it, so a row that is
/// still queued when the next tick fires is enqueued again. Confirm the timer interval
/// makes this acceptable, or de-duplicate by ID.
/// </remarks>
public void InitTaskQueue()
{
    var query = new CreateResIndex().Get_View_CreateResIndex(" and uploadTime is not null and IsIndex=0 ");
    if (query == null)
    {
        return;
    }

    for (int i = 0; i < query.Rows.Count; i++)
    {
        var row = query.Rows[i];
        var model = new ResourcesModel();

        model.ID = row["ID"].ToString();
        model.FileName = CellToText(row["FileName"]);
        model.FilePath = CellToText(row["FilePath"]);
        model.CreaetBy = CellToText(row["UserName"]);
        model.Types = CellToText(row["Name"]);
        model.TypeId = CellToText(row["Type"]);
        model.SimpleDesc = CellToText(row["SimpleDesc"]);
        model.Title = CellToText(row["Title"]);
        model.Tags = CellToText(row["Tag"]);

        // OP: "0" = delete from the index (row is flagged IsDel), "1" = add to the index.
        // An empty IsDel cell counts as "not deleted".
        string isDel = CellToText(row["IsDel"]);
        model.OP = (isDel != "" && Convert.ToBoolean(isDel)) ? "0" : "1";

        string uploadTime = CellToText(row["uploadTime"]);
        model.UploadTime = uploadTime != ""
            ? Convert.ToDateTime(row["uploadTime"]).ToString("yyyy-MM-dd")
            : "";

        lock (TaskQueue)
        {
            TaskQueue.Enqueue(model);
        }
    }
}

/// <summary>
/// Converts a data-row cell to text; a null reference becomes the empty string.
/// </summary>
private static string CellToText(object value)
{
    return value != null ? value.ToString() : "";
}
#endregion
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
#region ResourcesModel
/// <summary>
/// Plain data carrier describing one resource record that is to be added to,
/// or removed from, the search index. Every value is kept as a string.
/// </summary>
public class ResourcesModel
{
    // ---- identity / operation ----

    /// <summary>Record identifier.</summary>
    public string ID { get; set; }

    /// <summary>Operation marker: delete = 0, add = 1, modify = 2.</summary>
    public string OP { get; set; }

    // ---- descriptive metadata ----

    /// <summary>Title.</summary>
    public string Title { get; set; }

    /// <summary>Tags.</summary>
    public string Tags { get; set; }

    /// <summary>Creator.</summary>
    public string CreaetBy { get; set; }

    /// <summary>Upload time.</summary>
    public string UploadTime { get; set; }

    /// <summary>Category.</summary>
    public string Types { get; set; }

    /// <summary>Category id.</summary>
    public string TypeId { get; set; }

    /// <summary>Short description.</summary>
    public string SimpleDesc { get; set; }

    /// <summary>Content.</summary>
    public string ContextDesc { get; set; }

    // ---- attached files ----

    /// <summary>File path.</summary>
    public string FilePath { get; set; }

    /// <summary>File name.</summary>
    public string FileName { get; set; }
}
#endregion
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
#region 读取文件
/// <summary>
/// Extracts plain text from resource files (Word, Excel, PDF, plain text) so that the
/// text can be fed to the indexer.
/// </summary>
public class ReadFilesTxt
{
    /// <summary>
    /// Reads a text file using the system default encoding. If the decoded text contains
    /// the marker "GB2312" (case-insensitive), the file is read again as GB2312.
    /// </summary>
    /// <param name="path">Full path of the text file.</param>
    /// <returns>The file content.</returns>
    public string ResumeTxt(string path)
    {
        string str;
        using (StreamReader reader = new StreamReader(path, System.Text.Encoding.Default))
        {
            str = reader.ReadToEnd();
        }

        // A "GB2312" marker inside the text (e.g. a charset declaration) is taken as a hint
        // that the file is GB2312-encoded, so decode it again with that encoding.
        if (System.Text.RegularExpressions.Regex.IsMatch(str, "GB2312", RegexOptions.IgnoreCase))
        {
            using (StreamReader gbReader = new StreamReader(path, System.Text.Encoding.GetEncoding("GB2312")))
            {
                str = gbReader.ReadToEnd();
            }
        }
        return str;
    }

    /// <summary>
    /// Opens a Word document read-only through Office interop and returns its text.
    /// The document is closed and Word is quit even when extraction fails.
    /// </summary>
    private string ResumeWord(string path)
    {
        object missing = System.Reflection.Missing.Value;
        object readOnly = true;
        object docPathp = path;
        Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application();
        try
        {
            Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(ref docPathp,
                ref missing, ref readOnly, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing);
            try
            {
                return wordDoc.Content.Text;
            }
            finally
            {
                wordDoc.Close(ref missing, ref missing, ref missing);
            }
        }
        finally
        {
            // Without this, a failed Open/read leaves an orphan WINWORD process behind.
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }

    /// <summary>
    /// Opens an Excel workbook read-only through Office interop and concatenates the
    /// values of the cells of every sheet. The workbook is closed and Excel is quit even
    /// when extraction fails.
    /// </summary>
    private string ResumeExcel(string path)
    {
        StringBuilder text = new StringBuilder();
        object readOnly = true;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.Application();
        try
        {
            xApp.Visible = false;
            Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
                missing, readOnly, missing, missing,
                missing, missing, missing, missing,
                missing, missing, missing, missing);
            try
            {
                var count = xBook.Sheets.Count;
                for (int k = 0; k < count; k++)
                {
                    // Interop collections are 1-based.
                    Microsoft.Office.Interop.Excel.Worksheet xSheet =
                        (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[k + 1];
                    var rcount = xSheet.UsedRange.Rows.Count;
                    var ccount = xSheet.UsedRange.Columns.Count;

                    // NOTE(review): cells are addressed from A1 while the row/column counts
                    // come from UsedRange; presumably sheets start at A1 — confirm.
                    for (int m = 0; m < rcount; m++)
                    {
                        for (int n = 0; n < ccount; n++)
                        {
                            object value = ((Microsoft.Office.Interop.Excel.Range)xSheet.Cells[m + 1, n + 1]).Value2;
                            text.Append(value); // a null (empty) cell appends nothing
                        }
                    }
                }
            }
            finally
            {
                xBook.Close(missing, missing, missing);
            }
        }
        finally
        {
            // Without this, a failed open/read leaves an orphan EXCEL process behind.
            xApp.Quit();
        }

        return text.ToString();
    }

    /// <summary>
    /// Extracts the text of a PDF file with PDFBox. The document is always closed.
    /// </summary>
    /// <param name="path">Full path of the PDF file.</param>
    public string ResumePDF(string path)
    {
        org.pdfbox.pdmodel.PDDocument doc = org.pdfbox.pdmodel.PDDocument.load(path);
        try
        {
            org.pdfbox.util.PDFTextStripper pdfStripper = new org.pdfbox.util.PDFTextStripper();
            return pdfStripper.getText(doc);
        }
        finally
        {
            doc.close();
        }
    }

    /// <summary>
    /// Reads every file listed in <paramref name="path"/> and concatenates the extracted text.
    /// </summary>
    /// <param name="ResourceRoute">Root folder that is prepended to every relative path.</param>
    /// <param name="path">Relative file paths separated by ';'.</param>
    /// <returns>
    /// The text extracted so far. A failure is logged and stops processing of the remaining
    /// files; the text gathered before the failure is still returned.
    /// </returns>
    public string GetReadContext(string ResourceRoute, string path)
    {
        StringBuilder sb = new StringBuilder();

        try
        {
            if (!string.IsNullOrEmpty(path))
            {
                foreach (string lpath in path.Split(';'))
                {
                    if (!string.IsNullOrEmpty(lpath))
                    {
                        sb.Append(ReadBySuffix(ResourceRoute, lpath));
                    }
                }
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog( "【读取文件出错:文件名称:" + path + " 】 错误消息:" + ex.Message.ToString());
        }

        return sb.ToString();
    }

    /// <summary>
    /// Reads a single file and returns the extracted text.
    /// </summary>
    /// <param name="ResourceRoute">Root folder that is prepended to the relative path.</param>
    /// <param name="lpath">Relative path of the file.</param>
    /// <returns>The extracted text, or an empty string for an unsupported type or on failure (which is logged).</returns>
    public string GetReadContextSingle(string ResourceRoute, string lpath)
    {
        StringBuilder sb = new StringBuilder();
        try
        {
            if (!string.IsNullOrEmpty(lpath))
            {
                sb.Append(ReadBySuffix(ResourceRoute, lpath));
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeErrLog("【读取文件出错:文件名称:" + ResourceRoute + lpath + " 】 错误消息:" + ex.Message.ToString());
        }

        return sb.ToString();
    }

    /// <summary>
    /// Picks the extractor that matches the file suffix (doc/docx, xls/xlsx, pdf, txt;
    /// case-insensitive) and returns the extracted text. Unsupported suffixes yield "".
    /// </summary>
    private string ReadBySuffix(string ResourceRoute, string lpath)
    {
        // Everything after the last '.'; when there is no '.', LastIndexOf gives -1 and the
        // whole string is taken, which simply matches no known suffix.
        string suffix = lpath.Substring(lpath.LastIndexOf('.') + 1).ToLowerInvariant();
        string fullPath = ResourceRoute + lpath;

        switch (suffix)
        {
            case "doc":
            case "docx":
                return ResumeWord(fullPath);
            case "xls":
            case "xlsx": // the original compared "xlsx" with the whole path, so .xlsx was never read
                return ResumeExcel(fullPath);
            case "pdf":
                return ResumePDF(fullPath);
            case "txt":
                return ResumeTxt(fullPath);
            default:
                return string.Empty;
        }
    }
}
#endregion
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
#region 创建索引
/// <summary>
/// Maintains the Lucene.Net index of resource records: adds/removes documents for a
/// resource and writes the "indexed" flag back to the database.
/// </summary>
public class CreateResIndex
{
    public static string IndexPath = ConfigurationManager.AppSettings["pathIndex"];         // index folder
    public static string ResourceRoute = ConfigurationManager.AppSettings["ResourceRoute"]; // root folder of the resource files

    #region 属性
    /// <summary>
    /// A new PanGu analyzer instance.
    /// </summary>
    protected Analyzer NewPanGuAnalyzer
    {
        get { return new PanGuAnalyzer(); }
    }

    /// <summary>
    /// Opens the Lucene.Net directory at <see cref="IndexPath"/>. Every access opens a
    /// new directory object, so callers should read this once and reuse the result.
    /// </summary>
    public FSDirectory DirectoryLuce
    {
        get
        {
            return FSDirectory.Open(new DirectoryInfo(IndexPath), new NativeFSLockFactory());
        }
    }
    #endregion

    #region 创建索引
    /// <summary>
    /// Applies one resource to the index: OP "0" deletes its documents; any other OP
    /// re-creates them, but only when the resource's type is a public resource type.
    /// Afterwards the processed IDs are flagged as indexed in the database.
    /// </summary>
    /// <param name="res">Resource to process; null only optimizes the index.</param>
    public void CreateIndex(ResourcesModel res)
    {
        if (!System.IO.Directory.Exists(IndexPath))
        {
            System.IO.Directory.CreateDirectory(IndexPath);
        }

        // Open the directory once instead of once per property access.
        FSDirectory directory = DirectoryLuce;

        bool isUpdate = IndexReader.IndexExists(directory);
        if (isUpdate && IndexWriter.IsLocked(directory))
        {
            // Forcibly clears a write lock left behind by an earlier writer.
            // NOTE(review): this assumes no other writer is active on this index — confirm.
            IndexWriter.Unlock(directory);
        }

        List<string> modifyindex = new List<string>();
        IndexWriter writer = new IndexWriter(directory, NewPanGuAnalyzer, !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED);
        try
        {
            List<string> listIsdex = GetResourceTypePublicResources();
            if (res != null)
            {
                if (res.OP == "0")
                {
                    string deletedId = res.ID.ToString().Trim();
                    writer.DeleteDocuments(new Term("ID", deletedId));
                    modifyindex.Add(deletedId);
                    LogHelper.writeLog("【删除索引编号】 【ID:" + deletedId + "】");
                }
                else if (IsPublicResources(listIsdex, (res.TypeId ?? "").Trim()))
                {
                    IndexResource(writer, res, modifyindex);
                }
                // NOTE(review): a non-public resource is neither indexed nor flagged, so it
                // stays IsIndex=0 in the database — confirm this is intended.
            }

            // NOTE(review): Optimize() runs for every single resource; this is costly on a
            // large index. Kept because callers may rely on an always-optimized index.
            writer.Optimize();
        }
        finally
        {
            // Always release the writer; otherwise an exception leaves the write lock behind.
            writer.Close();
        }

        ModifyResIndex(modifyindex);
    }

    /// <summary>
    /// Replaces the documents of one resource: one document for the record itself plus
    /// one document per attached file. A failure is logged and the ID is then NOT added
    /// to <paramref name="modifyindex"/>.
    /// </summary>
    private void IndexResource(IndexWriter writer, ResourcesModel res, List<string> modifyindex)
    {
        string ID = res.ID.ToString().Trim();
        writer.DeleteDocuments(new Term("ID", ID));

        // Lucene fields reject null values, so every text is normalized to "".
        string path = res.FilePath ?? "";
        string Title = res.Title ?? "";
        string CreaetBy = res.CreaetBy ?? "";
        string UploadTime = res.UploadTime ?? "";
        string Types = res.Types ?? "";
        string SimpleDesc = res.SimpleDesc ?? "";
        string Tags = res.Tags ?? "";
        string FileName = res.FileName ?? "";

        try
        {
            // Document for the record itself; its ContextDesc field holds the marker "1".
            AddIndex(writer, ID, Title, Tags, SimpleDesc, "1", Types, UploadTime, CreaetBy, FileName);

            if (path != "")
            {
                string[] paths = path.Split(';');
                string[] pname = FileName.Split(';');
                for (int i = 0; i < paths.Length; i++)
                {
                    string lpath = paths[i];
                    if (string.IsNullOrEmpty(lpath))
                    {
                        continue;
                    }

                    // FileName may list fewer names than FilePath lists paths.
                    string lname = i < pname.Length ? pname[i] : "";
                    string fileText = new ReadFilesTxt().GetReadContextSingle(ResourceRoute, lpath);
                    string NewFileName = GetFileName(lpath, lname);

                    // Document per file: the file text goes into the SimpleDesc field and the
                    // relative path into the ContextDesc field (as in the original call).
                    AddIndex(writer, ID, NewFileName, Tags, fileText, lpath, Types, UploadTime, CreaetBy, FileName);
                }
            }

            LogHelper.writeLog("【添加索引编号】 【ID:" + ID + "】");
            modifyindex.Add(ID);
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【添加索引失败】 【ID:" + ID + "】:" + ex.Message.ToString());
        }
    }

    /// <summary>
    /// Builds one Lucene document from the given values and adds it to the writer.
    /// All fields are stored; ID and UploadTime are indexed without analysis, CreaetBy is
    /// stored only, the remaining fields are analyzed.
    /// </summary>
    /// <remarks>Exceptions propagate to the caller with their original stack trace.</remarks>
    public void AddIndex(IndexWriter writer, string ID, string Title, string Tags, string SimpleDesc, string ContextDesc, string Types, string UploadTime, string CreaetBy,string FileName)
    {
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
        doc.Add(new Lucene.Net.Documents.Field("ID", ID, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("Title", Title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("Tags", Tags, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("SimpleDesc", SimpleDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("FileName", FileName, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("ContextDesc", ContextDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("Types", Types, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("UploadTime", UploadTime, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));
        doc.Add(new Lucene.Net.Documents.Field("CreaetBy", CreaetBy, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NO));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Returns <paramref name="FileName"/> when it is non-empty; otherwise derives a name
    /// from <paramref name="objfilepath"/>. Returns "" when both are empty.
    /// </summary>
    public string GetFileName(object objfilepath, object FileName)
    {
        if (FileName != null && FileName.ToString() != "")
        {
            return FileName.ToString();
        }

        if (objfilepath != null && objfilepath.ToString() != "")
        {
            string filepath = objfilepath.ToString();
            // NOTE(review): the name is taken after the last ',' — this looks like it was
            // meant to be a path separator; confirm against the stored FilePath format.
            return filepath.Substring(filepath.LastIndexOf(',') + 1).Replace(";", "");
        }

        return "";
    }
    #endregion

    #region 获取数据库数据
    /// <summary>
    /// Reads rows from the view Res_View_createResIndex.
    /// </summary>
    /// <param name="whereStr">
    /// Extra SQL appended after "where 1=1". It is concatenated verbatim, so it must
    /// never contain untrusted input.
    /// </param>
    /// <returns>The result table; an empty table when the query fails (the error is logged).</returns>
    public DataTable Get_View_CreateResIndex(string whereStr)
    {
        string sql = " Select * From Res_View_createResIndex where 1=1 " + whereStr;
        DataTable dt = new DataTable();

        try
        {
            DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
            if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
            {
                dt = ds.Tables[0];
            }
        }
        catch (Exception ex)
        {
            LogHelper.writeLog("【 获取中心资料库数据错误】:" + ex.ToString());
        }
        return dt;
    }

    /// <summary>
    /// Sets IsIndex=1 in ResourceInfoNew for every ID in <paramref name="list"/>.
    /// Does nothing (apart from logging) when the list is empty.
    /// </summary>
    public void ModifyResIndex(List<string> list)
    {
        string joinedIds = string.Join(",", list.ToArray());
        LogHelper.writeLog("【更新索引编号开始】:" + joinedIds);
        if (list.Count == 0)
        {
            return;
        }

        // '-1' is a placeholder so that the IN list is never empty.
        StringBuilder sb = new StringBuilder("'-1'");
        for (int i = 0; i < list.Count; i++)
        {
            // Double embedded quotes so an ID can never terminate the SQL string literal.
            sb.Append(",'").Append(list[i].Replace("'", "''")).Append("'");
        }

        string sql = string.Format(" update ResourceInfoNew set IsIndex=1 where id in ({0}) ", sb.ToString());
        int result = Ruihua.Common.DbHelperSQL.ExecuteSql(sql);
        LogHelper.writeLog("【更新索引编号结束:" + result.ToString() + "】:" + joinedIds);
    }

    /// <summary>
    /// Tells whether <paramref name="Id"/> is one of the public resource type IDs in <paramref name="list"/>.
    /// </summary>
    public bool IsPublicResources(List<string> list, string Id)
    {
        return list.Contains(Id);
    }

    /// <summary>
    /// Returns the IDs (TID) of all resource types reachable from the root types
    /// (ParentID "0"). The list is cached in <see cref="MemoryCache.Default"/> for 30 minutes.
    /// </summary>
    public List<string> GetResourceTypePublicResources()
    {
        ObjectCache cache = MemoryCache.Default;
        List<string> cached = cache["ResourceType"] as List<string>;
        if (cached != null)
        {
            return cached;
        }

        List<string> publicresource = new List<string>();
        string sql = "select *From ResourceType ";
        DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
        if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
        {
            DataTable dt = ds.Tables[0];
            var roots = from q1 in dt.AsEnumerable()
                        where q1.Field<string>("ParentID") == "0"
                        select q1;
            foreach (var item in roots)
            {
                string tid = item.Field<string>("TID").Trim();
                publicresource.Add(tid);
                // Descend into the child types of this root.
                AddListString(ref publicresource, dt, tid);
            }
        }

        CacheItemPolicy policy = new CacheItemPolicy();
        policy.AbsoluteExpiration = DateTimeOffset.Now.AddSeconds(1800.0); // evicted after 60*30 seconds
        cache.Set("ResourceType", publicresource, policy);
        return publicresource;
    }

    /// <summary>
    /// Recursively appends the TID of every row whose ParentID equals <paramref name="Id"/>,
    /// each followed by its own descendants.
    /// </summary>
    /// <remarks>NOTE(review): assumes the ParentID relation has no cycles — a cycle would recurse without end.</remarks>
    public void AddListString(ref List<string> list, DataTable dt, string Id)
    {
        var children = from q2 in dt.AsEnumerable()
                       where q2.Field<string>("ParentID") == Id
                       select q2;
        foreach (var item in children)
        {
            string tid = item.Field<string>("TID").Trim();
            list.Add(tid);
            AddListString(ref list, dt, tid);
        }
    }
    #endregion
}

#endregion