zoukankan      html  css  js  c++  java
  • lucene 实现word,pdf全文检索源码

    创建索引:
      
    import java.io.BufferedReader;
    import java.io.File;   
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.FileReader;   
    import java.io.IOException;   
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;
    import java.io.StringReader;
    import java.text.SimpleDateFormat;
    import java.util.Date;   
      
    import org.apache.lucene.analysis.standard.StandardAnalyzer;   
    import org.apache.lucene.document.DateTools;   
    import org.apache.lucene.document.Document;   
    import org.apache.lucene.document.Field;   
    import org.apache.lucene.index.IndexWriter;   
    import org.apache.lucene.store.Directory;   
    import org.apache.lucene.store.SimpleFSDirectory;   
    import org.apache.lucene.util.Version;   
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.util.PDFTextStripper;
    import org.apache.poi.hslf.HSLFSlideShow;
    import org.apache.poi.hslf.model.Slide;
    import org.apache.poi.hslf.model.TextRun;
    import org.apache.poi.hslf.usermodel.RichTextRun;
    import org.apache.poi.hslf.usermodel.SlideShow;
    import org.apache.poi.hssf.usermodel.HSSFCell;
    import org.apache.poi.hssf.usermodel.HSSFDateUtil;
    import org.apache.poi.hssf.usermodel.HSSFRow;
    import org.apache.poi.hssf.usermodel.HSSFSheet;
    import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.usermodel.Paragraph;
    import org.apache.poi.hwpf.usermodel.Range;
    import org.apache.poi.poifs.filesystem.DocumentEntry;
    import org.apache.poi.poifs.filesystem.DocumentInputStream;
    import org.apache.poi.poifs.filesystem.POIFSFileSystem;
    import org.apache.poi.util.LittleEndian;
    /**  
     * Builds the full-text index (Lucene 3.0+).  
     * @author Administrator  
     *  
     */  
    public class indexer {   
    	
        /**  
         * @param args  
         * @throws Exception 
         */  
        public static void main(String[] args) throws Exception {   
            //保存索引文件的地方   
            String indexDir = "data\test\indexDir";   
            //将要搜索TXT文件的地方   
            String dateDir = "data\test\dateDir";   
            IndexWriter indexWriter = null;   
            //创建Directory对象   
            Directory dir = new SimpleFSDirectory(new File(indexDir));   
            //创建IndexWriter对象,
            //第一个参数是Directory,第二个是分词器,
            //第三个表示是否是创建,如果为false为在此基础上面修改,
            //第四表示表示分词的最大值,比如说new MaxFieldLength(2),就表示两个字一分,
            //一般用IndexWriter.MaxFieldLength.LIMITED    
            indexWriter = new IndexWriter(dir,new StandardAnalyzer(Version.LUCENE_30),true,
            		IndexWriter.MaxFieldLength.UNLIMITED);   
            File[] files = new File(dateDir).listFiles();   
            for (int i = 0; i < files.length; i++) { 
            	Document doc = null;
            	if(files[i].getName().endsWith(".txt")){
    	            doc = new Document();   
    	            //创建Field对象,并放入doc对象中   
    	            doc.add(new Field("contents", new FileReader(files[i])));    
    	            doc.add(new Field("filename", files[i].getName(),    
    	                                Field.Store.YES, Field.Index.NOT_ANALYZED));   
    	            doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
    	            		Field.Store.YES,Field.Index.NOT_ANALYZED)); 
            	}else if(files[i].getName().endsWith(".doc")){
            			doc = getDocument(files[i]);
            	}else if(files[i].getName().endsWith(".ppt")){
            		doc = getPPT(files[i]);
            	}else if(files[i].getName().endsWith(".xls")){
            		doc = getExcel(files[i]);
            	}else if(files[i].getName().endsWith(".pdf")){	
            		doc = getPdf(files[i]); 
            	}else{
            		doc = new Document();   
    	            //创建Field对象,并放入doc对象中   
    	            doc.add(new Field("contents", new FileReader(files[i])));    
    	            doc.add(new Field("filename", files[i].getName(),    
    	                                Field.Store.YES, Field.Index.NOT_ANALYZED));   
    	            doc.add(new Field("indexDate",DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
    	            		Field.Store.YES,Field.Index.NOT_ANALYZED));   
    	              
            	}
            	//写入IndexWriter
            	if(doc!= null) indexWriter.addDocument(doc);
            }   
            //查看IndexWriter里面有多少个索引   
            System.out.println("numDocs:"+indexWriter.numDocs());
            indexWriter.close();
            
        } 
        
        /**
         * Extracts the text of a Word ({@code .doc}) file into an index document.
         *
         * <p>Extraction is best effort: if the file cannot be read or parsed the
         * error is printed and the document is still returned, with whatever text
         * was collected before the failure (possibly none).
         *
         * @param file the Word file to read
         * @return a document with the fields {@code filename}, {@code contents}
         *         and {@code indexDate}
         * @throws Exception declared for callers; extraction errors are reported
         *                   on standard error rather than propagated
         */
        public static Document getDocument(File file) throws Exception {
            String title = file.getName();

            StringBuilder contents = new StringBuilder();
            FileInputStream fs = null;
            try {
                fs = new FileInputStream(file);
                HWPFDocument doc = new HWPFDocument(fs);
                Range range = doc.getRange();
                int paragraphCount = range.numParagraphs();
                // Concatenate the text of every paragraph in document order.
                for (int i = 0; i < paragraphCount; i++) {
                    Paragraph paragraph = range.getParagraph(i);
                    contents.append(paragraph.text());
                }
            } catch (Exception e) {
                // Keep indexing the remaining files, but do not hide the failure.
                e.printStackTrace();
            } finally {
                if (fs != null) {
                    try {
                        fs.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
            String cont = contents.toString().trim();

            Document document = new Document();
            document.add(new Field("filename", title, Field.Store.YES,
                    Field.Index.ANALYZED));
            document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            return document;
        }
        
        /**
         * Extracts the text of a PowerPoint ({@code .ppt}) file into an index document.
         *
         * @param pptFile the presentation to read
         * @return a document with the fields {@code filename}, {@code contents}
         *         and {@code indexDate}
         * @throws IOException if the file cannot be opened or parsed
         */
        public static Document getPPT(File pptFile) throws IOException {
            String title = pptFile.getName();

            StringBuilder contents = new StringBuilder();
            InputStream is = new FileInputStream(pptFile);
            try {
                SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
                Slide[] slides = ppt.getSlides();
                for (int i = 0; i < slides.length; i++) {
                    // Each TextRun holds one block of text on the slide.
                    TextRun[] runs = slides[i].getTextRuns();
                    for (int j = 0; j < runs.length; j++) {
                        contents.append(runs[j].getText());
                        // Separate blocks so the last word of one and the first word
                        // of the next are not fused into a single token.
                        contents.append('\n');
                    }
                }
            } finally {
                is.close();
            }

            String cont = contents.toString().trim();

            Document document = new Document();
            document.add(new Field("filename", title, Field.Store.YES,
                    Field.Index.ANALYZED));
            document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
            document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            return document;
        }
      
        /**
         * Extracts the text of a PDF file into an index document.
         *
         * <p>Extraction is best effort: if parsing fails the stack trace is printed
         * and the document is returned with empty {@code contents}.
         *
         * @param pdf the PDF file to read
         * @return a document with the fields {@code filename}, {@code contents}
         *         and {@code indexDate}
         */
        public static Document getPdf(File pdf) {
            String title = pdf.getName();
            String result = "";
            FileInputStream is = null;
            PDDocument doc = null;
            try {
                is = new FileInputStream(pdf);
                PDFParser parser = new PDFParser(is);
                parser.parse();
                doc = parser.getPDDocument();
                PDFTextStripper stripper = new PDFTextStripper();
                result = stripper.getText(doc);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Close both resources independently so that a failure closing one
                // does not leak the other.
                if (is != null) {
                    try {
                        is.close();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                if (doc != null) {
                    try {
                        doc.close();
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }

            Document document = new Document();
            document.add(new Field("filename", title, Field.Store.YES,
                    Field.Index.ANALYZED));
            document.add(new Field("contents", result, Field.Store.YES,
                    Field.Index.ANALYZED));
            // Same indexing-date field that the .txt/.doc/.ppt documents carry.
            document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            return document;
        }
        
        public static Document getExcel(File fileExcel) throws Exception {
    
        	InputStream is = new FileInputStream(fileExcel);
            StringBuffer content = new StringBuffer();
    
            HSSFWorkbook workbook = new HSSFWorkbook(is);
    
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                HSSFSheet aSheet = workbook.getSheetAt(numSheets);// 获得一个sheet
                content.append("
    ");
                if (null == aSheet) {
                   continue;
                }
                for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                   content.append("
    ");
                   HSSFRow aRow = aSheet.getRow(rowNum);
                   if (null == aRow) {
                       continue;
                   }
    
                   for (short cellNum = 0; cellNum <= aRow.getLastCellNum(); cellNum++) {
                       HSSFCell aCell = aRow.getCell(cellNum);
                       if (null == aCell) {
                          continue;
                       }
    
                       if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                          content.append(aCell.getRichStringCellValue().getString());
                       } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                          boolean b = HSSFDateUtil.isCellDateFormatted(aCell);
                          if (b) {
                              Date date = aCell.getDateCellValue();
                              SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
                              content.append(df.format(date));
                          }
                       }
                   }
                }
            }
            
            String cont = content.toString();
            Document document = new Document();
    		document.add(new Field("filename",fileExcel.getName(), Field.Store.YES,
    				Field.Index.ANALYZED));//TOKENIZED
    		document.add(new Field("contents", cont, Field.Store.YES,
    				Field.Index.ANALYZED));
    		//document.add(new Field("path", pdfpath, Field.Store.YES,Field.Index.ANALYZED));
    		return document;
         }
        
        public static String readHtml(String urlString) {
    
            StringBuffer content = new StringBuffer("");
            File file = new File(urlString);
            FileInputStream fis = null;
            try {
                fis = new FileInputStream(file);
                // 读取页面
                BufferedReader reader = new BufferedReader(new InputStreamReader(
                        fis,"utf-8"));//这里的字符编码要注意,要对上html头文件的一致,否则会出乱码
                
                String line = null;
    
                while ((line = reader.readLine()) != null) {
                    content.append(line + "
    ");
                }
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
            String contentString = content.toString();
            return contentString;
        }
    } 
    

      搜索索引 

       
      
    import java.io.File;   
    import java.io.IOException;   
      
    import org.apache.lucene.analysis.standard.StandardAnalyzer;   
    import org.apache.lucene.document.Document;   
    import org.apache.lucene.queryParser.ParseException;   
    import org.apache.lucene.queryParser.QueryParser;   
    import org.apache.lucene.search.IndexSearcher;   
    import org.apache.lucene.search.Query;   
    import org.apache.lucene.search.ScoreDoc;   
    import org.apache.lucene.search.TopDocs;   
    import org.apache.lucene.store.Directory;   
    import org.apache.lucene.store.SimpleFSDirectory;   
    import org.apache.lucene.util.Version;   
    /**  
     * 搜索索引 Lucene 3.0+  
     * @author Administrator  
     *  
     */  
    public class searcher {   
      
        public static void main(String[] args) throws IOException, ParseException {   
            //保存索引文件的地方      
            String indexDir = "data\test\indexDir"; 
            Directory dir = new SimpleFSDirectory(new File(indexDir));   
            //创建 IndexSearcher对象,相比IndexWriter对象,这个参数就要提供一个索引的目录就行了   
            IndexSearcher indexSearch = new IndexSearcher(dir);   
            //创建QueryParser对象,第一个参数表示Lucene的版本,第二个表示搜索Field的字段,第三个表示搜索使用分词器   
            QueryParser queryParser = new QueryParser(Version.LUCENE_30,   
                    "contents", new StandardAnalyzer(Version.LUCENE_30));   
            //生成Query对象   
            Query query = queryParser.parse("arcgis");   
            //搜索结果 TopDocs里面有scoreDocs[]数组,里面保存着索引值   
            TopDocs hits = indexSearch.search(query,10);   
            //hits.totalHits表示一共搜到多少个   
            System.out.println("找到了"+hits.totalHits+"个");   
            //循环hits.scoreDocs数据,并使用indexSearch.doc方法把Document还原,再拿出对应的字段的值   
            for (int i = 0; i < hits.scoreDocs.length; i++) {   
                ScoreDoc sdoc = hits.scoreDocs[i];   
                Document doc = indexSearch.doc(sdoc.doc);
                System.out.println(doc.get("filename"));
            }
            indexSearch.close();   
        }   
    } 
    

      

  • 相关阅读:
    C# 视频监控系列(11):H264播放器——封装API[HikPlayM4.dll]
    php框架
    ExtJS带验证码登录框[新增回车提交]
    ant 读取环境变量的值
    Apache Velocity实现模板化
    23种设计模式概述
    android资源下载
    无序hashset与hashmap让其有序
    PermGen space错误解决方法
    设计模式之代理模式(Proxy)
  • 原文地址:https://www.cnblogs.com/zzlp/p/4757543.html
Copyright © 2011-2022 走看看