zoukankan      html  css  js  c++  java
  • lucene4.0 基于smb文件服务器的全文检索

    使用lucene 4.0版本的全文检索

    所需要的jar包 

    网速太慢,下次有空再把jar传上来

    1.FileIndex  建立索引,查询,删除,更新

    package com.strongit.tool.retrieval;
    
    import java.io.File;
    import java.net.MalformedURLException;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    
    import jcifs.smb.SmbException;
    import jcifs.smb.SmbFile;
    import jcifs.smb.SmbFileFilter;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.LongField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    import com.strongit.util.BaseinfoConfigurer;
    
    public class FileIndex {
    	
    	private static String INDEX_DIR = "D:\index";
    	private static Analyzer analyzer = null;
    	private static Directory directory = null;
    	private static IndexWriter indexWriter = null;
    	private static String content = "";
    	
    	 public static void main(String[] args) {
    	        try {
    //	        createIndex();//创建索引
    //	            search("测试");
    //	            insert();//新增索引,不删除之前的
    //	            delete("1470817624520");
    //	            update();
    	        } catch (Exception e) {
    	            e.printStackTrace();
    	        }
    	    }
    
    	/**
    	 * 删除索引
    	 * 
    	 * @param @param str 删除的关键字 建立索引时的id
    	 * @param @throws Exception
    	 * @author wusongxiao
    	 * @date 2016年8月10日
    	 */
    	public static void delete(String str) throws Exception {
    		Date date1 = new Date();
    		analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    		directory = FSDirectory.open(new File(INDEX_DIR));
    
    		IndexWriterConfig config = new IndexWriterConfig(
    				Version.LUCENE_CURRENT, analyzer);
    		indexWriter = new IndexWriter(directory, config);
    
    		// indexWriter.deleteDocuments(new Term("filename",str));
    
    		indexWriter.deleteDocuments(new Term("id", str)); // 建立索引时 给这个索引赋一个id
    
    		indexWriter.close();
    
    		Date date2 = new Date();
    		System.out.println("删除索引耗时:" + (date2.getTime() - date1.getTime())
    				+ "ms
    ");
    	}
    
    	/**
    	 * 新增加索引,不覆盖之前的
    	 * 
    	 * @Description: TODO
    	 * @param @throws Exception
    	 * @return void
    	 * @throws
    	 * @author wusongxiao
    	 * @date 2016年8月10日
    	 */
    	public static void insert(List listname) throws Exception {
    
    //		String path = "smb://admini:2014wh@192.168.168.140/resource/Teaching/test001.txt";
    
    		for(int j =0;j<listname.size();j++){
    			
    			String path=   listname.get(j); //文件地址
    			SmbFile folder = new SmbFile(path);
    			List<SmbFile> fileList = new ArrayList<SmbFile>();
    			fileList.add(folder);
    			for (SmbFile file : fileList) {
    				content = "";
    				// 获取文件后缀
    				String type = file.getName().substring(
    						file.getName().lastIndexOf(".") + 1);
    				if ("txt".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readTxt(file.getPath(),"gb2312");
    
    				} else if ("doc".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readWorddoc(file.getPath());
    
    				} else if ("xls".equalsIgnoreCase(type)) {
    
    					content += ReadFile.xls2String(file.getPath());
    
    				} else if ("xlsx".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readExcel2007(file.getPath());
    
    				} else if ("ppt".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readPowerPoint(file.getPath());
    
    				} else if ("pdf".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readPdf(file.getPath());
    
    				}else if ("docx".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readWorddocx(file.getPath());
    
    				}
    				
    				
    
    //				System.out.println("name :" + file.getName());//名称
    //				System.out.println("path :" + file.getPath());//地址
    //                              System.out.println("content :"+content);//content内容
    
    				try {
    					analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    					directory = FSDirectory.open(new File(INDEX_DIR));
    
    					File indexFile = new File(INDEX_DIR);
    					if (!indexFile.exists()) {
    						indexFile.mkdirs();
    					}
    					IndexWriterConfig config = new IndexWriterConfig(
    							Version.LUCENE_CURRENT, analyzer);
    					indexWriter = new IndexWriter(directory, config);
    			//		String ID = pathname[1].toString();//赋值一个唯一的ID,方便删除
    					Document document = new Document();
    					document.add(new TextField("filename", file.getName(),
    							Store.YES));
    					document.add(new TextField("content", content, Store.YES));
    					document.add(new TextField("path", file.getPath(), Store.YES));
    			//		document.add(new TextField("id", ID, Store.YES));
    					indexWriter.addDocument(document);
    					indexWriter.commit();
    
    					ReadFile.closeWriter(indexWriter);
    
    				} catch (Exception e) {
    					e.printStackTrace();
    				}
    				content = "";
    			}
    			
    		}
    		
    	}
    
    	/**
    	 * 查询索引
    	 * 
    	 * @Description: TODO
    	 * @param @param str 查询关键字
    	 * @param @throws Exception
    	 * @return void
    	 * @throws
    	 * @author wusongxiao
    	 * @date 2016年8月10日
    	 */
    	public static List search(String str) throws Exception {
    		directory = FSDirectory.open(new File(INDEX_DIR));
    		analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    		DirectoryReader ireader = DirectoryReader.open(directory);
    		IndexSearcher isearcher = new IndexSearcher(ireader);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "content", analyzer);//LUCENE_30 不分词查询,只搜关键词 Query query = parser.parse(str); List liatname = new ArrayList(); ScoreDoc[] hits = isearcher.search(query, null, 10000).scoreDocs; for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); System.out.println(hitDoc.get("filename"));// 文件名 liatname.add(hitDoc.get("filename")); // System.out.println(hitDoc.get("content"));//内容 } ireader.close(); directory.close(); return liatname; } /** * 更新索引 更新原来索引的内容---只是改变原来文件的索引 * * @Description: TODO * @param @throws Exception * @return void * @throws * @author wusongxiao * @date 2016年8月10日 */ public static void update() throws Exception { String path = "D:\file\file\f1\test2.txt"; SmbFile folder = new SmbFile(path); List<SmbFile> fileList = new ArrayList<SmbFile>(); fileList.add(folder); Date date1 = new Date(); for (SmbFile file : fileList) { content = ""; // 获取文件后缀 String type = file.getName().substring( file.getName().lastIndexOf(".") + 1); if ("txt".equalsIgnoreCase(type)) { content += ReadFile.readTxt(file.getPath(),"gb2312"); } else if ("doc".equalsIgnoreCase(type)) { content += ReadFile.readWorddoc(file.getPath()); } else if ("xls".equalsIgnoreCase(type)) { content += ReadFile.xls2String(file.getPath()); } else if ("xlsx".equalsIgnoreCase(type)) { content += ReadFile.readExcel2007(file.getPath()); } else if ("ppt".equalsIgnoreCase(type)) { content += ReadFile.readPowerPoint(file.getPath()); } else if ("pdf".equalsIgnoreCase(type)) { content += ReadFile.readPdf(file.getPath()); }else if ("docx".equalsIgnoreCase(type)) { content += ReadFile.readWorddocx(file.getPath()); } System.out.println("name :" + file.getName()); System.out.println("path :" + file.getPath()); // System.out.println("content :"+content);//content内容 System.out.println(); try { analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); directory = FSDirectory.open(new File(INDEX_DIR)); File indexFile = new File(INDEX_DIR); 
if (!indexFile.exists()) { indexFile.mkdirs(); } IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); indexWriter = new IndexWriter(directory, config); Long time = date1.getTime(); String tt = time.toString(); Document document = new Document(); document.add(new TextField("filename", file.getName(), Store.YES)); document.add(new TextField("content", content, Store.YES)); document.add(new TextField("path", file.getPath(), Store.YES)); document.add(new TextField("id", tt, Store.YES)); indexWriter.updateDocument(new Term("filename", "text1"), document); indexWriter.close(); ReadFile.closeWriter(indexWriter); } catch (Exception e) { e.printStackTrace(); } content = ""; } Date date2 = new Date(); System.out.println("更新索引耗时:" + (date2.getTime() - date1.getTime()) + "ms "); } /** * 创建索引,删除之前的索引,更新全部文件的索引 * * @Description: TODO * @param @param path * @param @return * @param @throws Exception * @return boolean * @throws * @author wusongxiao * @date 2016年8月10日 */ public static boolean createIndex(String path) throws Exception { // public static boolean createIndex() throws Exception { Date date1 = new Date(); String username = (String) BaseinfoConfigurer .getContextProperty("username"); String possword = (String) BaseinfoConfigurer .getContextProperty("possword"); String fileServerIp = (String) BaseinfoConfigurer .getContextProperty("fileServerIp"); String sharedirectory = (String) BaseinfoConfigurer .getContextProperty("sharedirectory"); path = "smb" + "://" + username + ":" + possword + "@" + fileServerIp + "/" + sharedirectory + "/"; //删除之前索引 ReadFile.deleteDir(new File(INDEX_DIR+"\")); // String path = "smb://admini:2014wh@192.168.168.140/resource/"; SmbFile folder = new SmbFile(path); SmbFile[] result = searchFile(folder);// 根目录下的所有文件夹文件 for (SmbFile file : result) { content = ""; // 获取文件后缀 String type = file.getName().substring( file.getName().lastIndexOf(".") + 1); if ("txt".equalsIgnoreCase(type)) { content += 
ReadFile.readTxt(file.getPath(),"gb2312"); } else if ("doc".equalsIgnoreCase(type)) { content += ReadFile.readWorddoc(file.getPath()); } else if ("xls".equalsIgnoreCase(type)) { content += ReadFile.xls2String(file.getPath()); } else if ("xlsx".equalsIgnoreCase(type)) { content += ReadFile.readExcel2007(file.getPath()); } else if ("ppt".equalsIgnoreCase(type)) { content += ReadFile.readPowerPoint(file.getPath()); } else if ("pdf".equalsIgnoreCase(type)) { content += ReadFile.readPdf(file.getPath()); }else if ("docx".equalsIgnoreCase(type)) { content += ReadFile.readWorddocx(file.getPath()); } System.out.println("name :" + file.getName()); System.out.println("path :" + file.getPath()); // System.out.println("content :"+content); System.out.println(); try { analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); directory = FSDirectory.open(new File(INDEX_DIR)); File indexFile = new File(INDEX_DIR); if (!indexFile.exists()) { indexFile.mkdirs(); } IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); indexWriter = new IndexWriter(directory, config); Document document = new Document(); document.add(new TextField("filename", file.getName(), Store.YES)); document.add(new TextField("content", content, Store.YES)); document.add(new TextField("path", file.getPath(), Store.YES)); // document.add(new TextField("id", tt, Store.YES)); indexWriter.addDocument(document); indexWriter.commit(); ReadFile.closeWriter(indexWriter); } catch (Exception e) { e.printStackTrace(); } content = ""; } Date date2 = new Date(); System.out.println("创建索引-----耗时:" + (date2.getTime() - date1.getTime()) + "ms "); return true; } /** * 递归查找所有的文件 * * @Description: TODO * @param @param folder * @param @return * @return SmbFile[] * @throws * @author wusongxiao * @date 2016年8月10日 */ public static SmbFile[] searchFile(SmbFile folder) { SmbFile[] subFolders = null; try { subFolders = folder.listFiles(new SmbFileFilter() { // 运用内部匿名类获得文件 @Override public boolean accept(SmbFile 
pathname) {// 实现FileFilter类的accept方法 try { if (pathname.isDirectory() || (pathname.isFile())) {// 目录或文件包含关键字 return true; } } catch (SmbException e) { } return false; } }); } catch (SmbException e1) { e1.printStackTrace(); } List<SmbFile> result = new ArrayList<SmbFile>();// 声明一个集合 for (int i = 0; i < subFolders.length; i++) {// 循环显示文件夹或文件 try { if (subFolders[i].isFile()) {// 如果是文件则将文件添加到结果列表中 result.add(subFolders[i]); } else {// 如果是文件夹,则递归调用本方法,然后把所有的文件加到结果列表中 SmbFile[] foldResult = searchFile(subFolders[i]); for (int j = 0; j < foldResult.length; j++) {// 循环显示文件 String smname = foldResult[j].toString(); String txtname = smname.substring(smname .lastIndexOf("/") + 1);// 截取文件名 String txtName = txtname.substring(txtname .lastIndexOf("."));// 截取格式 if (".txt".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".ppt".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".doc".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".xls".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".xlsx".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".pdf".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } } } } catch (SmbException e) { e.printStackTrace(); } } SmbFile files[] = new SmbFile[result.size()];// 声明文件数组,长度为集合的长度 result.toArray(files);// 集合数组化 return files; } }

      

    2. ReadFile:读取文档内容的工具类,支持 txt、xlsx、xls、ppt、pdf、doc、docx(docx 中的图片不会被读取,只读取文字)

    package com.strongit.tool.retrieval;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.List;  
    
    import jcifs.smb.SmbFileInputStream;
    import jxl.Cell;
    import jxl.Sheet;
    import jxl.Workbook;
    
    import org.apache.lucene.index.IndexWriter;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.util.PDFTextStripper;
    import org.apache.poi.hslf.HSLFSlideShow;
    import org.apache.poi.hslf.model.Slide;
    import org.apache.poi.hslf.model.TextRun;
    import org.apache.poi.hslf.usermodel.SlideShow;
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.usermodel.Paragraph;
    import org.apache.poi.hwpf.usermodel.Range;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    
    /**
     * 读取文档方法
     * ClassName: ReadFile 
     * @Description: TODO
     * @date 2016年8月10日
     * @author wsx
     */
    public class ReadFile {
    	
    	 private static ReadFile indexManager;
    	
    	 /**
         * 读取doc文件内容
         * @param filepath 想要读取的文件地址
         * @return 返回文件内容
         */
    	 public static String readWorddoc(String filepath) {
    			StringBuffer content = new StringBuffer("");// 文档内容
    			try {
    				HWPFDocument doc = new HWPFDocument(new SmbFileInputStream(filepath));
    				Range range = doc.getRange();
    				int paragraphCount = range.numParagraphs();// 段落
    				for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据
    					Paragraph pp = range.getParagraph(i);
    					content.append(pp.text());
    				}
    
    			} catch (Exception e) {
    				e.printStackTrace();
    			}
    			return content.toString().trim();
    		}
    	 /**
    	  * docx 格式建立索引,图片没有读到,只读取的数据  
    	  * @Description: TODO
    	  * @param @param filepath
    	  * @param @return   
    	  * @return String  
    	  * @date 2016年8月12日
    	  */
    	 public static String readWorddocx(String filepath) {
    			StringBuffer content = new StringBuffer("");// 文档内容
    			try {  
    				//     D://file//docx.docx     D://file//doc.doc
    //				filepath = "D://file//docx.docx";
    				SmbFileInputStream in = new SmbFileInputStream(filepath);//载入文档 
    				//word docx 图片不会被读取,只读取数据   
                    XWPFDocument xwpf = new XWPFDocument(in);//得到word文档的信息  
                  List<XWPFParagraph> listParagraphs = xwpf.getParagraphs();//得到段落信息
                  
                  for(int i =0;i<listParagraphs.size();i++){
                	  String cont = listParagraphs.get(i).getRuns().toString();
                	  content.append(cont);
                  }
                   
    	        } catch (Exception e) {  
    	            e.printStackTrace();  
    	        }  
    			return content.toString().trim();
    		}
        
        /**
         * 读取xls文件内容
         * @param filepath 想要读取的文件对象
         * @return 返回文件内容
         */
        public static String xls2String(String filepath){
            String result = "";
            try{
                SmbFileInputStream fis = new SmbFileInputStream(filepath);   
                StringBuilder sb = new StringBuilder();   
                jxl.Workbook rwb = Workbook.getWorkbook(fis);   
                Sheet[] sheet = rwb.getSheets();   
                for (int i = 0; i < sheet.length; i++) {   
                    Sheet rs = rwb.getSheet(i);   
                    for (int j = 0; j < rs.getRows(); j++) {   
                       Cell[] cells = rs.getRow(j);   
                       for(int k=0;k<cells.length;k++)   
                       sb.append(cells[k].getContents() + " ");   
                    }   
                }   
                fis.close();   
                result += sb.toString();
            }catch(Exception e){
                e.printStackTrace();
            }
            return result;
        }
        /**
         * PDF格式  文件创建索引
         * @Description: TODO
         * @param @param path
         * @param @return
         * @param @throws Exception   
         * @return String  
         * @date 2016年8月11日
         */
        public static String readPdf(String path) throws Exception {
            StringBuffer content = new StringBuffer("");// 文档内容
            SmbFileInputStream fis = new SmbFileInputStream(path);
            PDFParser p = new PDFParser(fis);
            p.parse();
            PDFTextStripper ts = new PDFTextStripper();
            content.append(ts.getText(p.getPDDocument()));
            fis.close();
            return content.toString().trim();
        }
        
        /**
         * 读取xlsx格式的excel文档
         * @param @param filepath
         * @param @throws IOException   
         * @author wusongxiao
         * @date 2016年8月10日
         */
        public static String readExcel2007(String filepath) throws IOException {
    
    //		System.out.println(filepath);
    
    		StringBuffer content = new StringBuffer();
    
    		// 构造 XSSFWorkbook 对象,strPath 传入文件路径 **** SmbFileInputStream SMB读取文件 ***
    		XSSFWorkbook xwb = new XSSFWorkbook(new SmbFileInputStream(filepath));
    		// 循环工作表Sheet
    		for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
    			XSSFSheet xSheet = xwb.getSheetAt(numSheet);
    			if (xSheet == null) {
    				continue;
    			}
    			// 循环行Row
    			for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
    				XSSFRow xRow = xSheet.getRow(rowNum);
    				if (xRow == null) {
    					continue;
    				}
    				// 循环列Cell
    				for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
    					XSSFCell xCell = xRow.getCell(cellNum);
    					if (xCell == null) {
    						continue;
    					}
    					String s = null;
    					if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
    						content.append(xCell.getBooleanCellValue());
    					} else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
    						content.append(xCell.getNumericCellValue());
    					} else {
    						content.append(xCell.getStringCellValue()  + ""); //+ "
    "
    					}
    				}
    			}
    		}
    
    		return content.toString();
    	}
        /**
         * 读取txt文档
         * @param @param filepath  地址
         * @param @param charSet  编码格式
         * @param @throws IOException   
         * @author wusongxiao
         * @date 2016年8月10日
         */
        public static String readTxt(String filepath, String charSet)
    			throws IOException {
    		BufferedReader reader = new BufferedReader(new InputStreamReader(
    				new SmbFileInputStream(filepath), charSet));  //reader.readLine() 读取txt文本  String的
    		String line = new String();
    		String temp = new String();
    		while ((line = reader.readLine()) != null) {
    			temp += line;
    		}
    		reader.close();
    		return temp;
    	}
        /**
         * 读取ppt文件
         * @Description: TODO
         * @param @param filepath
         * @param @return   
         * @return String  
         * @date 2016年8月10日
         */
        public static String readPowerPoint(String filepath) {
    		StringBuffer content = new StringBuffer("");
    		try {
    			SlideShow ss = new SlideShow(new HSLFSlideShow(new SmbFileInputStream(filepath)));// is
    			// 为文件的InputStream,建立SlideShow
    			Slide[] slides = ss.getSlides();// 获得每一张幻灯片
    			for (int i = 0; i < slides.length; i++) {
    				TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun
    				for (int j = 0; j < t.length; j++) {
    					content.append(t[j].getText());// 这里会将文字内容加到content中去
    				}
    			}
    		} catch (Exception ex) {
    			System.out.println(ex.toString());
    		}
    		return content.toString();
    	}
        
        public static void closeWriter(IndexWriter indexWriter) throws Exception {
            if (indexWriter != null) {
                indexWriter.close();
            }
        }
        
        /**
         * 创建索引管理器
         * @return 返回索引管理器对象
         */
        public ReadFile getManager(){
            if(indexManager == null){
                this.indexManager = new ReadFile();
            }
            return indexManager;
        }
        /**
         * 删除目录下的所有索引
         * @Description: TODO
         * @param @param file
         * @param @return   
         * @return boolean  
         * @throws
         * @author wusongxiao
         * @date 2016年8月10日
         */
        public static boolean deleteDir(File file){
            if(file.isDirectory()){
                File[] files = file.listFiles();
                for(int i=0; i<files.length; i++){
                    deleteDir(files[i]);
                }
            }
            file.delete();
            return true;
        }
    
    }
    

      

    以上代码全部基于 SMB 文件服务器实现 lucene4.0 全文检索。如果要检索的是本地文件,只需要把所有读取文件的地方(类似 SmbFileInputStream、SmbFile)去掉 Smb 前缀,换成对应的本地文件类(FileInputStream、File)就可以了。

    
    
  • 相关阅读:
    Session服务器配置指南与使用经验
    关于SetLocaleInfo()
    创业及野心的一定要看
    创业公司CEO每周应该做的13件事
    NSIS 打包工具
    共勉
    Access denied for user 'root'@'localhost' (using password: NO)
    给浮躁的软件业同仁(转)
    NSIS 一点经验
    家用办公机
  • 原文地址:https://www.cnblogs.com/wusx/p/5765592.html
Copyright © 2011-2022 走看看