zoukankan      html  css  js  c++  java
  • lucene4.0 基于smb文件服务器的全文检索

    使用lucene 4.0版本的全文检索

    所需要的jar包 

    网速太慢,下次有空再把jar传上来

    1.FileIndex  建立索引,查询,删除,更新

    package com.strongit.tool.retrieval;
    
    import java.io.File;
    import java.net.MalformedURLException;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    
    import jcifs.smb.SmbException;
    import jcifs.smb.SmbFile;
    import jcifs.smb.SmbFileFilter;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.LongField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreDoc;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    
    import com.strongit.util.BaseinfoConfigurer;
    
    public class FileIndex {
    	
    	private static String INDEX_DIR = "D:\index";
    	private static Analyzer analyzer = null;
    	private static Directory directory = null;
    	private static IndexWriter indexWriter = null;
    	private static String content = "";
    	
    	 public static void main(String[] args) {
    	        try {
    //	        createIndex();//创建索引
    //	            search("测试");
    //	            insert();//新增索引,不删除之前的
    //	            delete("1470817624520");
    //	            update();
    	        } catch (Exception e) {
    	            e.printStackTrace();
    	        }
    	    }
    
    	/**
    	 * 删除索引
    	 * 
    	 * @param @param str 删除的关键字 建立索引时的id
    	 * @param @throws Exception
    	 * @author wusongxiao
    	 * @date 2016年8月10日
    	 */
    	public static void delete(String str) throws Exception {
    		Date date1 = new Date();
    		analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    		directory = FSDirectory.open(new File(INDEX_DIR));
    
    		IndexWriterConfig config = new IndexWriterConfig(
    				Version.LUCENE_CURRENT, analyzer);
    		indexWriter = new IndexWriter(directory, config);
    
    		// indexWriter.deleteDocuments(new Term("filename",str));
    
    		indexWriter.deleteDocuments(new Term("id", str)); // 建立索引时 给这个索引赋一个id
    
    		indexWriter.close();
    
    		Date date2 = new Date();
    		System.out.println("删除索引耗时:" + (date2.getTime() - date1.getTime())
    				+ "ms
    ");
    	}
    
    	/**
    	 * 新增加索引,不覆盖之前的
    	 * 
    	 * @Description: TODO
    	 * @param @throws Exception
    	 * @return void
    	 * @throws
    	 * @author wusongxiao
    	 * @date 2016年8月10日
    	 */
    	public static void insert(List listname) throws Exception {
    
    //		String path = "smb://admini:2014wh@192.168.168.140/resource/Teaching/test001.txt";
    
    		for(int j =0;j<listname.size();j++){
    			
    			String path=   listname.get(j); //文件地址
    			SmbFile folder = new SmbFile(path);
    			List<SmbFile> fileList = new ArrayList<SmbFile>();
    			fileList.add(folder);
    			for (SmbFile file : fileList) {
    				content = "";
    				// 获取文件后缀
    				String type = file.getName().substring(
    						file.getName().lastIndexOf(".") + 1);
    				if ("txt".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readTxt(file.getPath(),"gb2312");
    
    				} else if ("doc".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readWorddoc(file.getPath());
    
    				} else if ("xls".equalsIgnoreCase(type)) {
    
    					content += ReadFile.xls2String(file.getPath());
    
    				} else if ("xlsx".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readExcel2007(file.getPath());
    
    				} else if ("ppt".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readPowerPoint(file.getPath());
    
    				} else if ("pdf".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readPdf(file.getPath());
    
    				}else if ("docx".equalsIgnoreCase(type)) {
    
    					content += ReadFile.readWorddocx(file.getPath());
    
    				}
    				
    				
    
    //				System.out.println("name :" + file.getName());//名称
    //				System.out.println("path :" + file.getPath());//地址
    //                              System.out.println("content :"+content);//content内容
    
    				try {
    					analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    					directory = FSDirectory.open(new File(INDEX_DIR));
    
    					File indexFile = new File(INDEX_DIR);
    					if (!indexFile.exists()) {
    						indexFile.mkdirs();
    					}
    					IndexWriterConfig config = new IndexWriterConfig(
    							Version.LUCENE_CURRENT, analyzer);
    					indexWriter = new IndexWriter(directory, config);
    			//		String ID = pathname[1].toString();//赋值一个唯一的ID,方便删除
    					Document document = new Document();
    					document.add(new TextField("filename", file.getName(),
    							Store.YES));
    					document.add(new TextField("content", content, Store.YES));
    					document.add(new TextField("path", file.getPath(), Store.YES));
    			//		document.add(new TextField("id", ID, Store.YES));
    					indexWriter.addDocument(document);
    					indexWriter.commit();
    
    					ReadFile.closeWriter(indexWriter);
    
    				} catch (Exception e) {
    					e.printStackTrace();
    				}
    				content = "";
    			}
    			
    		}
    		
    	}
    
    	/**
    	 * 查询索引
    	 * 
    	 * @Description: TODO
    	 * @param @param str 查询关键字
    	 * @param @throws Exception
    	 * @return void
    	 * @throws
    	 * @author wusongxiao
    	 * @date 2016年8月10日
    	 */
    	public static List search(String str) throws Exception {
    		directory = FSDirectory.open(new File(INDEX_DIR));
    		analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    		DirectoryReader ireader = DirectoryReader.open(directory);
    		IndexSearcher isearcher = new IndexSearcher(ireader);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "content", analyzer);//LUCENE_30 不分词查询,只搜关键词 Query query = parser.parse(str); List liatname = new ArrayList(); ScoreDoc[] hits = isearcher.search(query, null, 10000).scoreDocs; for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); System.out.println(hitDoc.get("filename"));// 文件名 liatname.add(hitDoc.get("filename")); // System.out.println(hitDoc.get("content"));//内容 } ireader.close(); directory.close(); return liatname; } /** * 更新索引 更新原来索引的内容---只是改变原来文件的索引 * * @Description: TODO * @param @throws Exception * @return void * @throws * @author wusongxiao * @date 2016年8月10日 */ public static void update() throws Exception { String path = "D:\file\file\f1\test2.txt"; SmbFile folder = new SmbFile(path); List<SmbFile> fileList = new ArrayList<SmbFile>(); fileList.add(folder); Date date1 = new Date(); for (SmbFile file : fileList) { content = ""; // 获取文件后缀 String type = file.getName().substring( file.getName().lastIndexOf(".") + 1); if ("txt".equalsIgnoreCase(type)) { content += ReadFile.readTxt(file.getPath(),"gb2312"); } else if ("doc".equalsIgnoreCase(type)) { content += ReadFile.readWorddoc(file.getPath()); } else if ("xls".equalsIgnoreCase(type)) { content += ReadFile.xls2String(file.getPath()); } else if ("xlsx".equalsIgnoreCase(type)) { content += ReadFile.readExcel2007(file.getPath()); } else if ("ppt".equalsIgnoreCase(type)) { content += ReadFile.readPowerPoint(file.getPath()); } else if ("pdf".equalsIgnoreCase(type)) { content += ReadFile.readPdf(file.getPath()); }else if ("docx".equalsIgnoreCase(type)) { content += ReadFile.readWorddocx(file.getPath()); } System.out.println("name :" + file.getName()); System.out.println("path :" + file.getPath()); // System.out.println("content :"+content);//content内容 System.out.println(); try { analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); directory = FSDirectory.open(new File(INDEX_DIR)); File indexFile = new File(INDEX_DIR); 
if (!indexFile.exists()) { indexFile.mkdirs(); } IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); indexWriter = new IndexWriter(directory, config); Long time = date1.getTime(); String tt = time.toString(); Document document = new Document(); document.add(new TextField("filename", file.getName(), Store.YES)); document.add(new TextField("content", content, Store.YES)); document.add(new TextField("path", file.getPath(), Store.YES)); document.add(new TextField("id", tt, Store.YES)); indexWriter.updateDocument(new Term("filename", "text1"), document); indexWriter.close(); ReadFile.closeWriter(indexWriter); } catch (Exception e) { e.printStackTrace(); } content = ""; } Date date2 = new Date(); System.out.println("更新索引耗时:" + (date2.getTime() - date1.getTime()) + "ms "); } /** * 创建索引,删除之前的索引,更新全部文件的索引 * * @Description: TODO * @param @param path * @param @return * @param @throws Exception * @return boolean * @throws * @author wusongxiao * @date 2016年8月10日 */ public static boolean createIndex(String path) throws Exception { // public static boolean createIndex() throws Exception { Date date1 = new Date(); String username = (String) BaseinfoConfigurer .getContextProperty("username"); String possword = (String) BaseinfoConfigurer .getContextProperty("possword"); String fileServerIp = (String) BaseinfoConfigurer .getContextProperty("fileServerIp"); String sharedirectory = (String) BaseinfoConfigurer .getContextProperty("sharedirectory"); path = "smb" + "://" + username + ":" + possword + "@" + fileServerIp + "/" + sharedirectory + "/"; //删除之前索引 ReadFile.deleteDir(new File(INDEX_DIR+"\")); // String path = "smb://admini:2014wh@192.168.168.140/resource/"; SmbFile folder = new SmbFile(path); SmbFile[] result = searchFile(folder);// 根目录下的所有文件夹文件 for (SmbFile file : result) { content = ""; // 获取文件后缀 String type = file.getName().substring( file.getName().lastIndexOf(".") + 1); if ("txt".equalsIgnoreCase(type)) { content += 
ReadFile.readTxt(file.getPath(),"gb2312"); } else if ("doc".equalsIgnoreCase(type)) { content += ReadFile.readWorddoc(file.getPath()); } else if ("xls".equalsIgnoreCase(type)) { content += ReadFile.xls2String(file.getPath()); } else if ("xlsx".equalsIgnoreCase(type)) { content += ReadFile.readExcel2007(file.getPath()); } else if ("ppt".equalsIgnoreCase(type)) { content += ReadFile.readPowerPoint(file.getPath()); } else if ("pdf".equalsIgnoreCase(type)) { content += ReadFile.readPdf(file.getPath()); }else if ("docx".equalsIgnoreCase(type)) { content += ReadFile.readWorddocx(file.getPath()); } System.out.println("name :" + file.getName()); System.out.println("path :" + file.getPath()); // System.out.println("content :"+content); System.out.println(); try { analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); directory = FSDirectory.open(new File(INDEX_DIR)); File indexFile = new File(INDEX_DIR); if (!indexFile.exists()) { indexFile.mkdirs(); } IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); indexWriter = new IndexWriter(directory, config); Document document = new Document(); document.add(new TextField("filename", file.getName(), Store.YES)); document.add(new TextField("content", content, Store.YES)); document.add(new TextField("path", file.getPath(), Store.YES)); // document.add(new TextField("id", tt, Store.YES)); indexWriter.addDocument(document); indexWriter.commit(); ReadFile.closeWriter(indexWriter); } catch (Exception e) { e.printStackTrace(); } content = ""; } Date date2 = new Date(); System.out.println("创建索引-----耗时:" + (date2.getTime() - date1.getTime()) + "ms "); return true; } /** * 递归查找所有的文件 * * @Description: TODO * @param @param folder * @param @return * @return SmbFile[] * @throws * @author wusongxiao * @date 2016年8月10日 */ public static SmbFile[] searchFile(SmbFile folder) { SmbFile[] subFolders = null; try { subFolders = folder.listFiles(new SmbFileFilter() { // 运用内部匿名类获得文件 @Override public boolean accept(SmbFile 
pathname) {// 实现FileFilter类的accept方法 try { if (pathname.isDirectory() || (pathname.isFile())) {// 目录或文件包含关键字 return true; } } catch (SmbException e) { } return false; } }); } catch (SmbException e1) { e1.printStackTrace(); } List<SmbFile> result = new ArrayList<SmbFile>();// 声明一个集合 for (int i = 0; i < subFolders.length; i++) {// 循环显示文件夹或文件 try { if (subFolders[i].isFile()) {// 如果是文件则将文件添加到结果列表中 result.add(subFolders[i]); } else {// 如果是文件夹,则递归调用本方法,然后把所有的文件加到结果列表中 SmbFile[] foldResult = searchFile(subFolders[i]); for (int j = 0; j < foldResult.length; j++) {// 循环显示文件 String smname = foldResult[j].toString(); String txtname = smname.substring(smname .lastIndexOf("/") + 1);// 截取文件名 String txtName = txtname.substring(txtname .lastIndexOf("."));// 截取格式 if (".txt".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".ppt".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".doc".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".xls".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".xlsx".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".pdf".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } } } } catch (SmbException e) { e.printStackTrace(); } } SmbFile files[] = new SmbFile[result.size()];// 声明文件数组,长度为集合的长度 result.toArray(files);// 集合数组化 return files; } }

      

    2.读取文档的方法类 txt,xlsx,xls,ppt,pdf,doc, docx(不能读取图片)

    package com.strongit.tool.retrieval;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.List;  
    
    import jcifs.smb.SmbFileInputStream;
    import jxl.Cell;
    import jxl.Sheet;
    import jxl.Workbook;
    
    import org.apache.lucene.index.IndexWriter;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.util.PDFTextStripper;
    import org.apache.poi.hslf.HSLFSlideShow;
    import org.apache.poi.hslf.model.Slide;
    import org.apache.poi.hslf.model.TextRun;
    import org.apache.poi.hslf.usermodel.SlideShow;
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.usermodel.Paragraph;
    import org.apache.poi.hwpf.usermodel.Range;
    import org.apache.poi.xssf.usermodel.XSSFCell;
    import org.apache.poi.xssf.usermodel.XSSFRow;
    import org.apache.poi.xssf.usermodel.XSSFSheet;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    
    /**
     * 读取文档方法
     * ClassName: ReadFile 
     * @Description: TODO
     * @date 2016年8月10日
     * @author wsx
     */
    public class ReadFile {
    	
    	 private static ReadFile indexManager;
    	
    	 /**
         * 读取doc文件内容
         * @param filepath 想要读取的文件地址
         * @return 返回文件内容
         */
    	 public static String readWorddoc(String filepath) {
    			StringBuffer content = new StringBuffer("");// 文档内容
    			try {
    				HWPFDocument doc = new HWPFDocument(new SmbFileInputStream(filepath));
    				Range range = doc.getRange();
    				int paragraphCount = range.numParagraphs();// 段落
    				for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据
    					Paragraph pp = range.getParagraph(i);
    					content.append(pp.text());
    				}
    
    			} catch (Exception e) {
    				e.printStackTrace();
    			}
    			return content.toString().trim();
    		}
    	 /**
    	  * docx 格式建立索引,图片没有读到,只读取的数据  
    	  * @Description: TODO
    	  * @param @param filepath
    	  * @param @return   
    	  * @return String  
    	  * @date 2016年8月12日
    	  */
    	 public static String readWorddocx(String filepath) {
    			StringBuffer content = new StringBuffer("");// 文档内容
    			try {  
    				//     D://file//docx.docx     D://file//doc.doc
    //				filepath = "D://file//docx.docx";
    				SmbFileInputStream in = new SmbFileInputStream(filepath);//载入文档 
    				//word docx 图片不会被读取,只读取数据   
                    XWPFDocument xwpf = new XWPFDocument(in);//得到word文档的信息  
                  List<XWPFParagraph> listParagraphs = xwpf.getParagraphs();//得到段落信息
                  
                  for(int i =0;i<listParagraphs.size();i++){
                	  String cont = listParagraphs.get(i).getRuns().toString();
                	  content.append(cont);
                  }
                   
    	        } catch (Exception e) {  
    	            e.printStackTrace();  
    	        }  
    			return content.toString().trim();
    		}
        
        /**
         * 读取xls文件内容
         * @param filepath 想要读取的文件对象
         * @return 返回文件内容
         */
        public static String xls2String(String filepath){
            String result = "";
            try{
                SmbFileInputStream fis = new SmbFileInputStream(filepath);   
                StringBuilder sb = new StringBuilder();   
                jxl.Workbook rwb = Workbook.getWorkbook(fis);   
                Sheet[] sheet = rwb.getSheets();   
                for (int i = 0; i < sheet.length; i++) {   
                    Sheet rs = rwb.getSheet(i);   
                    for (int j = 0; j < rs.getRows(); j++) {   
                       Cell[] cells = rs.getRow(j);   
                       for(int k=0;k<cells.length;k++)   
                       sb.append(cells[k].getContents() + " ");   
                    }   
                }   
                fis.close();   
                result += sb.toString();
            }catch(Exception e){
                e.printStackTrace();
            }
            return result;
        }
        /**
         * PDF格式  文件创建索引
         * @Description: TODO
         * @param @param path
         * @param @return
         * @param @throws Exception   
         * @return String  
         * @date 2016年8月11日
         */
        public static String readPdf(String path) throws Exception {
            StringBuffer content = new StringBuffer("");// 文档内容
            SmbFileInputStream fis = new SmbFileInputStream(path);
            PDFParser p = new PDFParser(fis);
            p.parse();
            PDFTextStripper ts = new PDFTextStripper();
            content.append(ts.getText(p.getPDDocument()));
            fis.close();
            return content.toString().trim();
        }
        
        /**
         * 读取xlsx格式的excel文档
         * @param @param filepath
         * @param @throws IOException   
         * @author wusongxiao
         * @date 2016年8月10日
         */
        public static String readExcel2007(String filepath) throws IOException {
    
    //		System.out.println(filepath);
    
    		StringBuffer content = new StringBuffer();
    
    		// 构造 XSSFWorkbook 对象,strPath 传入文件路径 **** SmbFileInputStream SMB读取文件 ***
    		XSSFWorkbook xwb = new XSSFWorkbook(new SmbFileInputStream(filepath));
    		// 循环工作表Sheet
    		for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
    			XSSFSheet xSheet = xwb.getSheetAt(numSheet);
    			if (xSheet == null) {
    				continue;
    			}
    			// 循环行Row
    			for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
    				XSSFRow xRow = xSheet.getRow(rowNum);
    				if (xRow == null) {
    					continue;
    				}
    				// 循环列Cell
    				for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
    					XSSFCell xCell = xRow.getCell(cellNum);
    					if (xCell == null) {
    						continue;
    					}
    					String s = null;
    					if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
    						content.append(xCell.getBooleanCellValue());
    					} else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
    						content.append(xCell.getNumericCellValue());
    					} else {
    						content.append(xCell.getStringCellValue()  + ""); //+ "
    "
    					}
    				}
    			}
    		}
    
    		return content.toString();
    	}
        /**
         * 读取txt文档
         * @param @param filepath  地址
         * @param @param charSet  编码格式
         * @param @throws IOException   
         * @author wusongxiao
         * @date 2016年8月10日
         */
        public static String readTxt(String filepath, String charSet)
    			throws IOException {
    		BufferedReader reader = new BufferedReader(new InputStreamReader(
    				new SmbFileInputStream(filepath), charSet));  //reader.readLine() 读取txt文本  String的
    		String line = new String();
    		String temp = new String();
    		while ((line = reader.readLine()) != null) {
    			temp += line;
    		}
    		reader.close();
    		return temp;
    	}
        /**
         * 读取ppt文件
         * @Description: TODO
         * @param @param filepath
         * @param @return   
         * @return String  
         * @date 2016年8月10日
         */
        public static String readPowerPoint(String filepath) {
    		StringBuffer content = new StringBuffer("");
    		try {
    			SlideShow ss = new SlideShow(new HSLFSlideShow(new SmbFileInputStream(filepath)));// is
    			// 为文件的InputStream,建立SlideShow
    			Slide[] slides = ss.getSlides();// 获得每一张幻灯片
    			for (int i = 0; i < slides.length; i++) {
    				TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun
    				for (int j = 0; j < t.length; j++) {
    					content.append(t[j].getText());// 这里会将文字内容加到content中去
    				}
    			}
    		} catch (Exception ex) {
    			System.out.println(ex.toString());
    		}
    		return content.toString();
    	}
        
        public static void closeWriter(IndexWriter indexWriter) throws Exception {
            if (indexWriter != null) {
                indexWriter.close();
            }
        }
        
        /**
         * 创建索引管理器
         * @return 返回索引管理器对象
         */
        public ReadFile getManager(){
            if(indexManager == null){
                this.indexManager = new ReadFile();
            }
            return indexManager;
        }
        /**
         * 删除目录下的所有索引
         * @Description: TODO
         * @param @param file
         * @param @return   
         * @return boolean  
         * @throws
         * @author wusongxiao
         * @date 2016年8月10日
         */
        public static boolean deleteDir(File file){
            if(file.isDirectory()){
                File[] files = file.listFiles();
                for(int i=0; i<files.length; i++){
                    deleteDir(files[i]);
                }
            }
            file.delete();
            return true;
        }
    
    }
    

      

    以上代码整体基于 SMB 文件服务器实现 Lucene 4.0 全文检索。如果要检索的是本地文件,只需把代码中带 Smb 前缀的类换成对应的本地版本即可,例如把 SmbFileInputStream 改为 FileInputStream、把 SmbFile 改为 File。

    
    
  • 相关阅读:
    168. Excel Sheet Column Title
    171. Excel Sheet Column Number
    264. Ugly Number II java solutions
    152. Maximum Product Subarray java solutions
    309. Best Time to Buy and Sell Stock with Cooldown java solutions
    120. Triangle java solutions
    300. Longest Increasing Subsequence java solutions
    63. Unique Paths II java solutions
    221. Maximal Square java solutions
    279. Perfect Squares java solutions
  • 原文地址:https://www.cnblogs.com/wusx/p/5765592.html
Copyright © 2011-2022 走看看