使用 Apache POI 解析 Excel 文件:
package excellucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;

import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;

// NOTE(review): JDK-internal type that has nothing to do with POI; the class below no longer
// references it. The import is kept only so the import list stays unchanged.
import com.sun.media.sound.InvalidFormatException;

/**
 * Walks a directory, finds the Excel workbooks in it and dumps the cells of each workbook's
 * first sheet to the console and to a text file.
 */
public class ParseExcel {

    /**
     * Lists the Excel files in a hard-coded directory and parses each of them.
     *
     * @param args unused
     * @throws IOException if a canonical path cannot be resolved
     */
    public static void main(String[] args) throws IOException {
        // Backslashes have to be escaped inside a Java string literal.
        String path = "C:\\Users\\Desktop\\a01hos\\img";
        File f = new File(path);

        File[] files = f.listFiles();
        File[] filesxls = f.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                // Compare case-insensitively so that e.g. "REPORT.XLS" is picked up as well.
                String lower = name.toLowerCase(java.util.Locale.ROOT);
                return lower.endsWith(".xls") || lower.endsWith(".xlsx");
            }
        });
        if (files == null || filesxls == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("Not a readable directory: " + path);
            return;
        }

        System.out.println(files.length);
        System.out.println("Excel文件有: " + filesxls.length);

        ParseExcel parser = new ParseExcel();
        for (File f2 : filesxls) {
            String fileDirectPathName = f2.getCanonicalPath();
            System.out.println(fileDirectPathName);
            // System.out.println("文件名: " + f2.getName());
            parser.parseXml(fileDirectPathName);
        }

        /*
         * Disabled sketch of the Lucene indexing step, kept for reference:
         *
         * IndexWriter writer; // create the Lucene IndexWriter
         * Directory dir = FSDirectory.open(Paths.get("f:/excelindex"));
         * writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
         *
         * for (File f2 : filesxls) {
         *     // FileReader fr = new FileReader(f);
         *     // BufferedReader br = new BufferedReader(fr);
         *     System.out.println(f2.getCanonicalPath());
         *     System.out.println("文件名: " + f2.getName());
         *
         *     // build the document that gets indexed
         *     Document document = new Document();
         *
         *     Document doc = new Document();
         *     doc.add(new Field("contents", ExcelFileReader(f2.getCanonicalPath()),
         *             TextField.TYPE_NOT_STORED));
         *     doc.add(new Field("filename", f2.getName(), TextField.TYPE_STORED));
         *     doc.add(new StringField("fullpath", f2.getCanonicalPath(), Field.Store.YES));
         *
         *     writer.addDocument(doc);
         *
         *     writer.numDocs();
         * }
         */
    }

    /**
     * Extracts the text of an Excel workbook with POI's {@link ExcelExtractor}.
     *
     * <p>The file is read through {@link HSSFWorkbook}, i.e. the binary {@code .xls} format.
     * Formula cells contribute their formula text rather than the computed result, and sheet
     * names are left out of the output.
     *
     * @param fileName path of the workbook to read
     * @return the extracted text
     * @throws IOException if the file cannot be opened or read
     */
    public static String ExcelFileReader(String fileName) throws IOException {
        // try-with-resources releases the file handle and the workbook even when extraction fails.
        try (InputStream in = new FileInputStream(fileName);
                HSSFWorkbook wb = new HSSFWorkbook(in)) {
            ExcelExtractor extractor = new ExcelExtractor(wb);
            extractor.setFormulasNotResults(true);
            extractor.setIncludeSheetNames(false);
            return extractor.getText();
        }
    }

    /**
     * Prints every cell of the workbook's first sheet to standard output and appends each cell
     * to the output text file via {@link #save(String)}.
     *
     * <p>Failures while opening or reading the workbook are reported with a stack trace and
     * otherwise ignored, so that one bad file does not stop a batch run.
     *
     * @param filename path of the {@code .xls} or {@code .xlsx} workbook
     */
    public void parseXml(String filename) {
        // Open read-only: nothing is modified here, and closing a read-write OOXML package
        // would write it back to the source file.
        try (Workbook wb = WorkbookFactory.create(new File(filename), null, true)) {
            Sheet sheet = wb.getSheetAt(0);
            for (Row row : sheet) {
                for (Cell cell : row) {
                    String text = getCellValue(cell) + "---";
                    System.out.print(text);
                    save(text);
                }
                System.out.println();
            }
        } catch (EncryptedDocumentException | IOException
                | org.apache.poi.openxml4j.exceptions.InvalidFormatException e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a cell to its textual form.
     *
     * @param cell the cell to read
     * @return the cell content as a {@code String}: the formula text for formula cells, the
     *         error code for error cells, and {@code null} for blank or unknown cell types
     */
    public Object getCellValue(Cell cell) {
        int type = cell.getCellType();
        String show = null;
        switch (type) {
        case Cell.CELL_TYPE_BLANK: // empty cell
            show = null;
            break;
        case Cell.CELL_TYPE_BOOLEAN: // boolean
            show = String.valueOf(cell.getBooleanCellValue());
            break;
        case Cell.CELL_TYPE_ERROR: // error code
            show = String.valueOf(cell.getErrorCellValue());
            break;
        case Cell.CELL_TYPE_FORMULA: // formula text, not its result
            show = cell.getCellFormula();
            break;
        case Cell.CELL_TYPE_NUMERIC: // number
            show = String.valueOf(cell.getNumericCellValue());
            break;
        case Cell.CELL_TYPE_STRING: // string
            show = cell.getStringCellValue();
            break;
        default:
            show = null;
        }
        return show;
    }

    /**
     * Appends a string as one line to {@code file/haha.txt}, creating the file and its parent
     * directory when they do not exist yet.
     *
     * @param str the text to append
     * @return {@code true} if the line was written, {@code false} on any I/O failure
     */
    public boolean save(String str) {
        File f = new File("file/haha.txt");

        // FileWriter creates the file itself but not a missing parent directory.
        File parent = f.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            System.err.println("Cannot create directory: " + parent);
            return false;
        }

        // The second FileWriter argument selects append mode.
        try (PrintWriter out = new PrintWriter(new FileWriter(f, true))) {
            out.println(str);
            out.flush();
            // PrintWriter swallows IOExceptions; checkError() is the only way to see them.
            return !out.checkError();
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }
}
使用 Lucene 检索索引:
package excellucene;

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs a query against the {@code filename} field of an existing Lucene index and prints the
 * stored {@code fullpath} of every hit.
 */
public class SearchExcel {

    /** Index location used when no command-line arguments are given. */
    private static final String DEFAULT_INDEX_DIR = "F:\\excelindex";

    /** Query string used when no command-line arguments are given. */
    private static final String DEFAULT_QUERY = "zhangxing";

    /**
     * Entry point. With two arguments they are taken as index directory and query string; with
     * none the built-in defaults are used.
     *
     * @param args either empty, or {@code <index dir> <query>}
     * @throws IllegalArgumentException if any other number of arguments is passed
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if the query string is not valid query syntax
     */
    public static void main(String[] args) throws IOException, ParseException {
        if (args.length != 0 && args.length != 2) {
            throw new IllegalArgumentException(
                    "Usage: java " + SearchExcel.class.getName() + " <index dir> <query>");
        }
        boolean fromArgs = args.length == 2;
        String indexDir = fromArgs ? args[0] : DEFAULT_INDEX_DIR;
        String q = fromArgs ? args[1] : DEFAULT_QUERY;
        search(indexDir, q);
    }

    /**
     * Searches the index for the given query and prints up to ten matching documents.
     *
     * <p>A summary line (hit count and elapsed time) goes to standard error; the
     * {@code fullpath} field of each hit goes to standard output.
     *
     * @param indexDir directory holding the Lucene index
     * @param q query string in classic QueryParser syntax, matched against {@code filename}
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if {@code q} cannot be parsed
     */
    public static void search(String indexDir, String q) throws IOException, ParseException {
        // Directory, reader and analyzer all hold resources; close them in reverse order.
        try (FSDirectory dir = FSDirectory.open(Paths.get(indexDir));
                IndexReader reader = DirectoryReader.open(dir);
                Analyzer analyzer = new StandardAnalyzer()) {
            IndexSearcher is = new IndexSearcher(reader);

            // QueryParser ships in a separate jar: lucene-queryparser-7.4.0.jar
            QueryParser parser = new QueryParser("filename", analyzer);
            Query query = parser.parse(q);

            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start)
                    + " milliseconds) that matched query'" + q + "':");

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("fullpath"));
            }
        }
    }
}
使用的 jar 包: