zoukankan      html  css  js  c++  java
  • Lucene学习入门记录

    前一段时间,公司让用Lucene检索文档,自己写了些代码,在这里记录一下,以免忘记了。

    其实,Lucene的入门还是很简单的,它的整体构造和关系型数据库差不多:一个键对应一个值,先生成索引,然后根据索引去查找文档内容,再将内容通过别的方式显示出来。

    Lucene创建、增、删、改索引:

    package com.haiyisoft.szgl.file.service.impl;
    
    import java.io.File;
    
    /**
     * Builds and maintains the Lucene index for archive records
     * ({@code FileManage}): full rebuild, incremental add, update and delete.
     *
     * <p>Every public operation opens its own {@link IndexWriter} on
     * {@code FileManage.searchCenterForDocPath} and always closes it again in a
     * {@code finally} block, so a failure does not leave the writer (and the
     * index write lock it holds) open.</p>
     *
     * @author    haojiahong
     * 
     * <p>Modification History:</p>
     * <p>Date             Author      Description</p>
     * <p>--------------------------------------------------------------</p>
     * <p>20151027        haojiahong              new</p>
     * <p>  </p>
     */
    @Component("schDocForDocBuilderService")
    public class SchDocForDocBuilderServiceImpl implements SchDocForDocBuilderService {
    
    	@Autowired
    	public DocService docService;
    
    	@Autowired
    	public FileContentService fileContentService;
    
    	/** Timestamp (ms) of the previous {@link #showtime()} call; 0 before the first call. */
    	private long time = 0;
    
    	/**
    	 * Rebuilds the whole Lucene index from scratch: makes sure the index
    	 * directory exists, removes the files in it, then indexes every
    	 * {@code FileManage} record.
    	 */
    	public void creatLucene() {
    		IndexWriter indexWriter = null;
    		try {
    			File indexDir = new File(FileManage.searchCenterForDocPath);
    			creatFile(indexDir);
    			delAllFile(indexDir);
    			indexWriter = openIndexWriter();
    
    			LogUtil.getAppLoger().debug("开始创建索引");
    
    			long indexcount = this.createIndex(indexWriter);
    			LogUtil.getAppLoger().debug("创建索引结束,共处理数据行数" + indexcount + "条");
    			indexWriter.commit();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		} finally {
    			closeIndexWriter(indexWriter);
    		}
    	}
    
    	/**
    	 * Indexes the complete data set.
    	 *
    	 * @param indexWriter open writer to add documents to
    	 * @return number of documents added, or -1 if indexing failed
    	 */
    	private long createIndex(IndexWriter indexWriter) {
    		try {
    			this.showtime();
    			long current = 0;
    			// One document per archive record; initFile(...) is the
    			// alternative strategy (one document per attachment).
    			current += this.initFileWithDocument(indexWriter);
    
    			return current;
    		} catch (Exception e) {
    			e.printStackTrace();
    			return -1;
    		}
    	}
    
    	/**
    	 * Adds one Lucene document per {@code FileManage} record. A record that
    	 * fails to index is reported and skipped so the rest still get indexed.
    	 *
    	 * @param indexWriter open writer to add documents to
    	 * @return number of documents added
    	 */
    	private long initFileWithDocument(IndexWriter indexWriter) {
    		long current = 0;
    		String jpql = "select file from FileManage file where 1=1";
    		@SuppressWarnings({ "unchecked", "rawtypes" })
    		List<FileManage> fmLs = (List) JPAUtil.find(jpql);
    		for (FileManage fm : fmLs) {
    			try {
    				indexWriter.addDocument(initLuceneDocument(indexWriter, current, fm));
    				current++;
    			} catch (Exception e) {
    				e.printStackTrace();
    			}
    		}
    		return current;
    	}
    
    	/**
    	 * Incrementally adds a single archive record to the index.
    	 *
    	 * @param fmUuid id of the {@code FileManage} record to index
    	 */
    	@Override
    	public void insertLucene(String fmUuid) {
    		IndexWriter indexWriter = null;
    		try {
    			indexWriter = openIndexWriter();
    			LogUtil.getAppLoger().debug("开始增量添加索引");
    			long indexcount = this.insertIndex(indexWriter, fmUuid);
    			LogUtil.getAppLoger().debug("添加索引结束,共处理数据行数" + indexcount + "条");
    			indexWriter.commit();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		} finally {
    			closeIndexWriter(indexWriter);
    		}
    	}
    
    	/**
    	 * Adds one record to the index.
    	 *
    	 * @param indexWriter open writer to add the document to
    	 * @param fmUuid id of the record
    	 * @return number of documents added, or -1 on failure
    	 */
    	private long insertIndex(IndexWriter indexWriter, String fmUuid) {
    		try {
    			this.showtime();
    			long current = 0;
    			current += this.insertFileWithDocument(indexWriter, fmUuid);
    			return current;
    		} catch (Exception e) {
    			e.printStackTrace();
    			return -1;
    		}
    	}
    
    	/**
    	 * Loads the record and adds its document.
    	 *
    	 * @return 1 if the document was added, 0 if the record does not exist
    	 *         or indexing failed
    	 */
    	private long insertFileWithDocument(IndexWriter indexWriter, String fmUuid) {
    		long current = 0;
    		FileManage fm = JPAUtil.loadById(FileManage.class, fmUuid);
    		if (fm == null) {
    			// Nothing to index; avoids a NullPointerException further down.
    			LogUtil.getAppLoger().debug("FileManage not found, uuid=" + fmUuid);
    			return current;
    		}
    		try {
    			indexWriter.addDocument(initLuceneDocument(indexWriter, current, fm));
    			current++;
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		return current;
    	}
    
    	/**
    	 * Re-indexes a single archive record, replacing the existing document
    	 * that has the same UUID.
    	 *
    	 * @param fmUuid id of the {@code FileManage} record to re-index
    	 */
    	@Override
    	public void updateLucene(String fmUuid) {
    		IndexWriter indexWriter = null;
    		try {
    			indexWriter = openIndexWriter();
    			LogUtil.getAppLoger().debug("开始更新索引");
    			long indexcount = this.updateIndex(indexWriter, fmUuid);
    			LogUtil.getAppLoger().debug("更新索引结束,共处理数据行数" + indexcount + "条");
    			indexWriter.commit();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		} finally {
    			closeIndexWriter(indexWriter);
    		}
    	}
    
    	/**
    	 * Updates one record in the index.
    	 *
    	 * @return number of documents updated, or -1 on failure
    	 */
    	private long updateIndex(IndexWriter indexWriter, String fmUuid) {
    		try {
    			this.showtime();
    			long current = 0;
    			current += this.updateFileWithDocument(indexWriter, fmUuid);
    			return current;
    		} catch (Exception e) {
    			e.printStackTrace();
    			return -1;
    		}
    	}
    
    	/**
    	 * Loads the record and replaces its document (matched on the UUID term).
    	 *
    	 * @return 1 if the document was replaced, 0 if the record does not exist
    	 *         or indexing failed
    	 */
    	private long updateFileWithDocument(IndexWriter indexWriter, String fmUuid) {
    		long current = 0;
    		FileManage fm = JPAUtil.loadById(FileManage.class, fmUuid);
    		if (fm == null) {
    			LogUtil.getAppLoger().debug("FileManage not found, uuid=" + fmUuid);
    			return current;
    		}
    		try {
    			indexWriter.updateDocument(new Term("UUID", fmUuid), initLuceneDocument(indexWriter, current, fm));
    			current++;
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		return current;
    	}
    
    	/**
    	 * Removes a single archive record from the index.
    	 *
    	 * @param fmUuid id of the {@code FileManage} record to remove
    	 */
    	@Override
    	public void deteleLucene(String fmUuid) {
    		IndexWriter indexWriter = null;
    		try {
    			indexWriter = openIndexWriter();
    			LogUtil.getAppLoger().debug("开始删除索引");
    			long indexcount = this.deteleIndex(indexWriter, fmUuid);
    			LogUtil.getAppLoger().debug("删除索引结束,共处理数据行数" + indexcount + "条");
    			indexWriter.commit();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		} finally {
    			closeIndexWriter(indexWriter);
    		}
    
    	}
    
    	/**
    	 * Deletes one record from the index.
    	 *
    	 * @param indexWriter open writer to delete from
    	 * @param fmUuid id of the record
    	 * @return number of delete requests issued, or -1 on failure
    	 */
    	private long deteleIndex(IndexWriter indexWriter, String fmUuid) {
    		try {
    			this.showtime();
    			long current = 0;
    			current += this.deleteFileWithDocument(indexWriter, fmUuid);
    			return current;
    		} catch (Exception e) {
    			e.printStackTrace();
    			return -1;
    		}
    	}
    
    	/**
    	 * Issues a delete for every document whose UUID term equals
    	 * {@code fmUuid}.
    	 *
    	 * @return 1 if the delete request was issued, 0 if it failed
    	 */
    	private long deleteFileWithDocument(IndexWriter indexWriter, String fmUuid) {
    		long current = 0;
    		try {
    			indexWriter.deleteDocuments(new Term("UUID", fmUuid));
    			current++;
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		return current;
    	}
    
    	/**
    	 * Removes several archive records from the index using one writer and
    	 * one commit, instead of opening and closing a writer per record.
    	 *
    	 * @param fmList records to remove; {@code null} or empty is a no-op
    	 */
    	@Override
    	public void deteleLuceneLs(List<FileManage> fmList) {
    		if (fmList == null || fmList.isEmpty()) {
    			return;
    		}
    		IndexWriter indexWriter = null;
    		try {
    			indexWriter = openIndexWriter();
    			LogUtil.getAppLoger().debug("开始删除索引");
    			this.showtime();
    			long indexcount = 0;
    			for (FileManage fm : fmList) {
    				indexcount += this.deleteFileWithDocument(indexWriter, fm.getUuid());
    			}
    			LogUtil.getAppLoger().debug("删除索引结束,共处理数据行数" + indexcount + "条");
    			indexWriter.commit();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		} finally {
    			closeIndexWriter(indexWriter);
    		}
    	}
    
    	/**
    	 * Builds the Lucene document for one archive record (shared by rebuild,
    	 * add and update). The text of all attachments is merged into the single
    	 * CONTENT field; optional fields are only added when they have a value.
    	 *
    	 * @param indexWriter unused; kept so existing call sites stay unchanged
    	 * @param current unused; kept so existing call sites stay unchanged
    	 * @param fm the archive record to convert
    	 * @return the document to hand to the index writer
    	 */
    	private Document initLuceneDocument(IndexWriter indexWriter, long current, FileManage fm) {
    		String fileContent = readAttachmentText(fm);
    
    		Document doc = new Document();
    		// NOT_ANALYZED so the UUID can be matched exactly by update/delete.
    		doc.add(new Field("UUID", fm.getUuid(), Store.YES, Index.NOT_ANALYZED));
    		if (!SzglCommonUtil.strIsNull(fileContent)) {
    			doc.add(new Field("CONTENT", fileContent, Store.YES, Index.ANALYZED));// attachment text
    		}
    		if (!SzglCommonUtil.strIsNull(fm.getTitle())) {
    			doc.add(new Field("TITLE", fm.getTitle(), Store.YES, Index.ANALYZED));// archive title
    		}
    		if (!SzglCommonUtil.strIsNull(fm.getDocNum())) {
    			doc.add(new Field("DOCNUM", fm.getDocNum(), Store.YES, Index.ANALYZED));// document number
    		}
    		if (fm.getFromMan() != null) {
    			doc.add(new Field("FROMMAN", fm.getFromMan().toString(), Store.YES, Index.ANALYZED));// creator
    		}
    		if (fm.getType() != null) {
    			doc.add(new Field("TYPE", fm.getType(), Store.YES, Index.ANALYZED));// archive category
    		}
    		if (fm.getStatus() != null) {
    			doc.add(new Field("STATUS", fm.getStatus(), Store.YES, Index.ANALYZED));// archive status
    		}
    
    		if (!SzglCommonUtil.strIsNull(fm.getIsShare())) {
    			doc.add(new Field("ISHARE", fm.getIsShare(), Store.YES, Index.ANALYZED));// shared flag
    		}
    		if (fm.getFromTime() != null) {
    			doc.add(new Field("FROMTIME", fm.getFromTime() + "", Store.YES, Index.ANALYZED));// received time
    		}
    		if (!SzglCommonUtil.strIsNull(fm.getBoxUuid())) {
    			FileBox filebox = JPAUtil.loadById(FileBox.class, fm.getBoxUuid());
    			// The box may have been removed since the record was linked to it.
    			if (filebox != null && !SzglCommonUtil.strIsNull(filebox.getYearCode())) {
    				doc.add(new Field("YEARCODE", filebox.getYearCode(), Store.YES, Index.ANALYZED));// year of the box
    			}
    		}
    
    		return doc;
    	}
    
    	/**
    	 * Concatenates the extracted text of every attachment of {@code fm}.
    	 * Attachments whose text cannot be extracted contribute nothing (the
    	 * extractor's {@code null} result is skipped rather than appended as
    	 * the literal string "null").
    	 *
    	 * @return the merged text; empty when there are no attachments
    	 */
    	private String readAttachmentText(FileManage fm) {
    		StringBuilder merged = new StringBuilder();
    		// TODO version number still to be decided
    		List<EdocFileObjectRelation> fileList = docService.queryFiles(DocConstant.NO_DIR, fm.getUuid(), fm.getClass()
    				.getName(), null);
    		if (fileList == null) {
    			return "";
    		}
    		for (EdocFileObjectRelation file : fileList) {
    			String text = extractText(file);
    			if (text != null) {
    				merged.append(text);
    			}
    		}
    		return merged.toString();
    	}
    
    	/**
    	 * Extracts the text of one attachment; the file type is taken from the
    	 * part of the file name after the last dot. The attachment stream is
    	 * always closed.
    	 *
    	 * @return the extracted text, or whatever {@code FileManageUtil.getContent}
    	 *         returns for this type (may be {@code null})
    	 */
    	private String extractText(EdocFileObjectRelation file) {
    		InputStream fs = docService.getFileInputStream(file.getFileId());
    		try {
    			String fileName = file.getEdocFile().getName();
    			String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
    			return FileManageUtil.getContent(fileType, fs);
    		} finally {
    			closeQuietly(fs);
    		}
    	}
    
    	/**
    	 * Opens an index writer on {@code FileManage.searchCenterForDocPath}
    	 * using the IK analyzer. The caller must close it.
    	 */
    	private IndexWriter openIndexWriter() throws IOException {
    		File indexDir = new File(FileManage.searchCenterForDocPath);
    		Directory dir = FSDirectory.open(indexDir);
    		Analyzer luceneAnalyzer = new IKAnalyzer();
    		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
    		return new IndexWriter(dir, iwc);
    	}
    
    	/** Closes the writer if it was opened, reporting (not propagating) a close failure. */
    	private void closeIndexWriter(IndexWriter indexWriter) {
    		if (indexWriter == null) {
    			return;
    		}
    		try {
    			indexWriter.close();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		}
    	}
    
    	/** Closes the stream if it is non-null, reporting (not propagating) a close failure. */
    	private void closeQuietly(InputStream stream) {
    		if (stream == null) {
    			return;
    		}
    		try {
    			stream.close();
    		} catch (IOException ex) {
    			ex.printStackTrace();
    		}
    	}
    
    	/**
    	 * Deletes the regular files directly inside {@code file}
    	 * (sub-directories are left alone).
    	 *
    	 * @param file directory to empty
    	 * @return {@code true} if the directory could be listed and every regular
    	 *         file in it was deleted; {@code false} otherwise
    	 */
    	private boolean delAllFile(File file) {
    		if (file == null) {
    			return false;
    		}
    		// listFiles() returns null when the path is not a directory or cannot be read.
    		File[] children = file.listFiles();
    		if (children == null) {
    			return false;
    		}
    		boolean allDeleted = true;
    		for (File child : children) {
    			if (child.isFile() && !child.delete()) {
    				allDeleted = false;
    			}
    		}
    		return allDeleted;
    	}
    
    	/**
    	 * Logs the start time on the first call and, on later calls, the
    	 * milliseconds elapsed since the previous call.
    	 */
    	private void showtime() {
    		long time1 = System.currentTimeMillis();
    		if (time > 0) {
    			LogUtil.getAppLoger().debug("MilliSecond:" + (time1 - time));
    		} else {
    			LogUtil.getAppLoger().debug("Start time:" + (new Timestamp(System.currentTimeMillis())));
    		}
    		time = time1;
    	}
    
    	/** Creates the directory (including missing parents) if it does not exist yet. */
    	private void creatFile(File file) {
    		if (!file.exists()) {
    			file.mkdirs();
    		}
    	}
    
    	/**
    	 * Alternative indexing strategy: one Lucene document per attachment
    	 * instead of one per archive record. Not called at present (its call in
    	 * {@code createIndex} is disabled).
    	 *
    	 * @param indexWriter open writer to add documents to
    	 * @return number of documents added
    	 */
    	private long initFile(IndexWriter indexWriter) {
    		long current = 0;
    		String jpql = "select file from FileManage file where 1=1";
    		@SuppressWarnings({ "unchecked", "rawtypes" })
    		List<FileManage> fmLs = (List) JPAUtil.find(jpql);
    		for (FileManage fm : fmLs) {
    			// TODO version number still to be decided
    			List<EdocFileObjectRelation> fileList = docService.queryFiles(DocConstant.NO_DIR, fm.getUuid(), fm
    					.getClass().getName(), null);
    			if (fileList == null) {
    				continue;
    			}
    			for (EdocFileObjectRelation file : fileList) {
    				try {
    					String fileName = file.getEdocFile().getName();
    					String fileContent = extractText(file);
    
    					Document doc = new Document();
    					doc.add(new Field("UUID", fm.getUuid(), Store.YES, Index.NO));
    					doc.add(new Field("FILENAME", fileName, Store.YES, Index.NO));// attachment name
    					if (!SzglCommonUtil.strIsNull(fileContent)) {
    						doc.add(new Field("CONTENT", fileContent, Store.YES, Index.ANALYZED));// attachment text
    					}
    					if (!SzglCommonUtil.strIsNull(fm.getTitle())) {
    						doc.add(new Field("TITLE", fm.getTitle(), Store.YES, Index.ANALYZED));// archive title
    					}
    					if (!SzglCommonUtil.strIsNull(fm.getDocNum())) {
    						doc.add(new Field("DOCNUM", fm.getDocNum(), Store.YES, Index.ANALYZED));// document number
    					}
    					if (fm.getFromMan() != null) {
    						doc.add(new Field("FROMMAN", fm.getFromMan().toString(), Store.YES, Index.ANALYZED));// creator
    					}
    
    					indexWriter.addDocument(doc);
    					current++;
    					// Progress line every 10000 documents.
    					if (current % 10000 == 0) {
    						LogUtil.getAppLoger().debug("current row num:" + current);
    					}
    				} catch (Exception ex) {
    					ex.printStackTrace();
    				}
    			}
    
    		}
    		return current;
    	}
    }
    

    根据索引去查询,并将关键词标红

    package com.haiyisoft.szgl.file.service.impl;
    
    import java.io.File;
    
    /**
     * 通过Lucene查询
     *
     * @author    haojiahong
     * 
     * <p>Modification History:</p>
     * <p>Date             Author      Description</p>
     * <p>--------------------------------------------------------------</p>
     * <p>20151102        haojiahong              new</p>
     * <p>  </p>
     */
    
    @Component("fileSchByLuceneService")
    public class FileSchByLuceneServiceImpl implements FileSchByLuceneService {
    
    	@Override
    	public List<FileManage> retrieveByLucene(String keyword, String titleSch, String docNumSch, String typeSch,
    			String isShareSch, Timestamp yearCodeBegin, Timestamp yearCodeEnd, SortParamList sortParamList,
    			PageInfo pageInfo) {
    		List<FileManage> result = new ArrayList<FileManage>();
    		IndexSearcher searcher = null;
    		String indexDir = FileManage.searchCenterForDocPath;
    		File file = new File(indexDir);
    		if ((SzglCommonUtil.strIsNull(keyword) && SzglCommonUtil.strIsNull(docNumSch)
    				&& SzglCommonUtil.strIsNull(titleSch) && SzglCommonUtil.strIsNull(typeSch) && SzglCommonUtil
    					.strIsNull(isShareSch)) || (!file.exists())) {
    			if (pageInfo != null) {
    				pageInfo.setAllRowNum(0);
    			}
    			return null;
    		}
    
    		try {
    			Directory dir = FSDirectory.open(new File(indexDir));
    			IndexReader reader = IndexReader.open(dir);
    			searcher = new IndexSearcher(reader);
    			BooleanQuery query = new BooleanQuery();
    
    			Analyzer anal = new IKAnalyzer();
    			QueryParser qp = new QueryParser(Version.LUCENE_36, "CONTENT", anal);
    			QueryParser qpTitle = new QueryParser(Version.LUCENE_36, "TITLE", anal);
    			QueryParser qpDocNum = new QueryParser(Version.LUCENE_36, "DOCNUM", anal);
    			QueryParser qpIshare = new QueryParser(Version.LUCENE_36, "ISHARE", anal);
    			if (!SzglCommonUtil.strIsNull(keyword)) {
    				query.add(qp.parse(keyword), Occur.MUST);
    			}
    			if (!SzglCommonUtil.strIsNull(titleSch)) {
    				query.add(qpTitle.parse(titleSch), Occur.MUST);
    			}
    			if (!SzglCommonUtil.strIsNull(docNumSch)) {
    				query.add(qpDocNum.parse(docNumSch), Occur.MUST);
    			}
    			if (!SzglCommonUtil.strIsNull(typeSch)) {
    				query.add(new TermQuery(new Term("TYPE", typeSch)), Occur.MUST);
    			}
    			if (!SzglCommonUtil.strIsNull(isShareSch)) {
    				query.add(qpIshare.parse(isShareSch), Occur.MUST);
    			}
    			if (yearCodeBegin != null || yearCodeEnd != null) {
    				query.add(new TermRangeQuery("YEARCODE", yearCodeBegin.toString(), yearCodeEnd.toString(), true, true),
    						Occur.MUST);
    			}
    			ScoreDoc[] hits = searcher.search(query, Integer.MAX_VALUE).scoreDocs;
    			int nowPagestart = (pageInfo.getCurPageNum() - 1) * pageInfo.getRowOfPage();// 当前页第一条数据是总数的第几条数据
    			int allPage = hits.length;// 总条数
    			pageInfo.setAllRowNum(allPage);
    			int nowPageEnd = (nowPagestart + pageInfo.getRowOfPage()) < allPage ? (nowPagestart + pageInfo
    					.getRowOfPage()) : allPage;
    			for (int i = nowPagestart; i < nowPageEnd; i++) {
    				FileManage fm = new FileManage();
    				Document doc = searcher.doc(hits[i].doc);
    				fm.setUuid(doc.get("UUID"));
    				if (!SzglCommonUtil.strIsNull(docNumSch)) {
    					fm.setDocNum(lighterStr(anal, qpDocNum.parse(docNumSch), doc.get("DOCNUM"), "DOCNUM"));
    				} else {
    					fm.setDocNum(doc.get("DOCNUM"));
    				}
    				if (!SzglCommonUtil.strIsNull(titleSch)) {
    					fm.setTitle(lighterStr(anal, qpTitle.parse(titleSch), doc.get("TITLE"), "TITLE"));
    				} else {
    					fm.setTitle(doc.get("TITLE"));
    				}
    				if (!SzglCommonUtil.strIsNull(keyword)) {
    					fm.setFileContent(lighterStr(anal, qp.parse(keyword), doc.get("CONTENT"), "CONTENT"));
    				} else {
    					fm.setFileContent(doc.get("CONTENT"));
    				}
    				fm.setFromMan(Long.valueOf(doc.get("FROMMAN")));
    				fm.setFileName(doc.get("FILENAME"));
    				fm.setType(doc.get("TYPE"));
    				fm.setStatus(doc.get("STATUS"));
    				fm.setFromTime(Timestamp.valueOf(doc.get("FROMTIME")));
    				result.add(fm);
    			}
    		} catch (IOException e) {
    			e.printStackTrace();
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			if (searcher != null)
    				try {
    					searcher.close();
    				} catch (IOException e) {
    					e.printStackTrace();
    				}
    		}
    
    		return result;
    	}
    
    	private String lighterStr(Analyzer a, Query query, String txt, String fieldname) throws Exception {
    		String str = null;
    		QueryScorer scorer = new QueryScorer(query);
    		Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
    		Formatter fmt = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
    		Highlighter lighter = new Highlighter(fmt, scorer);
    		lighter.setTextFragmenter(fragmenter);
    		str = lighter.getBestFragment(a, fieldname, txt);
    		if (str == null)
    			return txt;
    		return str;
    	}
    
    }
    

      一些工具类,用于读文档内容。

    // Copyright (C) 2014-2020 Yantai HaiYi Software Co.,Ltd
    package com.haiyisoft.szgl.file.util;
    
    import java.io.BufferedReader;
    
    /**
     * Utility for the archive module: extracts plain text from attachments.
     *
     * @author    haojiahong
     * 
     * <p>Modification History:</p>
     * <p>Date             Author      Description</p>
     * <p>--------------------------------------------------------------</p>
     * <p>20151102       haojiahong              new</p>
     * <p>  </p>
     */
    public class FileManageUtil {
    
    	/**
    	 * Extracts the text of a document according to its file type.
    	 *
    	 * <p>Supported types (matched case-insensitively): {@code doc},
    	 * {@code docx}, {@code txt} and {@code pdf}. The stream itself is not
    	 * closed here; that is left to the caller.</p>
    	 *
    	 * @param type file extension without the dot; may be {@code null}
    	 * @param fs stream positioned at the start of the document
    	 * @return the extracted text; {@code null} when the type is not
    	 *         supported or extraction fails (the failure is printed)
    	 */
    	public static String getContent(String type, InputStream fs) {
    		String text = null;
    		if ("doc".equalsIgnoreCase(type)) {
    			try {
    				POITextExtractor ex = new WordExtractor(fs);
    				text = ex.getText();
    			} catch (Exception e) {
    				e.printStackTrace();
    			}
    		} else if ("docx".equalsIgnoreCase(type)) {
    			try {
    				OPCPackage opcPackage = OPCPackage.open(fs);
    				POITextExtractor ex = new XWPFWordExtractor(opcPackage);
    				text = ex.getText();
    			} catch (Exception e) {
    				e.printStackTrace();
    			}
    		} else if ("txt".equalsIgnoreCase(type)) {
    			// NOTE(review): decodes with the platform default charset, as before.
    			BufferedReader reader = new BufferedReader(new InputStreamReader(fs));
    			// Start from an empty builder: appending to a null String would
    			// prefix the result with the literal "null".
    			StringBuilder lines = new StringBuilder();
    			try {
    				String line;
    				while ((line = reader.readLine()) != null) {
    					if (lines.length() > 0) {
    						// Keep a break between lines so the last word of one
    						// line does not fuse with the first word of the next.
    						lines.append('\n');
    					}
    					lines.append(line);
    				}
    				text = lines.toString();
    			} catch (IOException e) {
    				e.printStackTrace();
    			}
    		} else if ("pdf".equalsIgnoreCase(type)) {
    			PDDocument pdfDocument = null;
    			try {
    				pdfDocument = PDDocument.load(fs);
    				text = new PDFTextStripper().getText(pdfDocument);
    			} catch (IOException e) {
    				e.printStackTrace();
    			} finally {
    				// Close the loaded document whether or not extraction succeeded.
    				if (pdfDocument != null) {
    					try {
    						pdfDocument.close();
    					} catch (IOException e) {
    						e.printStackTrace();
    					}
    				}
    			}
    		}
    
    		return text;
    	}
    }
    

      

  • 相关阅读:
    npm 之 --save , -D,--save -dev的区别
    webpack 之 打包(最新版)
    npm 与 yarn 对比
    webpack 之 打包图片文件
    webpack 之 打包less文件
    javascript 之 Event Loop
    package.json中type的含义
    webpack 之 打包css文件操作
    常见问题 之 webpack打包css问题
    类方法和对象方法
  • 原文地址:https://www.cnblogs.com/haojiahong/p/4959427.html
Copyright © 2011-2022 走看看