lucene总结
公司项目:portal 中期刊文章的内容作为大字段(CLOB)存储在 Oracle 中。首页有一个搜索功能,要求列出所有内容中包含搜索关键词的文章标题。也就是说,要用 Lucene 对数据库的大字段建立索引(索引由计划任务定时建立)并进行搜索。
==================定时建立索引文件:===============
Main方法:
- package zxt.lucene.index;
- import java.util.Timer;
- public class IndexerServer {
- /**
- * 定时调用建立索引任务
- * @author wulihai
- * @create 2009-06-02
- */
- public static void main(String[] args) {
- String propFile = "directory.properties";
- Config.setConfigFileName(propFile);
- Timer timer = new Timer();
- LuceneDBIndexerTask luceneTask=LuceneDBIndexerTask.getInstance();
- timer.scheduleAtFixedRate(luceneTask, 0,DataTypeUtil.toLong(Constant.CREATE_INDEX_SLEEP_TIME));
- }
- }
定时调用建立索引任务(代码与上面的 Main 方法相同):
- package zxt.lucene.index;
- import java.util.Timer;
- public class IndexerServer {
- /**
- * 定时调用建立索引任务
- * @author wulihai
- * @create 2009-06-02
- */
- public static void main(String[] args) {
- String propFile = "directory.properties";
- Config.setConfigFileName(propFile);
- Timer timer = new Timer();
- LuceneDBIndexerTask luceneTask=LuceneDBIndexerTask.getInstance();
- timer.scheduleAtFixedRate(luceneTask, 0,DataTypeUtil.toLong(Constant.CREATE_INDEX_SLEEP_TIME));
- }
- }
建立索引的核心实现:
- package zxt.lucene.index;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.IOException;
- import java.io.StringWriter;
- import java.sql.Connection;
- import java.sql.DriverManager;
- import java.sql.ResultSet;
- import java.sql.SQLException;
- import java.sql.Statement;
- import java.text.SimpleDateFormat;
- import java.util.Arrays;
- import java.util.Date;
- import java.util.TimerTask;
- import oracle.sql.CLOB;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.IndexWriter;
- /**
- * 建立索引的任务类
- * @author wulihai
- * @create 2009-06-02
- */
- public class LuceneDBIndexerTask extends TimerTask {
- //缺省索引目录
- private static String DEFAULT_INDEX_DIR="C:\\IndexDB";
- //临时索引目录的父目录
- private File parentDir=null;
- //被搜索的索引文件
- private static LuceneDBIndexerTask index=new LuceneDBIndexerTask();
- //构造方法
- private LuceneDBIndexerTask(){
- String dirStr=Constant.INDEX_STORE_DIRECTORY;
- if(dirStr!=null&&!"".equals(dirStr)){
- this.parentDir=new File(dirStr);
- }else{
- this.parentDir=new File(DEFAULT_INDEX_DIR);
- }
- if(!this.parentDir.exists()){
- this.parentDir.mkdir();
- }
- }
- /**
- * 单实例访问接口
- * @return
- */
- public static LuceneDBIndexerTask getInstance(){
- return index;
- }
- /**
- * 锁定目录以及文件
- * 只允许单线程访问
- *
- */
- /*public synchronized void singleRunning(){
- if(flag==false){
- flag=true;
- run(parentDir);
- }
- }*/
- /**
- * 为数据库字段建立索引
- */
- public void run() {
- System.out.println("====LuceneDBIndexerTask$run()===============");
- System.out.println("~~~开始建立索引文件~~~~~~~~~~~~~~~");
- Connection conn=null;
- Statement stmt=null;
- ResultSet rs=null;
- try {
- Class.forName(Constant.DB_DRIVER_STRING);
- conn = DriverManager.getConnection(Constant.DB_URI_STRING, Constant.DB_USERNAME, Constant.DB_PWD);
- stmt = conn.createStatement();
- rs = stmt.executeQuery(Constant.DB_QUERY_STRING);
- File file=new File(parentDir+File.separator+new SimpleDateFormat("yyyyMMddHHmmss").format(new Date())+File.separator);
- if(!file.exists()){
- file.mkdir();
- }
- IndexWriter writer = new IndexWriter(file,new StandardAnalyzer(), true);
- long startTime = new Date().getTime();
- while (rs.next()) {
- Document doc = new Document();
- doc.add(new Field("ARTICLEID", rs.getString("ARTICLEID"), Field.Store.YES,Field.Index.TOKENIZED));
- doc.add(new Field("TITLE", rs.getString("TITLE"), Field.Store.YES,Field.Index.TOKENIZED));
- doc.add(new Field("USERNAME", rs.getString("USERNAME"), Field.Store.YES,Field.Index.TOKENIZED));
- doc.add(new Field("USERID", rs.getString("USERID"), Field.Store.YES,Field.Index.TOKENIZED));
- //对日期建立索引
- String createdate=new SimpleDateFormat("yyyy-MM-dd").format(rs.getTimestamp("CREATEDATE"));
- doc.add(new Field("CREATEDATE", createdate, Field.Store.YES,Field.Index.TOKENIZED));
- //对大字段建立索引
- BufferedReader in=null;
- String content="";
- CLOB clob = (CLOB) rs.getClob("CONTENT");
- if (clob != null) {
- //得到一个读入流
- in=new BufferedReader(clob.getCharacterStream());
- StringWriter out=new StringWriter();
- int c;
- while((c=in.read())!=-1){
- out.write(c);
- }
- content=out.toString();
- }
- doc.add(new Field("CONTENT", content, Field.Store.YES, Field.Index.TOKENIZED));
- writer.addDocument(doc);
- }
- writer.optimize();
- writer.close();
- //测试一下索引的时间
- long endTime = new Date().getTime();
- System.out.println("索引文件"+file.getPath()+"建立成功...");
- System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!");
- //判断文件目录file下的文件个数如果大于3,就将文件建立最早的文件给删除掉
- checkFiles(parentDir);
- } catch (IOException e) {
- e.printStackTrace();
- } catch (SQLException e) {
- e.printStackTrace();
- } catch (ClassNotFoundException e) {
- e.printStackTrace();
- }finally{
- try {
- if(rs!=null){
- rs.close();
- }
- if(stmt!=null){
- stmt.close();
- }
- if(conn!=null){
- conn.close();
- }
- } catch (SQLException e) {
- e.printStackTrace();
- }
- }
- }
- /**
- * 判断文件目录file下的文件个数如果大于3,就将文件建立最早的文件给删除掉
- */
- public void checkFiles(File dir) {
- int length=dir.listFiles().length;
- while(length>3){
- //删除生成最早的文件
- File [] files=dir.listFiles();
- String[] names=dir.list();
- Arrays.sort(names);
- File deletefile=files[0];
- deleteDirectory(deletefile);
- length--;
- }
- }
- /*
- * 递归删除一个目录以及下面的文件
- */
- public boolean deleteDirectory(File path) {
- if( path.exists() ) {
- File[] files = path.listFiles();
- for(int i=0; i<files.length; i++) {
- if(files.isDirectory()) {
- deleteDirectory(files);
- }
- else {
- //删除文件
- files.delete();
- }
- }
- }
- //删除目录
- boolean hasdelete=path.delete();
- if(hasdelete){
- System.out.println("删除索引目录"+path);
- }
- return hasdelete;
- }
- public static void main(String[] args) {
- new LuceneDBIndexerTask().run();
- }
- }
配置文件管理类:
- package zxt.lucene.index;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.Properties;
/**
 * Loads the application settings from a properties file on the classpath and
 * exposes them through a lazily created shared instance.
 *
 * <p>The file name can be chosen with {@link #setConfigFileName(String)} before
 * the first call to {@link #getInstance()}; otherwise
 * {@code directory.properties} is used.
 *
 * @author wulihai
 * @create 2009-06-02
 */
public class Config {
    // File loaded when no name has been set
    private static final String DEFAULT_CONFIG_FILE_NAME = "directory.properties";
    private static Config cfg = null;
    private static String configFileName = null;
    private Properties props;

    public Config() {
        props = new java.util.Properties();
    }

    /**
     * Single-instance access point; the settings are loaded on first use.
     *
     * @return the shared configuration, never {@code null}
     */
    public synchronized static Config getInstance() {
        if (cfg == null) {
            cfg = new Config();
            cfg.loadConfig();
        }
        return cfg;
    }

    /**
     * Loads the properties file from the classpath.
     *
     * @return 1 if the file was loaded, 0 if it was missing or unreadable
     */
    private int loadConfig() {
        String name = DEFAULT_CONFIG_FILE_NAME;
        if (configFileName != null && configFileName.length() > 0) {
            name = configFileName;
        }
        System.out.println("configFileName=" + name);
        InputStream inputStream = Config.class.getClassLoader().getResourceAsStream(name);
        if (inputStream == null) {
            // Resource not on the classpath: keep the settings empty instead of failing
            System.out.println("找不到配置文件: " + name);
            return 0;
        }
        try {
            props.load(inputStream);
            return 1;
        } catch (IOException e) {
            e.printStackTrace();
            return 0;
        } finally {
            try {
                inputStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Sets the classpath name of the properties file. Has no effect once the
     * shared instance has been created.
     *
     * @param cfg classpath-relative file name
     */
    public static void setConfigFileName(String cfg) {
        configFileName = cfg;
    }

    /**
     * Looks up a setting.
     *
     * @param keyName property key
     * @return the value, or {@code null} if the key is not defined
     */
    public String getProperty(String keyName) {
        return props.getProperty(keyName);
    }
}
常量配置
- package zxt.lucene.index;
- /**
- * 常量配置类 *
- * @author wulihai
- * @create 2009-06-02
- */
- public class Constant {
- // 隔多长时间建立一次索引
- public static final String CREATE_INDEX_SLEEP_TIME = Config.getInstance()
- .getProperty("create_index_sleep_time");
- // 索引文件存放路径
- public static final String INDEX_STORE_DIRECTORY = Config.getInstance()
- .getProperty("index_store_directory");
- //数据库驱动程序
- public static final String DB_DRIVER_STRING = Config.getInstance()
- .getProperty("db_driver_string");
- //数据库连接URI
- public static final String DB_URI_STRING = Config.getInstance()
- .getProperty("db_uri_string");
- //数据库连接username
- public static final String DB_USERNAME= Config.getInstance()
- .getProperty("db_username");
- //数据库连接pwd
- public static final String DB_PWD= Config.getInstance()
- .getProperty("db_pwd");
- //数据库查询语句db_query_str
- public static final String DB_QUERY_STRING= Config.getInstance()
- .getProperty("db_query_string");
- }
数据类型处理类:
- package zxt.lucene.index;
/**
 * Data type conversion helpers.
 *
 * @author wulihai
 * @create 2009-06-02
 */
public class DataTypeUtil {
    /**
     * Converts an object to a {@code long} by parsing its string form.
     *
     * <p>Surrounding whitespace is ignored, so a properties value with a
     * trailing blank still parses.
     *
     * @param o source object, must not be {@code null}
     * @return the parsed value, or {@code Long.MAX_VALUE} if the text is not a
     *         valid {@code long}
     * @throws IllegalArgumentException if {@code o} is {@code null}
     */
    public static long toLong(Object o) {
        if (o == null) {
            throw new IllegalArgumentException("该对象为空");
        }
        String s = o.toString();
        if (s == null) {
            return Long.MAX_VALUE;
        }
        try {
            return Long.parseLong(s.trim());
        } catch (NumberFormatException ex) {
            return Long.MAX_VALUE;
        }
    }
}
配置文件 :
- #== the directory for store lucene-index ========#
- index_store_directory=D:/lucene/indexDB/
- #======== two hours ========#
- #create_index_sleep_time=7200000
- #======== two minutes ========#
- create_index_sleep_time=120000
- db_driver_string=oracle.jdbc.driver.OracleDriver
- db_uri_string=jdbc:oracle:thin:@localhost:1521:lportal
- db_username=lportal
- db_pwd=lportal
- db_query_string=SELECT * from journalarticle
==================搜索类:===============
核心搜索类:
- package com.liferay.portal.util;
- import java.io.File;
- import java.io.IOException;
- import java.io.InputStream;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Date;
- import java.util.List;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.Hits;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import com.liferay.portlet.journal.model.JournalArticle;
- /**
- * 负责搜索的类
- */
- public class LuceneDBQuery {
- private static LuceneDBQuery search = new LuceneDBQuery();
- // 构造方法
- private LuceneDBQuery() {
- }
- /**
- * 单实例访问接口
- *
- * @return
- */
- public static LuceneDBQuery getInstance() {
- return search;
- }
- /**
- * 搜索方法
- *
- * @throws java.text.ParseException
- * @throws Exception
- */
- public List search(String queryString) {
- int count = 0;
- long startTime = new Date().getTime();
- Hits hits = null;
- // 搜索目录
- File searchDir = null;
- Query query = null;
- InputStream inputStream=null;;
- String filePath="index.xml";
- String indexDir="";
- indexDir= LuceneDBQueryUtil.getIndexPath();
- if (indexDir != null && !"".equals(indexDir)) {
- searchDir = new File(indexDir);
- if(!searchDir.exists()){
- searchDir.mkdir();
- }
- }
- // 这里注意索引存放的目录的父目录
- // searchDir=new File("E:\\index\\indexDB\\");
- File targetDir = getTargetDir(searchDir);
- IndexSearcher searcher = null;
- List results = new ArrayList();
- try {
- Directory dir=FSDirectory.getDirectory(targetDir,false);
- searcher = new IndexSearcher(dir);
- } catch (Exception e1) {
- e1.printStackTrace();
- System.out.println("创建索引对象出现异常...");
- }
- Analyzer analyzer = new StandardAnalyzer();
- // 构建查询对象Query,对CONTENT字段进行搜索
- QueryParser qp = new QueryParser("CONTENT", analyzer);
- try {
- query = qp.parse(queryString);
- } catch (ParseException e1) {
- e1.printStackTrace();
- }
- if (searcher != null) {
- // 得到搜索结果Hits
- try {
- hits = searcher.search(query);
- } catch (IOException e1) {
- System.out.println("查询索引库出现异常...");
- e1.printStackTrace();
- }
- // 查到的记录条数
- count = hits.length();
- if (hits.length() > 0) {
- for (int i = 0; i < hits.length(); i++) {// 输出搜索信息
- JournalArticle article = new JournalArticle();
- Document document = null;
- try {
- document = hits.doc(i);
- } catch (Exception e1) {
- System.out.println("返回查询结果集出现异常...");
- e1.printStackTrace();
- }
- try {
- article.setDisplayDate(new SimpleDateFormat("yyyyMMdd")
- .parse(document.get("CREATEDATE")));
- article.setCreateDate(new SimpleDateFormat("yyyyMMdd")
- .parse(document.get("CREATEDATE")));
- } catch (java.text.ParseException e) {
- e.printStackTrace();
- }
- article.setTitle(document.get("TITLE"));
- article.setArticleId(document.get("ARTICLEID"));
- article.setUserName(document.get("USERNAME"));
- article.setUserId(document.get("USERID"));
- results.add(article);
- }
- // 测试一下索引的时间
- long endTime = new Date().getTime();
- System.out.println("查询过程花费了" + (endTime - startTime) + " 毫秒!");
- } else {
- System.out.println("0个结果!");
- }
- }
- return results;
- }
- /**
- * 确定搜索索引所在目录目录
- */
- private File getTargetDir(File dir) {
- int length = dir.listFiles().length;
- File searchFile = null;
- // length=3的时候最多
- // 同时搜索和同时建索引的时候会出现length=4
- if (length >= 2) {
- // 找到次最新建立的索引文件
- String[] names = dir.list();
- Arrays.sort(names);
- searchFile = new File(dir + File.separator + names[length - 2]);
- }
- if (length == 1) {
- File files[] = dir.listFiles();
- searchFile = files[0];
- }
- if (length == 0) {
- // 如果没有索引文件则,建立第一个索引
- // TestDBIndexer.getInstance().isInstanceRunning();
- // search();
- }
- return searchFile;
- }
- //
- // public static void main(String[] args) throws Exception {
- // new LuceneDBQuery().search("纳税人");
- // }
- }
配置文件管理类:
- package com.liferay.portal.util;
- import java.io.IOException;
- import org.jdom.Document;
- import org.jdom.Element;
- import org.jdom.JDOMException;
- import org.jdom.input.SAXBuilder;
- public class LuceneDBQueryUtil {
- public static String getIndexPath(){
- String filePath = "zxt_index.xml";
- String indexPath="";
- SAXBuilder builder = new SAXBuilder(false);
- try {
- Document doc = builder.build(Thread.currentThread().getContextClassLoader().getResource(filePath));
- Element rootElement = doc.getRootElement();
- Element index=rootElement.getChild("index");
- indexPath=index.getText();
- System.out.println(indexPath);
- } catch (JDOMException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return indexPath;
- }
- }
配置文件:zxt_index.xml
- <?xml version="1.0" encoding="UTF-8"?>
- <list>
- <index>D:\\index\\IndexDB</index>
- </list>