zoukankan      html  css  js  c++  java
  • lucene 3.0.2 + 多文件夹微博数据(时间,微博)构建索引

    package lia.meetlucene;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.LinkedList;
    
    import javax.xml.parsers.DocumentBuilder;
    import javax.xml.parsers.DocumentBuilderFactory;
    
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.CorruptIndexException;
    //import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;
    import org.w3c.dom.Document;
    import org.w3c.dom.Element;
    import org.w3c.dom.NodeList;
    
    /**
     * Builds a Lucene index from a directory tree of XML files.
     *
     * <p>Every XML file found below the data directory is parsed into a DOM tree. The tree is
     * walked depth-first, and each time both a {@code timestamp} element and a {@code text}
     * element have been seen, one Lucene document is written with two fields:
     * <ul>
     *   <li>{@code context} - the text content of the {@code text} element, stored and analyzed;</li>
     *   <li>{@code time} - the text content of the {@code timestamp} element, stored verbatim
     *       (not analyzed).</li>
     * </ul>
     *
     * <p>The class keeps the record being assembled in static fields, so it is not safe to use
     * from more than one thread at a time.
     */
    public class Unicode1 {

        /** Name of the XML element whose text content becomes the {@code time} field. */
        private static final String TIMESTAMP_TAG = "timestamp";

        /** Name of the XML element whose text content becomes the {@code context} field. */
        private static final String TEXT_TAG = "text";

        /** True once a timestamp has been read for the record currently being assembled. */
        static boolean numTime = false;
        /** True once a text body has been read for the record currently being assembled. */
        static boolean numText = false;
        /** Timestamp of the record currently being assembled, or {@code null} if none. */
        static String timeTmp = null;
        /** Text body of the record currently being assembled, or {@code null} if none. */
        static String textTmp = null;

        /**
         * Writes the pending record ({@link #textTmp}, {@link #timeTmp}) to the index as one
         * document.
         *
         * <p>Both static fields are expected to be non-null when this is called; {@link #Dfs}
         * only calls it after setting both. NOTE(review): Lucene's {@code Field} constructor
         * presumably rejects a null value - confirm before calling this directly.
         *
         * @param writer the index writer that receives the document
         * @throws CorruptIndexException if the index is corrupt
         * @throws IOException if the document cannot be written
         */
        static void indexer(IndexWriter writer) throws CorruptIndexException, IOException
        {
            org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
            doc.add(new Field("context", textTmp, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("time", timeTmp, Field.Store.YES, Field.Index.NOT_ANALYZED));
            writer.addDocument(doc);
        }

        /**
         * Walks the given nodes and all of their descendants depth-first, indexing one document
         * each time both a {@code timestamp} and a {@code text} element have been collected.
         *
         * <p>A failure to write a document is reported on standard error and the record is
         * dropped; the walk then continues with the remaining nodes.
         *
         * @param nodecur the nodes to visit; {@code null} is treated as an empty list
         * @param writer the index writer that receives completed records
         */
        static void Dfs(NodeList nodecur, IndexWriter writer) {
            if (nodecur == null) {
                return;
            }
            for (int j = 0; j < nodecur.getLength(); j++) {
                org.w3c.dom.Node node = nodecur.item(j);
                String nodeName = node.getNodeName();

                if (TIMESTAMP_TAG.equals(nodeName)) {
                    timeTmp = node.getTextContent();
                    numTime = true;
                } else if (TEXT_TAG.equals(nodeName)) {
                    textTmp = node.getTextContent();
                    numText = true;
                }

                if (numText && numTime) {
                    try {
                        indexer(writer);
                    } catch (IOException e) {
                        // CorruptIndexException is an IOException, so this covers both.
                        System.err.println("Failed to index record with time " + timeTmp);
                        e.printStackTrace();
                    } finally {
                        // Always clear the flags: leaving them set after a failure would make
                        // every following node retry the same record.
                        numText = false;
                        numTime = false;
                    }
                }

                Dfs(node.getChildNodes(), writer);
            }
        }

        /** Discards any half-assembled record so it cannot leak into the next file. */
        private static void resetPending() {
            numTime = false;
            numText = false;
            timeTmp = null;
            textTmp = null;
        }

        /**
         * Lists the entries of a directory, never returning {@code null}.
         *
         * @param directory the directory to list
         * @return the entries, or an empty array (after a message on standard error) when the
         *         directory cannot be listed
         */
        private static File[] listEntries(File directory) {
            File[] entries = directory.listFiles();
            if (entries == null) {
                System.err.println("Cannot list directory: " + directory.getAbsolutePath());
                return new File[0];
            }
            return entries;
        }

        /**
         * Indexes every file found in the sub-folders of the hard-coded data directory into the
         * hard-coded index directory, replacing any index that already exists there.
         *
         * <p>Files placed directly in the data directory are ignored; only its sub-folders
         * (and folders nested inside them) are scanned. A file that cannot be parsed or indexed
         * is reported and skipped. After each folder the elapsed milliseconds and the number of
         * files processed so far are printed.
         *
         * @param args unused
         * @throws IOException if the data directory is missing or the index cannot be opened
         *         or closed
         */
        public static void main(String[] args) throws IOException {

            long a = System.currentTimeMillis();

            File dataDir = new File("E:/xdj/tengxun");
            String indexDir = "E:/xdj/tengxunsuoying";

            // Fail before the writer is opened: it is created with create=true, which would
            // otherwise wipe an existing index even though there is nothing to index.
            if (!dataDir.isDirectory()) {
                throw new IOException("Data directory not found: " + dataDir.getAbsolutePath());
            }

            Directory dir = FSDirectory.open(new File(indexDir));
            try {
                IndexWriter writer = new IndexWriter(dir,
                        new SmartChineseAnalyzer(Version.LUCENE_20),
                        true,
                        IndexWriter.MaxFieldLength.UNLIMITED);
                try {
                    // The factory lookup is the same for every file, so do it once.
                    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

                    // Queue of folders still to be scanned, seeded with the first-level folders.
                    LinkedList<File> list = new LinkedList<File>();
                    for (File entry : listEntries(dataDir)) {
                        if (entry.isDirectory()) {
                            list.add(entry);
                        }
                    }

                    int num = 0;
                    while (!list.isEmpty()) {
                        File tmp = list.removeFirst();
                        for (File entry : listEntries(tmp)) {
                            if (entry.isDirectory()) {
                                // Scan nested folders later instead of trying to parse them.
                                list.add(entry);
                                continue;
                            }
                            System.out.println(entry.getAbsolutePath());

                            try {
                                // A record left incomplete by the previous file must not be
                                // completed with data from this one.
                                resetPending();

                                // Parse the XML file into a DOM document.
                                DocumentBuilder db = dbf.newDocumentBuilder();
                                Document dt = db.parse(entry);
                                Element element = dt.getDocumentElement();
                                System.out.println("根元素:" + element.getNodeName());

                                // Index everything below the root element.
                                Dfs(element.getChildNodes(), writer);

                                num++;
                            } catch (Exception e) {
                                // Best effort: one unreadable file must not stop the whole run.
                                System.err.println("Failed to index " + entry.getAbsolutePath());
                                e.printStackTrace();
                            }
                        }

                        System.out.println(System.currentTimeMillis() - a + "    " + num);
                    }
                } finally {
                    // Commit buffered documents and release the write lock even on failure.
                    writer.close();
                }
            } finally {
                dir.close();
            }
        }
    }
    View Code
  • 相关阅读:
    [USACO06NOV]Corn Fields(状压DP)
    关灯问题II (状态压缩 BFS)
    天梯---至多删三个字符(DP)
    天梯
    蓝桥
    天梯
    天梯
    天梯
    天梯
    蓝桥
  • 原文地址:https://www.cnblogs.com/XDJjy/p/4437539.html
Copyright © 2011-2022 走看看