  • Using SolrJ to Build an Index

    This example demonstrates two ways to build a full index:
    one pulls rows from a database with SQL,
    the other parses files with Tika.
    Both approaches are combined in the single example class below.

    package SolrJExample;

    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
    import org.apache.solr.client.solrj.impl.XMLResponseParser;
    import org.apache.solr.client.solrj.response.UpdateResponse;
    import org.apache.solr.common.SolrInputDocument;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.sax.BodyContentHandler;
    import org.xml.sax.ContentHandler;

    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.sql.*;
    import java.util.ArrayList;
    import java.util.Collection;

    /* Example class showing the skeleton of using Tika and
       SQL on the client to index documents from
       both structured documents and a SQL database.

       NOTE: The SQL example and the Tika example are entirely orthogonal.
       Both are included here to make a
       more interesting example, but you can omit either of them.
     */
    public class SqlTikaExample {
      private StreamingUpdateSolrServer _server;
      private long _start = System.currentTimeMillis();
      private AutoDetectParser _autoParser;
      private int _totalTika = 0;
      private int _totalSql = 0;

      private Collection<SolrInputDocument> _docs = new ArrayList<SolrInputDocument>();

      public static void main(String[] args) {
        try {
          SqlTikaExample idxer = new SqlTikaExample("http://localhost:8983/solr");

          idxer.doTikaDocuments(new File("/Users/Erick/testdocs"));
          idxer.doSqlDocuments();

          idxer.endIndexing();
        } catch (Exception e) {
          e.printStackTrace();
        }
      }

      private SqlTikaExample(String url) throws IOException, SolrServerException {
        // Create a multi-threaded communications channel to the Solr server.
        // Could be CommonsHttpSolrServer as well.
        _server = new StreamingUpdateSolrServer(url, 10, 4);

        _server.setSoTimeout(1000);  // socket read timeout
        _server.setConnectionTimeout(1000);
        _server.setMaxRetries(1); // defaults to 0.  > 1 not recommended.
        // The binary parser is used by default for responses.
        _server.setParser(new XMLResponseParser());

        // One of the ways Tika can be used to attempt to parse arbitrary files.
        _autoParser = new AutoDetectParser();
      }

      // Just a convenient place to wrap things up.
      private void endIndexing() throws IOException, SolrServerException {
        if (_docs.size() > 0) { // Are there any documents left over?
          _server.add(_docs, 300000); // Commit within 5 minutes
        }
        _server.commit(); // Only needs to be done at the end,
                          // commitWithin should do the rest.
                          // Could even be omitted
                          // assuming commitWithin was specified.
        long endTime = System.currentTimeMillis();
        log("Total Time Taken: " + (endTime - _start) +
            " milliseconds to index " + _totalSql +
            " SQL rows and " + _totalTika + " documents");
      }

      // I hate writing System.out.println() everyplace;
      // besides, this gives a central place to convert to true logging
      // in a production system.
      private static void log(String msg) {
        System.out.println(msg);
      }

      /**
       * *************************** Tika processing here
       */
      // Recursively traverse the filesystem, parsing everything found.
      private void doTikaDocuments(File root) throws IOException, SolrServerException {

        // Simple loop for recursively indexing all the files
        // in the root directory passed in.
        for (File file : root.listFiles()) {
          if (file.isDirectory()) {
            doTikaDocuments(file);
            continue;
          }
          // Get ready to parse the file.
          ContentHandler textHandler = new BodyContentHandler();
          Metadata metadata = new Metadata();
          ParseContext context = new ParseContext();

          InputStream input = new FileInputStream(file);

          // Try parsing the file. Note we haven't checked at all to
          // see whether this file is a good candidate.
          try {
            _autoParser.parse(input, textHandler, metadata, context);
          } catch (Exception e) {
            // Needs better logging of what went wrong in order to
            // track down "bad" documents.
            log(String.format("File %s failed", file.getCanonicalPath()));
            e.printStackTrace();
            continue;
          } finally {
            input.close(); // Release the file handle even if the parse failed.
          }
          // Just to show how much meta-data there is and what form it's in.
          dumpMetadata(file.getCanonicalPath(), metadata);

          // Index just a couple of the meta-data fields.
          SolrInputDocument doc = new SolrInputDocument();

          doc.addField("id", file.getCanonicalPath());

          // Crude way to get known meta-data fields.
          // Also possible to write a simple loop to examine all the
          // metadata returned and selectively index it and/or
          // just get a list of them.
          // One can also use the LucidWorks field mapping to
          // accomplish much the same thing.
          String author = metadata.get("Author");

          if (author != null) {
            doc.addField("author", author);
          }

          doc.addField("text", textHandler.toString());

          _docs.add(doc);
          ++_totalTika;

          // Completely arbitrary, just batch up more than one document
          // for throughput!
          if (_docs.size() >= 1000) {
            // Commit within 5 minutes.
            UpdateResponse resp = _server.add(_docs, 300000);
            if (resp.getStatus() != 0) {
              log("Some horrible error has occurred, status is: " +
                  resp.getStatus());
            }
            _docs.clear();
          }
        }
      }

      // Just to show all the metadata that's available.
      private void dumpMetadata(String fileName, Metadata metadata) {
        log("Dumping metadata for file: " + fileName);
        for (String name : metadata.names()) {
          log(name + ":" + metadata.get(name));
        }
        log("\n\n");
      }

      /**
       * *************************** SQL processing here
       */
      private void doSqlDocuments() throws SQLException {
        Connection con = null;
        try {
          Class.forName("com.mysql.jdbc.Driver").newInstance();
          log("Driver Loaded");

          con = DriverManager.getConnection("jdbc:mysql://192.168.1.103:3306/test?"
              + "user=testuser&password=test123");

          Statement st = con.createStatement();
          ResultSet rs = st.executeQuery("select id,title,text from test");

          while (rs.next()) {
            // DO NOT move this outside the while loop,
            // or be sure to call doc.clear().
            SolrInputDocument doc = new SolrInputDocument();
            String id = rs.getString("id");
            String title = rs.getString("title");
            String text = rs.getString("text");

            doc.addField("id", id);
            doc.addField("title", title);
            doc.addField("text", text);

            _docs.add(doc);
            ++_totalSql;

            // Completely arbitrary, just batch up more than one
            // document for throughput!
            if (_docs.size() > 1000) {
              // Commit within 5 minutes.
              UpdateResponse resp = _server.add(_docs, 300000);
              if (resp.getStatus() != 0) {
                log("Some horrible error has occurred, status is: " +
                    resp.getStatus());
              }
              _docs.clear();
            }
          }
        } catch (Exception ex) {
          ex.printStackTrace();
        } finally {
          if (con != null) {
            con.close();
          }
        }
      }
    }
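
    After endIndexing() has committed, a quick way to sanity-check the result is to query the index back with the same SolrJ library. The sketch below is not part of the original example; it assumes the same SolrJ 3.x-era API (CommonsHttpSolrServer) and the field names used above (id, title, author), so adjust the URL, query, and field list to your own schema.

    package SolrJExample;

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServer;
    import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;
    import org.apache.solr.common.SolrDocument;

    /* Minimal sketch: verify the freshly built index by searching it.
       Assumes the SolrJ 3.x API used above; the query string, row count,
       and field list are illustrative only. */
    public class QueryCheckExample {
      public static void main(String[] args) throws Exception {
        SolrServer server = new CommonsHttpSolrServer("http://localhost:8983/solr");

        SolrQuery query = new SolrQuery("*:*");   // match every indexed document
        query.setRows(5);                         // just a handful of hits for eyeballing
        query.setFields("id", "title", "author"); // fields indexed above (must be stored)

        QueryResponse rsp = server.query(query);
        System.out.println("Total hits: " + rsp.getResults().getNumFound());
        for (SolrDocument doc : rsp.getResults()) {
          System.out.println(doc.getFieldValue("id") + " / " + doc.getFieldValue("title"));
        }
      }
    }

    If the hit count matches the "SQL rows and documents" totals logged by endIndexing(), the full index was built as expected.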
  • Original post: https://www.cnblogs.com/SuperBing/p/2882820.html