导入clob很简单。但是blob好像没有提供方法,所以改了一下源码,重新编译替换class文件,竟然成功了。
先把配置文件贴上
SCHEMA.XML
<?xml version="1.0" ?>
<schema name="test" version="1.1">

  <types>
    <!-- Plain, un-analyzed string type. -->
    <fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

    <!-- Text type using the standard tokenizer, stop-word removal and lower-casing
         at both index time and query time. -->
    <fieldType name="standard" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Text type analyzed by the IK analyzer. -->
    <fieldType name="ik" class="solr.TextField">
      <analyzer class="org.wltea.analyzer.lucene.IKAnalyzer"/>
    </fieldType>
  </types>

  <fields>
    <!-- blogId is stored but not indexed; the remaining fields are indexed with the "ik" type. -->
    <field name="blogId" type="string" indexed="false" stored="true" multiValued="false"/>
    <field name="blogTitle" type="ik" indexed="true" stored="true" multiValued="false" />
    <field name="blogAuthorName" type="ik" indexed="true" stored="true" multiValued="false" />
    <field name="blogContent" type="ik" indexed="true" stored="true" multiValued="false" />
    <field name="TITLE" type="ik" indexed="true" stored="true" />
    <field name="TEXT" type="ik" indexed="true" stored="true" />
  </fields>

  <defaultSearchField>blogTitle</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>

</schema>
这里的field只用到了blogContent一个。
SOLRCONFIG.XML
<?xml version="1.0" encoding="UTF-8" ?>
<config>

  <luceneMatchVersion>LUCENE_34</luceneMatchVersion>

  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>

  <updateHandler class="solr.DirectUpdateHandler2" />

  <requestDispatcher handleSelect="true" >
    <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="2048" />
  </requestDispatcher>

  <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />
  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />

  <!-- the dataimport requestHandler -->
  <requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
    <lst name="defaults">
      <!-- Name of the DataImportHandler configuration file to load. -->
      <str name="config">db-data-config.xml</str>
    </lst>
  </requestHandler>

  <admin>
    <defaultQuery>solr</defaultQuery>
  </admin>

  <unlockOnStartup>true</unlockOnStartup>
  <lockType>simple</lockType>

  <requestHandler name="/analysis/field" startup="lazy" class="solr.FieldAnalysisRequestHandler" />

</config>
db-data-config.xml
<dataConfig>

  <!-- Named data source "f1"; no entity below refers to it by name. -->
  <dataSource name="f1" type="FieldStreamDataSource"/>

  <!-- Unnamed JDBC data source pointing at the Oracle instance. -->
  <dataSource driver="oracle.jdbc.driver.OracleDriver"
              url="jdbc:oracle:thin:@127.0.0.1:1521:orcl"
              user="HT"
              password="HT"/>

  <document>
    <!-- Rows of this entity are passed through ClobTransformer. -->
    <entity name="blog"
            query="SELECT BLOG_CONTENT from TB_ENT_BLOG"
            transformer="ClobTransformer">
      <!-- clob="true" is the flag ClobTransformer checks before converting a column to text. -->
      <field column="BLOG_CONTENT" name="blogContent" clob="true"/>
    </entity>
  </document>

</dataConfig>
然后修改了ClobTransformer.java,使其同时支持BLOB类型。
package org.apache.solr.handler.dataimport; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.sql.Blob; import java.sql.Clob; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import java.util.Map; public class ClobTransformer extends Transformer { public static final String CLOB = "clob"; public Object transformRow(Map<String, Object> aRow, Context context) { for (Map map : context.getAllEntityFields()) { if ("true".equals(map.get("clob"))) { String column = (String)map.get("column"); String srcCol = (String)map.get("sourceColName"); if (srcCol == null) srcCol = column; Object o = aRow.get(srcCol); if ((o instanceof List)) { List inputs = (List)o; List results = new ArrayList(); for (Object input : inputs) { if ((input instanceof Clob)) { Clob clob = (Clob)input; results.add(readFromClob(clob)); }else if(input instanceof Blob){ Blob blob = (Blob)input; results.add(readFromBlob(blob)); } } aRow.put(column, results); } else if ((o instanceof Clob)) { Clob clob = (Clob)o; aRow.put(column, readFromClob(clob)); }else if(o instanceof Blob){ Blob blob = (Blob)o; aRow.put(column, readFromBlob(blob)); } } } return aRow; } private String readFromBlob(Blob blob) { try{ InputStream is = blob.getBinaryStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is)); String str = ""; String res = ""; while((str=br.readLine())!=null){ res += str; } return res; }catch (Exception e) { e.printStackTrace(); return ""; } } private String readFromClob(Clob clob) { Reader reader = null; try { reader = clob.getCharacterStream(); } catch (SQLException e1) { e1.printStackTrace(); } StringBuilder sb = new StringBuilder(); char[] buf = new char[1024]; try { int len; while ((len = reader.read(buf)) != -1) sb.append(buf, 0, len); } catch (IOException e) { DataImportHandlerException.wrapAndThrow(500, e); } return sb.toString(); } }
这里加了一个readFromBlob方法,加了两个else if。异常的处理很粗糙。
这样替换class文件后,导入索引就正常了。在query "*:*" 页面的response中会出现所有blob内容。
如果response中没有blob字段,或者字段值显示为对象地址,都说明导入有问题。