zoukankan      html  css  js  c++  java
  • Solr DIH dataconfig配置

    1.

    配置文件data-config.xml定义了数据库的基本配置,以及导出数据的映射规则,即导出数据库表中对应哪些字段的值,以及对特定字段的值做如何处理

    <!--
      DIH configuration that imports rows of the MySQL table marketing_data.
      * full-import reads the id window that starts at the "offset" request
        parameter (${dataimporter.request.offset}) and spans 1000000 ids.
      * delta-import picks up rows whose updated_at is later than the last
        index time.
    -->
    <dataConfig>
        <dataSource name="jdbc" driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://172.0.8.249:5606/marketing_db_saved?zeroDateTimeBehavior=convertToNull"
            user="developer" password="sedept@shiyanjun.cn" />
        <document name="mkt_data">
            <!--
              deltaQuery only has to return the primary key of each changed row;
              deltaImportQuery then loads that row by key. It is declared
              explicitly because, when it is missing, DIH builds the statement
              from "query", and "query" depends on the offset request parameter
              that a delta-import request does not carry.
            -->
            <entity name="marketing_data" pk="id"
                query="select * from marketing_data where id between ${dataimporter.request.offset} and ${dataimporter.request.offset}+1000000"
                deltaQuery="select id from marketing_data where updated_at > '${dih.last_index_time}'"
                deltaImportQuery="select * from marketing_data where id = '${dih.delta.id}'"
                transformer="RegexTransformer">
                <field column="id" name="id" />
                <field column="domain" name="domain" />
                <field column="alex_rank" name="alex_rank" />
                <field column="server_port" name="server_port" />
                <field column="cert_validity_notBefore" name="cert_validity_notBefore" />
                <field column="cert_validity_notAfter" />
                <!--
                  Derived column: captures everything before the first run of
                  whitespace in cert_validity_notAfter. The whitespace class must
                  be written as \s; a bare "s" would match the letter s instead.
                -->
                <field column="cert_validity_notAfter_yyyyMMdd" regex="(.*?)\s+.*"
                    name="cert_validity_notAfter_yyyyMMdd" sourceColName="cert_validity_notAfter" />
                <field column="cert_issuer_brand" name="cert_issuer_brand" />
                <field column="cert_validation" name="cert_validation" />
                <field column="cert_isMultiDomain" name="cert_isMultiDomain" />
                <field column="cert_issuer_brand_isXRelated" name="cert_issuer_brand_isXRelated" />
                <field column="cert_isWildcard" name="cert_isWildcard" />
                <field column="cert_notAfter" name="cert_notAfter" />
                <field column="special_ssl" name="special_ssl" />
                <field column="competitor_logo" name="competitor_logo" />
                <field column="segment" name="segment" />
            </entity>
        </document>
    </dataConfig>


    Solr的DIH暴露了请求中传递的变量 ${dataimporter.request.offset},也就是在请求的requestHandler中可以附带附加属性条件,例如,下面请求URL中的offset=5000000参数:

    http://172.0.8.212:8080/seaarch-server/core0/dataimport?command=full-import&offset=5000000

    另外,还有一个参数是很重要的,它决定着是否清除已经存在的索引数据,默认为clean=true,如果不想删除以前的索引数据,一定要在请求的URL中指定该属性为false,请求URL如下:

    http://172.0.8.212:8080/seaarch-server/core0/dataimport?command=full-import&offset=5000000&clean=false
    

    另外,索引完成后一般需要执行commit操作,将内存中索引数据持久化到文件系统,防止改变丢失,所以需要在请求的URL中增加commit=true,例如:

    http://172.0.8.212:8080/seaarch-server/core0/dataimport?command=full-import&offset=5000000&clean=false&commit=true

    2.

    <!--
      DIH configuration that indexes XML files found on disk.
      The outer entity lists files in /usr/local/extracts whose names match
      legacy_stories.*.xml$ and that are newer than the last index time; the
      inner entity parses each listed file, one document per /RECORDS/RECORD.
    -->
    <dataConfig>
        <dataSource type="FileDataSource" encoding="UTF-8" />
        <!-- Entities must live inside <document>; the opening tag was missing. -->
        <document>
            <!-- rootEntity="false": the rows of the nested entity become the Solr documents. -->
            <entity name="xml_stories" rootEntity="false" dataSource="null"
                processor="FileListEntityProcessor" fileName="legacy_stories.*.xml$"
                recursive="false" baseDir="/usr/local/extracts"
                newerThan="${dataimporter.xml_stories.last_index_time}">
                <!-- onError="continue": a failing record does not abort the import. -->
                <entity name="stories" pk="id" dataSource="xml_stories"
                    processor="XPathEntityProcessor" url="${xml_stories.fileAbsolutePath}"
                    forEach="/RECORDS/RECORD" stream="true"
                    transformer="DateFormatTransformer,HTMLStripTransformer,RegexTransformer,TemplateTransformer"
                    onError="continue">
                    <!--
                      Columns prefixed with "_" hold the raw text read via XPath;
                      the unprefixed columns are parsed from them with dateTimeFormat.
                      NOTE(review): the patterns use "hh" (12-hour clock in
                      java.text.SimpleDateFormat). If the source timestamps are
                      24-hour values, "HH" is probably intended; confirm against
                      the data before changing.
                    -->
                    <field column="_modified_date"
                        xpath="/RECORDS/RECORD/PROP[@NAME='R_ModifiedTime']/PVAL" />
                    <field column="modified_date" sourceColName="_modified_date"
                        dateTimeFormat="yyyy-MM-dd'T'hh:mm:ss'Z'" />

                    <field column="_df_date_published"
                        xpath="/RECORDS/RECORD/PROP[@NAME='R_StoryDate']/PVAL" />
                    <field column="df_date_published" sourceColName="_df_date_published"
                        dateTimeFormat="yyyy-MM-dd'T'hh:mm:ss'Z'" />

                    <field column="sort_date_modified" sourceColName="modified_date"
                        dateTimeFormat="yyyyMMddhhmmss" />
                    <field column="sort_date_published" sourceColName="df_date_published"
                        dateTimeFormat="yyyyMMddhhmmss" />
                </entity>
            </entity>
        </document>
    </dataConfig>

    3.

    数据源:

    <?xml version="1.0" encoding="utf-8"?>
    <!--
      Sample input file: a urlset holding one url record. Each record has a
      loc (the deal page address) and a data/display group with the deal details.
      NOTE(review): startTime, endTime and the spend_* values look like Unix
      epoch seconds; confirm with the feed provider.
    -->
    <urlset>
      <url>
        <loc>http://nn.meituan.com/deal/527742.html?source=hao123</loc>
        <data>
          <display>
            <website>美团网</website>
            <siteurl>http://nn.meituan.com</siteurl>
            <city>南宁</city>
            <sort>餐饮美食</sort>
            <title>【朝阳】比巴卜自助烤涮餐厅单人自助晚餐1次,无需预约,节假日通用</title>
            <image>http://p0.meituan.net/275.168/deal/201211/19/1111_1119205421.jpg</image>
            <startTime>1353513600</startTime>
            <endTime>1414476000</endTime>
            <value>59</value>
            <price>50.00</price>
            <rebate>8.5折</rebate>
            <bought>42573</bought>
            <spend_start_time>1353513600</spend_start_time>
            <spend_close_time>1414511999</spend_close_time>
            <longitude>108.321188</longitude>
            <latitude>22.816958</latitude>
            <collections>0</collections>
            <type>1</type>
            <soldout>no</soldout>
          </display>
        </data>
      </url>
    </urlset>
    View Code

    配置:

    <!--
      DIH configuration that indexes the deal feed file
      D:/solr/source_data/meituan_hao123.xml, one document per /urlset/url.
      Each row passes through the ReplaceLocAddId script first and then
      through DateFormatTransformer.
    -->
    <dataConfig>
        <script><![CDATA[
            // Row transformer: derives the document id from the deal URL and
            // rewrites startTime, endTime and rebate in place.
            function ReplaceLocAddId(row) {
                // The id is the text between '/deal/' and '.html' in loc.
                var afterDeal = row.get('loc').split('/deal/')[1];
                var dealId = afterDeal.split('.html')[0];
                row.put('id', dealId);

                // Start time, converted by the project helper DateUtils.parseMT.
                var startRaw = row.get('startTime');
                row.put('startTime', com.sitech.util.DateUtils.parseMT(startRaw, null));

                // End time, converted the same way.
                var endRaw = row.get('endTime');
                row.put('endTime', com.sitech.util.DateUtils.parseMT(endRaw, null));

                // Drop the Chinese discount character from rebate.
                var rebateText = row.get('rebate');
                row.put('rebate', rebateText.replace('折',''));

                return row;
            }
        ]]></script>
        <dataSource type="FileDataSource" encoding="utf-8" />
        <document>
            <entity name="tuan" pk="loc"
                processor="XPathEntityProcessor"
                url="D:/solr/source_data/meituan_hao123.xml"
                forEach="/urlset/url"
                transformer="script:ReplaceLocAddId,DateFormatTransformer">
                <!-- One field per value taken from the record. -->
                <field column="loc" xpath="/urlset/url/loc" commonField="true" />
                <field column="city" xpath="/urlset/url/data/display/city" commonField="true" />
                <field column="sort" xpath="/urlset/url/data/display/sort" commonField="true" />
                <field column="title" xpath="/urlset/url/data/display/title" commonField="true" />
                <field column="image" xpath="/urlset/url/data/display/image" commonField="true" />
                <field column="value" xpath="/urlset/url/data/display/value" commonField="true" />
                <field column="price" xpath="/urlset/url/data/display/price" commonField="true" />
                <field column="rebate" xpath="/urlset/url/data/display/rebate" commonField="true" />
                <field column="bought" xpath="/urlset/url/data/display/bought" commonField="true" />
                <!-- The two time columns also declare a dateTimeFormat. -->
                <field column="startTime" xpath="/urlset/url/data/display/startTime"
                    dateTimeFormat="yyyy-MM-dd HH:mm:ss" commonField="true" />
                <field column="endTime" xpath="/urlset/url/data/display/endTime"
                    dateTimeFormat="yyyy-MM-dd HH:mm:ss" commonField="true" />
            </entity>
        </document>
    </dataConfig>

     4、从oracle抽取数据建立索引

    <!--
      DIH configuration that indexes TM_DETAILS rows (type=2) from Oracle.
      For every parent row three nested entities look up the template name,
      the creating user's name and the document title.
    -->
    <dataConfig>
        <dataSource name="jdbc" driver="oracle.jdbc.driver.OracleDriver"
            url="jdbc:oracle:thin:@127.0.0.1:1522:ORCLLI"
            user="root" password="root"/>
        <document>
            <entity name="tm_details"
                query="select t.docid as id,t.tempid,t.cruser as userid,t.crtime,t.A_GZMC||t.A_XXMS||t.A_ZZPZZY as content from TM_DETAILS t  where t.type=2 "
                transformer="ClobTransformer,HTMLStripTransformer,RegexTransformer,DateFormatTransformer">
                <field column="ID" name="id" />
                <field column="TEMPID" name="tempid" />
                <entity name="template"
                    query="select te.name from kmstemplate  te where te.id=${tm_details.TEMPID}">
                    <field column="NAME" name="template"/>
                </entity>
                <entity name="user"
                    query="select msg.name  from tb_sys_loginmsg msg where msg.login_id='${tm_details.USERID}'">
                    <field column="NAME" name="cruser"/>
                </entity>
                <field column="CRTIME" name="crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>
                <!--
                  regexp_replace removes only a trailing ".htm" suffix.
                  rtrim(d.doctitle,'.htm') treated '.htm' as a set of characters
                  and kept stripping any trailing '.', 'h', 't' or 'm', so a
                  title such as "math.htm" was cut down to "ma".
                -->
                <entity name="doc"
                    query="select regexp_replace(d.doctitle,'\.htm$','') as title  from kmsdocument d where d.docid=${tm_details.ID}">
                    <field column="TITLE" name="title" clob="true"/>
                </entity>
                <!--
                  Removes tabs, carriage returns and line feeds from the content.
                  The pattern uses the escapes \r and \n on a single line: literal
                  line breaks inside an attribute value are turned into spaces by
                  the XML parser, which would make the pattern delete every space.
                -->
                <field column="CONTENT" name="content" clob="true" stripHTML="true"
                    regex="\t|\r|\n" replaceWith=""/>
            </entity>
        </document>
    </dataConfig>
    <!--
      DIH configuration that indexes FAQ entries from Oracle: the parent query
      joins TM_DETAILS_LIST_FAQ_WT with TM_DETAILS_LIST_FAQ_DA and builds a
      composite id out of docid, eid, ordernum and numgroup.
    -->
    <dataConfig>
        <dataSource
            name="jdbc"
            driver="oracle.jdbc.driver.OracleDriver"
            url="jdbc:oracle:thin:@172.21.144.200:1522:ORCLLI"
            user="kms_user_js"
            password="kms_user_js"/>
        <document>
            <entity
                name="taocan"
                transformer="ClobTransformer,DateFormatTransformer"
                query="select t.docid||'e'||t.eid||'o'||t.ordernum||'n'||t.numgroup as id ,t.tempid,t.cruser as userid,t.crtime,t.faq_wt2 as title,da.faq_da2 as content  from TM_DETAILS_LIST_FAQ_WT t  join  TM_DETAILS_LIST_FAQ_DA da on  t.docid=da.docid and t.ordernum=da.ordernum and t.numgroup=da.numgroup and t.eid=da.eid">
                <field column="ID" name="id" />
                <!-- Template name looked up from the parent row's TEMPID. -->
                <entity
                    name="template"
                    query="select te.name from kmstemplate  te where te.id=${taocan.TEMPID}">
                    <field column="NAME" name="template"/>
                </entity>
                <!-- Creator name looked up from the parent row's USERID. -->
                <entity
                    name="user"
                    query="select msg.name  from tb_sys_loginmsg msg where msg.login_id='${taocan.USERID}'">
                    <field column="NAME" name="cruser"/>
                </entity>
                <field column="CRTIME" name="crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>
                <!-- clob="true" marks the columns handled by ClobTransformer. -->
                <field column="TITLE" name="title" clob="true"/>
                <field column="CONTENT" name="content" clob="true"/>
            </entity>
        </document>
    </dataConfig>

     5、mysql 和文件集成

    <!--
      DIH configuration that combines a database with XML files: the MySQL
      table gx_kmsindex supplies, per row, the path of an XML file, and the
      nested entity parses that file (one document per /datas/data record).
    -->
    <dataConfig>
        <dataSource
            name="jdbc"
            type="JdbcDataSource"
            driver="com.mysql.jdbc.Driver"
            batchSize="-1"
            url="jdbc:mysql://127.0.0.1:3306/test?characterEncoding=UTF-8"
            user="root"
            password="root"/>
        <dataSource name="file" type="FileDataSource" encoding="utf-8" />
        <document>
            <!--
              query            : rows read by full-import.
              deltaQuery       : ids of rows of type add/update changed since the last index time.
              deletedPkQuery   : ids of rows of type delete changed since the last index time.
              deltaImportQuery : file path for one id returned by deltaQuery.
            -->
            <entity
                name="kms"
                pk="id"
                dataSource="jdbc"
                transformer="DateFormatTransformer"
                query="SELECT id,docid,path  FROM gx_kmsindex ORDER BY TIME ASC"
                deltaQuery="select id from gx_kmsindex   where time>'${dih.last_index_time}' and type in ('add','update') ORDER BY TIME ASC"
                deletedPkQuery="select docid as id   from gx_kmsindex where type='delete' and time>'${dih.last_index_time}'  ORDER BY TIME ASC"
                deltaImportQuery="select path from gx_kmsindex where id='${dih.delta.id}' ORDER BY TIME ASC">
                <!-- Reads the file named by the parent row's path column. -->
                <entity
                    name="xml"
                    dataSource="file"
                    processor="XPathEntityProcessor"
                    url="${kms.path}"
                    forEach="/datas/data/"
                    transformer="DateFormatTransformer">
                    <field column="id" xpath="/datas/data/id" />
                    <field column="title" xpath="/datas/data/title" />
                    <field column="content" xpath="/datas/data/content" />
                    <field column="crtime" xpath="/datas/data/crtime" dateTimeFormat="yyyy-MM-dd HH:mm:ss"/>
                    <field column="templateid" xpath="/datas/data/templateid" />
                    <field column="price" xpath="/datas/data/price" />
                </entity>
            </entity>
        </document>
    </dataConfig>

     
  • 相关阅读:
    Arrays.asList的使用
    php之sql语句 创建数据库、表、插入字段,自动判断是否成功
    初识 canvas 绘图
    自定义音频audio播放器
    我的晨练
    js获取屏幕或可视范围
    js 查看脚本运行时间的办法
    a:hover伪类在ios移动端浏览器内触发无法取消
    js模拟用户触摸事件
    持续健身带来的变化
  • 原文地址:https://www.cnblogs.com/a198720/p/4022441.html
Copyright © 2011-2022 走看看