zoukankan      html  css  js  c++  java
  • solr 自聚类实现

      参考官网:https://lucene.apache.org/solr/guide/6_6/result-clustering.html

      最近用到了solr的自聚类功能,先简单介绍如下:

      1、配置文件

        主要配置文件必须配置如下内容:

    <!-- Load the clustering contrib's dependency jars and the solr-clustering jar itself.
         The regex dots are escaped so they match a literal ".", and \d matches the first
         digit of the version, e.g. solr-clustering-6.6.0.jar. Without the backslashes the
         second pattern requires a literal "d" and never matches the versioned jar. -->
    <lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
    <lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />

        

    <searchComponent name="clustering" enable="${solr.clustering.enabled:true}" class="solr.clustering.ClusteringComponent">
        <!-- Search component named "clustering". It is enabled unless the
             solr.clustering.enabled property overrides the default of true.
             Three engines are declared below (lingo, stc, kmeans); each one names its
             Carrot2 algorithm class and uses clustering/carrot2 as its resources directory. -->

        <!-- Engine "lingo": Lingo clustering algorithm. -->
        <lst name="engine">
          <str name="name">lingo</str>
          <!--<bool name="optional">true</bool>-->
          <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
          <str name="carrot.resourcesDir">clustering/carrot2</str>
        </lst>
    
        <!-- Engine "stc": an example definition for the STC clustering algorithm.
             NOTE(review): "optional" presumably lets Solr continue when this engine
             cannot be loaded; confirm against the Solr reference guide. -->
        <lst name="engine">
          <str name="name">stc</str>
          <bool name="optional">true</bool>
          <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
          <str name="carrot.resourcesDir">clustering/carrot2</str>
        </lst>
    
        <!-- Engine "kmeans": bisecting k-means clustering algorithm. -->
        <lst name="engine">
          <str name="name">kmeans</str>
          <!--<bool name="optional">true</bool>-->
          <str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str>
          <str name="carrot.resourcesDir">clustering/carrot2</str>
        </lst>
      </searchComponent>

        下面的配置文件根据自己的实际情况进行修改:

     <requestHandler name="/clustering"
                      startup="lazy"
                      class="solr.SearchHandler">
        <!-- Search handler exposed at /clustering. The defaults below switch clustering on,
             map schema fields to the carrot.* parameters, and configure an edismax query.
             The "clustering" component is appended through last-components at the end. -->
        <lst name="defaults">
          <!-- Switch clustering on for requests to this handler. -->
          <bool name="clustering">true</bool>
          <bool name="clustering.results">true</bool>
    
          <!-- Field name with the logical "title" of each document (optional); here: keyword -->
          <str name="carrot.title">keyword</str>
          <!-- Field supplied as carrot.url for each document; here: id -->
          <str name="carrot.url">id</str>
          <!-- Field name with the logical "content" of each document (optional); here: summary -->
          <str name="carrot.snippet">summary</str>
          <!-- Apply highlighter to the title/content and use this for clustering. -->
          <bool name="carrot.produceSummary">true</bool>
          <!-- The maximum number of labels per cluster (currently commented out). -->
          <!--<int name="carrot.numDescriptions">5</int>-->
          <!-- Sub-cluster output is switched off. -->
          <bool name="carrot.outputSubClusters">false</bool>
    
          <!-- Configure any other request handler parameters. We will cluster the
             top 100 search results so bump up the 'rows' parameter. -->
          <!--<str name="defType">edismax</str>
          <str name="qf">
            text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
          </str>
          <str name="q.alt">*:*</str>-->
          <str name="defType">edismax</str>
          <!--<str name="qf">
            summary^0.5 category^1.2  id^10.0
          </str>-->
          <!-- Query fields with their boosts: keyword, title and id. -->
          <str name="qf">keyword^0.5 title^1.2  id^10.0</str>
          <str name="rows">100</str>
          <str name="fl">*,score</str>
        </lst>
    
        <!-- Append clustering at the end of the list of search components. -->
        <arr name="last-components">
          <str>clustering</str>
        </arr>
      </requestHandler>

        managed-schema配置文件包含以下内容:

       

     <!-- Text field type "text_ik": uses org.wltea.analyzer.lucene.IKAnalyzer for both
          index-time and query-time analysis. The analyzer class must be on Solr's classpath. -->
     <fieldType name="text_ik" class="solr.TextField">
        <analyzer type="index" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
        <analyzer type="query" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
      </fieldType>
      <!-- Required, single-valued string id. -->
      <field name="id" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
      <!-- The only field declared with termVectors enabled. -->
      <field name="text" type="text_ik" multiValued="false" indexed="true" stored="true" termVectors ="true"/>
      <!-- Single-valued, indexed and stored text_ik fields. -->
      <field name="title" type="text_ik" multiValued="false" indexed="true" stored="true" />
      <field name="snippet" type="text_ik" multiValued="false" indexed="true" stored="true" />
      <field name="keyword" type="text_ik" multiValued="false" indexed="true" stored="true" />
      <field name="category" type="text_ik" multiValued="false" indexed="true" stored="true" />
      <field name="summary" type="text_ik" multiValued="false" indexed="true" stored="true"/>
      <!-- Plain string field, not analyzed by text_ik. -->
      <field name="path" type="string" multiValued="false" indexed="true" stored="true"/>

        注意:text_ik对应的分词组件,要引用对应的jar包,具体参见:http://www.cnblogs.com/shaosks/p/8204615.html

      2、测试索引的文件

        启动solr服务,在浏览器输入:http://localhost:8983/solr/mycore/clustering?q=*:*&rows=10

        结果如下:

        

      3、java查询代码

    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.client.solrj.response.Cluster;
    import org.apache.solr.client.solrj.response.QueryResponse;
    import org.apache.solr.client.solrj.response.ClusteringResponse;
    import org.apache.solr.common.SolrDocument;
    
    import java.io.IOException;
    import java.util.List;
    
    /**
     * Minimal SolrJ example that queries the {@code /clustering} request handler and
     * prints every cluster's labels followed by the ids of the documents in that cluster.
     *
     * <p>Created 2018/1/18 09:41.
     *
     * @author sks
     */
    public class AutoCluster {

        /** Shared SolrJ client; assigned by {@link #Init(String)} and closed by {@code main}. */
        private static SolrClient solr;

        /**
         * Initializes the Solr client.
         *
         * @param urlString base URL of the Solr core, e.g. {@code http://localhost:8983/solr/mycore}
         */
        public static void Init(String urlString){
            solr = new HttpSolrClient.Builder(urlString).build();
        }

        /**
         * Connects to the local {@code mycore} core, prints the clustering result and
         * releases the client.
         *
         * @param args unused
         * @throws SolrServerException if Solr reports an error while handling the query
         * @throws IOException if communication with Solr fails or the client cannot be closed
         */
        public static void main(String[] args) throws SolrServerException, IOException {
            String urlString = "http://localhost:8983/solr/mycore";

            Init(urlString);
            try {
                getAutoClusterInfo();
            } finally {
                // Close the client explicitly rather than relying on System.exit to
                // tear down its HTTP connections.
                solr.close();
            }
        }

        /**
         * Runs a match-all query against the {@code /clustering} handler and prints each
         * cluster: its labels first, then the indented ids of its documents.
         *
         * @throws SolrServerException if Solr reports an error while handling the query
         * @throws IOException if communication with Solr fails
         */
        private static void getAutoClusterInfo() throws SolrServerException, IOException {
            SolrQuery params = new SolrQuery();
            // Route the request to the clustering handler (sets the "qt" parameter).
            params.setRequestHandler("/clustering");
            // Match all documents.
            params.setQuery("*:*");
            params.setStart(0);
            params.setRows(30);

            QueryResponse queryResponse = solr.query(params);
            ClusteringResponse clusteringResponse = queryResponse.getClusteringResponse();
            if (clusteringResponse == null) {
                // The response carries no clusters section, e.g. the clustering
                // component is disabled or not attached to the handler.
                System.err.println("No clustering information in the response; "
                        + "check that the clustering component is enabled.");
                return;
            }

            List<Cluster> clusters = clusteringResponse.getClusters();
            for (Cluster cluster : clusters) {
                // Cluster labels.
                for (String label : cluster.getLabels()) {
                    System.out.println(label);
                }
                // Ids of the documents assigned to this cluster.
                for (String docId : cluster.getDocs()) {
                    System.out.println("        " + docId);
                }
            }
        }

    }

        查询结果如下:

      

        

  • 相关阅读:
    oracle 当行函数 日期
    veridata实验举例(1)验证TCUSTMER与TCUSTORD两节点同步情况
    sdut1730 数字三角形问题(dp入门题)
    Android4.0 Design之UI设计易犯的错误2
    怎样提高团队管理能力6
    Effective C++ 29-33
    内存补齐序列一:关于内存对齐和填充
    【 D3.js 入门系列 --- 10.1 】 简化 GeoJSON 文件
    Android TrafficStats类的使用
    新手上路:Laravel-控制器基础
  • 原文地址:https://www.cnblogs.com/shaosks/p/8309149.html
Copyright © 2011-2022 走看看