1.官网直接安装
官网下载:https://www.elastic.co/cn/downloads/elasticsearch
mac安装es:
下载并解压后,运行bin文件夹下的elasticsearch脚本(./bin/elasticsearch)启动es,注意jdk版本要正确。因为我电脑有两个jdk版本,默认是jdk7,需要先切换到es支持的jdk版本。
执行vim ~/.bash_profile
按i进入编辑(insert)模式,修改后按esc退出insert模式,再输入:wq保存并退出vim,最后执行source ~/.bash_profile使修改生效
安装成功,访问http://localhost:9200/
2.docker安装
mac安装docker:https://www.runoob.com/docker/macos-docker-install.html
3.安装kibana
下载地址:https://www.elastic.co/cn/downloads/kibana
安装:打开bin文件夹,打开文件kibana
安装遇到错误:
Error: getaddrinfo ENOTFOUND localhost,是由于localhost没有绑定到127.0.0.1;在/etc/hosts文件中添加一行"127.0.0.1 localhost"后重新启动kibana即可
启动后,在浏览器上打开 http://localhost:5601/
如果想修改Kibana连接的Elasticsearch地址,或是Kibana自身的端口5601,可以在Kibana目录下的config下面的kibana.yml文件中进行修改;
4.es分词器安装
安装指南:https://github.com/medcl/elasticsearch-analysis-ik
两种安装方式:
a.下载解压后安装,下载地址:https://github.com/medcl/elasticsearch-analysis-ik/releases
b.直接命令安装:./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v7.9.0/elasticsearch-analysis-ik-7.9.0.zip
安装完重启es(注意:ik分词器的版本必须与es的版本完全一致,否则插件无法安装或加载)
5.term vectors
官网文档:https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-termvectors.html#docs-termvectors-api-term-info
PUT /my-index-000001
{
  "mappings": {
    "properties": {
      "text": {
        "type": "text",
        "term_vector": "with_positions_offsets_payloads",
        "store": true,
        "analyzer": "fulltext_analyzer"
      },
      "fullname": {
        "type": "text",
        "term_vector": "with_positions_offsets_payloads",
        "analyzer": "fulltext_analyzer"
      }
    }
  },
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0
    },
    "analysis": {
      "analyzer": {
        "fulltext_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "type_as_payload"
          ]
        }
      }
    }
  }
}

PUT /my-index-000001/_doc/1
{
  "fullname": "John Doe",
  "text": "test test test "
}

PUT /my-index-000001/_doc/2?refresh=wait_for
{
  "fullname": "Jane Doe",
  "text": "Another test ..."
}

PUT /my-index-000001/_doc/3?refresh=wait_for
{
  "fullname": "huyanxia liangming",
  "text": "test Another baby ..."
}

GET /my-index-000001/_termvectors/1
{
  "fields": ["text"],
  "offsets": true,
  "payloads": true,
  "positions": true,
  "term_statistics": true,
  "field_statistics": true
}

GET /my-index-000001/_termvectors
{
  "doc": {
    "fullname": "John Doe diannao",
    "text": "test test test"
  },
  "filter": {
    "max_num_terms": 3,
    "min_term_freq": 1,
    "min_doc_freq": 1
  }
}
6.聚合计算,es版本7.9.1
PUT /user_profiles1
{
"settings": {
"index": {
"number_of_shards": "32",
"number_of_replicas": "1"
}
},
"mappings": {
"properties": {
"type": {
"type": "keyword"
},
"user_id": {
"type": "keyword"
},
"item_id": {
"type": "keyword"
},
"boost": {
"type": "double"
},
"created": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"keywords": {
"type": "nested",
"properties": {
"word": {
"type": "keyword"
},
"weight": {
"type": "double"
}
}
}
}
}
}
PUT /user_profiles1/_doc/1_1_1001
{
"type": "1",
"user_id": "1",
"item_id": "1001",
"factor": 1.2,
"created" : "2020-09-07 14:54:37",
"keywords": [
{
"word": "中国",
"weight": 3.2
},
{
"word": "美国",
"weight": 1.4
}
]
}
PUT /user_profiles1/_doc/1_1_1002
{
"type": "1",
"user_id": "1",
"item_id": "1002",
"factor": 1.2,
"created" : "2020-09-07 14:54:37",
"keywords": [
{
"word": "中国辅导费",
"weight": 6.2
},
{
"word": "美国当时的",
"weight": 1.9
}
]
}
POST /user_profiles1/_search
{
"query": {
"bool": {
"must": [{
"terms": {
"type": [
"1"
]
}
},
{
"term": {
"user_id": {
"value": "1"
}
}
},
{
"range": {
"created": {
"gte": "2020-09-07 14:54:37"
}
}
}
]
}
},
"size": 0,
"aggs": {
"agg_keywords": {
"nested": {
"path": "keywords"
},
"aggs": {
"agg_word": {
"terms": {
"field": "keywords.word",
"order": {
"agg_score": "desc"
},
"size": 2 //决定返回大小
},
"aggs": {
"agg_score": {
"sum": {
"field": "keywords.weight"
}
}
}
}
}
}
}
}
7.从本地读取文件
8.termVector es2.1
//第一种 TermVectorsResponse termVectorResponse = ElasticSearchUtils.getEsClient() .prepareTermVectors() .setIndex("knowledge_items") .setType("knowledge_items") .setId(itemId) .setSelectedFields("content") .setTermStatistics(true) .setFieldStatistics(false) .setOffsets(false) .setPayloads(false) .setPositions(false) .execute() .actionGet(); //第二种 TermVectorsRequest termVectorsRequest = new TermVectorsRequest(); //设置参数 ElasticSearchUtils.getEsClient().termVectors(termVectorsRequest).actionGet();
结果json化输出
try { XContentBuilder builder = XContentFactory.jsonBuilder(); builder.startObject(); termVectorResponse.toXContent(builder, ToXContent.EMPTY_PARAMS); builder.endObject(); System.out.println("json termVectorResponse:" + builder.string()); } catch (IOException e) { e.printStackTrace(); }
结果遍历
// Walk every field of the term-vectors response and derive a TF-IDF based weight per term.
// Relies on termVectorResponse, itemBoost, userActionTypeEnum and keyWordEntities
// being defined by the surrounding code.
try {
    Fields fields = termVectorResponse.getFields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        int docCount = terms.getDocCount(); // from field_statistics
        TermsEnum termsEnum = terms.iterator();
        int currentTotalTermFreq = 0;
        List<TermInfoEntity> termInfoEntities = new ArrayList<>();

        BytesRef term;
        while ((term = termsEnum.next()) != null) { // one iteration per term
            String termName = term.utf8ToString();
            // Skip terms that NumUtils.isNum accepts and single-character terms.
            if (NumUtils.isNum(termName) || termName.length() == 1) {
                // Pass the term as a logging argument so the {} placeholder is filled in.
                LOG.info("termName filter:{}", termName);
                continue;
            }
            int docFreq = termsEnum.docFreq();
            int termFreq = termsEnum.postings(null, PostingsEnum.FREQS).freq();
            currentTotalTermFreq += termFreq;
            termInfoEntities.add(new TermInfoEntity(termName, termFreq, docFreq));
        }

        // Copies of the locals so that the lambda below can capture them.
        int finalCurrentTotalTermFreq = currentTotalTermFreq;
        double finalItemBoost = itemBoost;

        // Compute the tf-idf of every collected term.
        termInfoEntities.forEach(termInfoEntity -> {
            double tf = (double) termInfoEntity.getTermFreq() / (double) finalCurrentTotalTermFreq;
            // Divide as doubles: integer division would truncate the ratio before the logarithm.
            double idf = Math.log10((double) docCount / termInfoEntity.getDocFreq()) + 1;
            double tfIDf = NumUtils.doubleValueScale(6, tf * idf);
            KeyWordEntity keyWordEntity = new KeyWordEntity(
                    termInfoEntity.getTermName(),
                    tfIDf * userActionTypeEnum.getBoost() * finalItemBoost);
            keyWordEntities.add(keyWordEntity);
        });
    }
} catch (IOException e) {
    LOG.error("es termVectorResponse 遍历失败:", e);
}