准备篇
- 安装: Elasticsearch
- 连接: ElasticSearch Head
- 建立索引: 详见文末
1. 安装依赖
pip install elasticsearch
2. 建立连接
from elasticsearch import Elasticsearch
# Create the client for a single node, authenticating with HTTP basic auth.
es = Elasticsearch(
    ["192.168.1.84"],
    http_auth=("elastic", "elastic"),
    port=9200,
)
3. 写入数据
# Index a single review document; its news_id doubles as the document id.
doc = {
    'id': 1,
    'lv_id': 12,
    'sentiment': 0,
    'news_id': 1673578,
    'review': '错字连篇,受不了,还真的看完了[笑着哭]',
    'keyword': '受不了 错字连篇',
    'ner': '',
}
res = es.index(
    index="match_review",
    doc_type='review_feature',
    id=doc['news_id'],
    body=doc,
)
# Show the 'result' field of the index response.
print(res['result'])
4. 批量写入
from elasticsearch import helpers
# Bulk-index many documents with a single helpers.bulk call.
data = [
    {
        'id': 1,
        'lv_id': 12,
        'sentiment': 0,
        'news_id': 1673578,
        'review': '错字连篇,受不了,还真的看完了[笑着哭]',
        'keyword': '受不了 错字连篇',
        'ner': '',
    },
    # Add further documents with the same keys here. (A literal `...` in the
    # list would be a real Ellipsis element and break doc["news_id"] below.)
]
# One bulk action per document: target index/type, the document id
# (its news_id) and the document itself as the source.
actions = [
    {
        "_index": "match_review",
        "_type": "review_feature",
        "_id": doc["news_id"],
        "_source": doc,
    }
    for doc in data
]
helpers.bulk(es, actions)
5. 根据id查询
# Look up a single document by its id.
news_id = 1673578
res = es.get(
    index="match_review",
    doc_type='review_feature',
    id=news_id,
)
6. 查询全部
# Fetch every document in the index with the scroll API, 100 hits per page.
query = es.search(
    index="match_review",
    body={"query": {"match_all": {}}},
    scroll='5m',
    size=100,
)
res = query['hits']['hits']  # first page of hits; later pages are appended
# Total number of matching documents. NOTE: on Elasticsearch 7+ this is an
# object with a "value" field rather than a plain integer, so it is no longer
# used to compute the number of pages.
total = query['hits']['total']
scroll_id = query['_scroll_id']  # cursor used to fetch the remaining pages

query_scroll = res
# Keep scrolling until a page comes back empty instead of guessing the page
# count from `total`.
while query_scroll:
    # The scroll parameter must be passed on every call, otherwise the
    # request fails.
    scroll_page = es.scroll(scroll_id=scroll_id, scroll='5m')
    # Always continue from the most recently returned scroll id.
    scroll_id = scroll_page['_scroll_id']
    query_scroll = scroll_page['hits']['hits']
    res += query_scroll

# Release the server-side search context instead of waiting for the 5m timeout.
es.clear_scroll(scroll_id=scroll_id)
7. 按条件搜索
# Conditional search: filter on lv_id and sentiment, require at least one
# ner match, and return the hits in random order.
body = {
    "query": {
        "bool": {
            # lv_id must be equal, and sentiment must lie in the range -1..1
            "must": [
                {"term": {"lv_id": lv_id}},
                {"range": {"sentiment": {"gte": -1, "lte": 1}}},
            ],
            # ner has to match at least one of the listed values
            "should": [
                {"match": {"ner": entity}} for entity in ["中国", "美国"]
            ],
            "minimum_should_match": 1,
        },
    },
    # Random ordering: sort by a script that returns Math.random()
    "sort": [
        {
            "_script": {
                "script": {"source": "Math.random()", "lang": "painless"},
                "type": "number",
                "order": "asc",
            },
        },
    ],
}
res = es.search(index="match_review", body=body)
附录
自定义索引语句(指定分词方式)
{ "settings":{
"analysis":{
"analyzer":{
"my_lowercase_analyzer":{
"type":"custom",
"tokenizer":"whitespace",
"filter":[
"lowercase"
]
}
}
}
},
"mappings":{
"review_feature":{
"properties":{
"id": {
"type": "integer"
},
"keyword": {
"type": "text",
"analyzer":"my_lowercase_analyzer"
},
"lv_id": {
"type": "integer"
},
"ner": {
"type": "text",
"analyzer":"my_lowercase_analyzer",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"news_id": {
"type": "integer"
},
"review": {
"type": "text"
},
"sentiment": {
"type": "integer"
}
}
}
}
}