zoukankan      html  css  js  c++  java
  • elasticsearch搜索提示

    elasticsearch搜索提示(补全)接口需要新增suggest字段并设type为:completion,结合到scrapy,修改es_types.py文件:

    from datetime import datetime
    from elasticsearch_dsl import DocType, Date, Nested, Boolean, analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
    from elasticsearch_dsl.connections import connections
    # Open the default elasticsearch-dsl connection to the local ES node;
    # DocType subclasses below use it implicitly.
    connections.create_connection(hosts=['localhost'])
    class ArticleType(DocType):
        # Article document type (first, broken attempt — kept as the article's
        # illustration of the problem).
        suggest = Completion(analyzer="ik_max_word") # NOTE: passing the ik analyzer name as a plain string raises here, due to an issue in the elasticsearch-dsl source
        title = Text(analyzer="ik_max_word")          # full-text field, ik Chinese word segmentation
        create_date = Date()
        praise_nums = Integer()
        fav_nums = Integer()
        comment_nums = Integer()
        tags = Text(analyzer="ik_max_word")
        front_image_url = Keyword()                    # exact-value fields, not analyzed
        url_object_id = Keyword()
        front_image_path = Keyword()
        url = Keyword()
        content = Text(analyzer="ik_max_word")
    
        class Meta:
            # Target index and mapping type for this document class.
            index = 'jobbole'
            doc_type = 'article'
    if __name__ == '__main__':
        ArticleType.init()  # create/update the index mapping on the ES server

    解决办法:自定义CustomAnalyzer类,继承自elasticsearch_dsl.analysis下的CustomAnalyzer类:

    from datetime import datetime
    # NOTE: the original text lost the line-continuation here (a bare trailing
    # comma with no backslash/parentheses is a SyntaxError); restored with a
    # parenthesized import.
    from elasticsearch_dsl import (DocType, Date, Nested, Boolean,
                                   analyzer, InnerObjectWrapper, Completion,
                                   Keyword, Text, Integer)
    
    from elasticsearch_dsl.analysis import CustomAnalyzer as _CustomAnalyzer
    
    from elasticsearch_dsl.connections import connections
    connections.create_connection(hosts=["localhost"])
    
    class CustomAnalyzer(_CustomAnalyzer):
        """Analyzer whose client-side analysis definition is intentionally empty.

        The ik analyzers are installed server-side as an Elasticsearch plugin,
        so the client must not try to (re)define them. Returning an empty
        mapping works around the elasticsearch-dsl error raised when a
        Completion field is given a bare analyzer name (see note above).
        """

        def get_analysis_definition(self):
            # Nothing to declare from the client: the analyzer lives on the server.
            return {}
    
    ik_analyzer = CustomAnalyzer("ik_max_word", filter=["lowercase"])  # lowercase filter so searches ignore case
    class ArticleType(DocType):
        # Jobbole (伯乐在线) article document type — fixed version.
        suggest = Completion(analyzer=ik_analyzer)  # pass the wrapper object, not a plain string, to avoid the error noted above
        title = Text(analyzer="ik_max_word")        # full-text fields use ik Chinese segmentation
        create_date = Date()
        url = Keyword()                              # exact-value fields, not analyzed
        url_object_id = Keyword()
        front_image_url = Keyword()
        front_image_path = Keyword()
        praise_nums = Integer()
        comment_nums = Integer()
        fav_nums = Integer()
        tags = Text(analyzer="ik_max_word")
        content = Text(analyzer="ik_max_word")
    
        class Meta:
            # Target index and mapping type for this document class.
            index = "jobbole"
            doc_type = "article"
    
    if __name__ == "__main__":
        ArticleType.init()  # create/update the index mapping on the ES server

    在item中生成搜索建议词:

    from spider.models.es_types import ArticleType
    from elasticsearch_dsl.connections import connections
    # Reuse the connection alias configured on ArticleType so the item code
    # talks to the same ES cluster as the document class.
    es = connections.create_connection(ArticleType._doc_type.using)
    def gen_suggests(index, info_tuple):
        """Build the completion-suggest payload for one document.

        Args:
            index: name of the ES index whose analyzer tokenizes the text.
            info_tuple: iterable of (text, weight) pairs; process higher-weight
                fields first so their tokens win the dedup.

        Returns:
            A list of ``{'input': [...], 'weight': w}`` dicts suitable for a
            Completion field.
        """
        used_words = set()  # tokens already emitted, for cross-field dedup
        suggests = []
        for text, weight in info_tuple:
            if text:
                # Let ES analyze the string (ik segmentation + lowercase filter).
                words = es.indices.analyze(index=index, analyzer="ik_max_word", params={'filter':["lowercase"]}, body=text)
                # Keep tokens longer than one character; single chars make poor suggestions.
                analyzed_words = {r["token"] for r in words["tokens"] if len(r["token"]) > 1}
                new_words = analyzed_words - used_words
                # BUG FIX: the original never updated used_words, so the dedup
                # set stayed empty and later (lower-weight) fields repeated the
                # same tokens.
                used_words.update(new_words)
            else:
                new_words = set()
    
            if new_words:
                suggests.append({'input': list(new_words), 'weight': weight})
        return suggests
    
    
    class JobboleArticleItem(scrapy.Item):
        # Scrapy item mirroring the ArticleType document fields.
        title = scrapy.Field()
        create_date = scrapy.Field(input_processor=MapCompose(date_convert))      # parse the scraped date string
        praise_nums = scrapy.Field(input_processor=MapCompose(number_convert))    # extract the integer counts
        fav_nums = scrapy.Field(input_processor=MapCompose(number_convert))
        comment_nums = scrapy.Field(input_processor=MapCompose(number_convert))
        tags = scrapy.Field(input_processor=MapCompose(remove_comment_tags), output_processor=Join(','))  # drop "N comments" pseudo-tags, join the rest
        front_image_url = scrapy.Field(output_processor=MapCompose(returnValue))  # keep as list for the image pipeline
        url_object_id = scrapy.Field(input_processor=MapCompose(get_md5))         # md5 of the url, used as the ES _id
        front_image_path = scrapy.Field()
        url = scrapy.Field()
        content = scrapy.Field()

    def save_to_elasticsearch(self):
        """Copy this item's fields onto an ArticleType doc and index it in ES.

        NOTE: the original text had this whole method collapsed onto one
        physical line (a SyntaxError as written); reconstructed here with
        conventional formatting, statements unchanged.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])  # remove_tags() strips the HTML markup
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        # Use the md5 url hash as the ES _id so re-crawls overwrite, not duplicate.
        article.meta.id = self['url_object_id']
        # Build completion-suggest data: title weighted 10, tags weighted 7.
        article.suggest = gen_suggests(ArticleType._doc_type.index,
                                       ((article.title, 10), (article.tags, 7)))
        article.save()  # index the document
        return
  • 相关阅读:
    Json数据解析
    Fragment
    android dom解析相关理解
    数据解析--sax解析
    android pull解析相关理解
    Android-通过URL获取网络资源
    HttpURLConnection
    Android平台关于时间和日期的相关类和方法(Date/Time)
    Asynctask onPostExecute未执行的问题分析
    windows下构建发布python模块(转载)
  • 原文地址:https://www.cnblogs.com/jp-mao/p/6937260.html
Copyright © 2011-2022 走看看