zoukankan      html  css  js  c++  java
  • Redis实现搜索和排序

    明日更新文字。

    建立反向索引

    基于文件建立单词与文档的反向索引,使用集合存储。

    # # #!/usr/bin/env python
    # # # -*- coding: UTF-8 -*-
    import jieba
    import codecs
    import redis
    import uuid
    # Tokenization
    def cut_words(file):
        """Read a UTF-8 text file and segment its content with jieba.

        Args:
            file: Path of the text file to read.

        Returns:
            Whatever ``jieba.cut_for_search`` returns for the file's full text.
        """
        with open(file, mode='r', encoding="utf-8") as handle:
            content = handle.read()
        # Segmentation only needs the text, not the open file handle.
        return jieba.cut_for_search(content)
    
    # Stop-word removal
    def drop_Disable_Words(cut_res,stopwords):
        """Filter segmented words, dropping short tokens and stop words.

        A word is kept only when it is longer than two characters, is not in
        ``stopwords``, and is not a bare newline or an ideographic space
        (U+3000).

        Args:
            cut_res: Iterable of segmented words.
            stopwords: Iterable of stop words to exclude.

        Returns:
            A list of the surviving words, in their original order.
        """
        # Build the lookup once so each membership test is O(1).
        stopword_set = set(stopwords)
        res = []
        for word in cut_res:
            if len(word) <= 2:
                continue
            if word in stopword_set or word == "\n" or word == "\u3000":
                continue
            res.append(word)
        return res
    
    # Load the stop-word list
    def read_stop_word(file_path):
        """Read a UTF-8 stop-word file and return its lines, stripped.

        Args:
            file_path: Path of the stop-word file, one entry per line.

        Returns:
            A list with one whitespace-stripped string per line of the file.
        """
        # The context manager guarantees the file handle is closed.
        with codecs.open(file_path, 'r', encoding='utf8') as handle:
            return [line.strip() for line in handle.readlines()]
    
    # Build the inverted index
    def index_document(conn,docid,keywords):
        """Add ``docid`` to the index set of every keyword.

        Each keyword is stored under the key ``'idx:' + keyword``, the same
        key layout the search helpers in this file read from.

        Args:
            conn: Redis client; must provide ``pipeline()``.
            docid: Identifier of the document being indexed.
            keywords: Iterable of keywords found in the document.

        Returns:
            The number of results returned by the pipeline, i.e. one per
            queued ``SADD`` command.
        """
        # Queue every SADD on one transactional pipeline and send them together.
        pipe = conn.pipeline(True)
        for keyword in keywords:
            pipe.sadd('idx:' + keyword, docid)
        return len(pipe.execute())
    
    
    def _set_conmon(conn,method,names,ttl = 30,execute = True):
        """Unfinished stub: generate a random UUID string and discard it.

        None of the parameters are used and the function implicitly returns
        ``None``.
        """
        _discarded_key = str(uuid.uuid4())
    
    # Load the raw corpus file names and the stop-word list
    files = ['file1.txt','file2.txt']
    stopwords = read_stop_word("stop_word.txt")

    dic = {}
    # Tokenize every file and strip its stop words
    corpus = []
    for file in files:
        corpus.append(drop_Disable_Words(cut_words(file), stopwords))
    print(corpus)
    # NOTE(review): the connection credentials are hard-coded here.
    pool = redis.ConnectionPool(
        host='localhost',
        password='lin@Wen.',
        port=6379,
        decode_responses=True,
    )
    conn = redis.Redis(connection_pool=pool)
    pipeline = conn.pipeline(True)
    # One pipeline round-trip per document: add the file name to each word's set.
    for doc_name, doc_words in zip(files, corpus):
        for word in doc_words:
            pipeline.sadd('idx:' + word, doc_name)
        print(len(pipeline.execute()))

    对单词进行搜索

    # Search
    def _set_common(conn, method, names, ttl=30, execute=True):
        """Run a Redis set-store command over index keys into a temporary key.

        The result is stored under ``'idx:' + <random uuid>`` and given an
        expiry of ``ttl`` seconds.

        Args:
            conn: Redis client, or an existing pipeline when ``execute`` is False.
            method: Name of the store command to call, e.g. ``'sinterstore'``.
            names: Iterable of index names; each is prefixed with ``'idx:'``.
            ttl: Expiry of the result key, in seconds.
            execute: When True, queue the commands on a new transactional
                pipeline, execute it and print its results. When False, issue
                the commands directly on ``conn`` and do not call ``execute()``.

        Returns:
            The uuid string identifying the result set (without the prefix).
        """
        result_id = str(uuid.uuid4())
        # With execute=False, use the caller's object so the commands are not
        # queued on a throwaway pipeline that nobody ever executes.
        pipeline = conn.pipeline(True) if execute else conn
        source_keys = ['idx:' + name for name in names]
        result_key = 'idx:' + result_id
        getattr(pipeline, method)(result_key, *source_keys)
        pipeline.expire(result_key, ttl)
        if execute:
            print(pipeline.execute())
        return result_id
    # Intersection
    def intersect(conn,items,ttl = 30,_execute=True):
        """Delegate to ``_set_common`` with the ``'sinterstore'`` command.

        Returns whatever ``_set_common`` returns for the given ``items``.
        """
        result_id = _set_common(conn, 'sinterstore', items, ttl=ttl, execute=_execute)
        return result_id
    # Union
    def union(conn,items,ttl = 30,_execute=True):
        """Delegate to ``_set_common`` with the ``'sunionstore'`` command.

        Returns whatever ``_set_common`` returns for the given ``items``.
        """
        result_id = _set_common(conn, 'sunionstore', items, ttl=ttl, execute=_execute)
        return result_id
    # Difference
    def difference(conn,items,ttl = 30,_execute=True):
        """Delegate to ``_set_common`` with the ``'sdiffstore'`` command.

        Returns whatever ``_set_common`` returns for the given ``items``.
        """
        result_id = _set_common(conn, 'sdiffstore', items, ttl=ttl, execute=_execute)
        return result_id
    
    names = ["DirectX","Unity3D","STL"]
    # NOTE(review): the connection credentials are hard-coded here.
    pool = redis.ConnectionPool(
        host='localhost',
        password='lin@Wen.',
        port=6379,
        decode_responses=True,
    )
    conn = redis.Redis(connection_pool=pool)
    # Union the index sets of all names; the result lives under 'idx:' + id.
    id = union(conn, names)
    print(id)
    print(conn.smembers('idx:{}'.format(id)))
    # redis.exceptions.ResponseError: WRONGTYPE Operation against a key holding the wrong kind of value
    # Cause: the data type stored under the key does not match the command
    # the code uses to read it.
    # print(conn.sunion("idx:DirectX", "idx:STL"))
  • 相关阅读:
    Python学习笔记:pip使用技巧
    机器学习笔记:训练集、验证集和测试集区别
    MySQL学习笔记:3道面试题小测
    Python学习笔记:精确的四舍五入
    Hive学习笔记:列转行之collect_list/collect_set/concat_ws
    Python学习笔记:6个代码性能坏习惯
    爬虫学习笔记:打造自己的代理池
    Mysql学习笔记:5.5升级至8.0版本
    机器学习笔记:sklearn.model_selection.train_test_split切分训练、测试集
    HashSet其实就那么一回事儿之源码浅析
  • 原文地址:https://www.cnblogs.com/-wenli/p/13034835.html
Copyright © 2011-2022 走看看