明日更新文字。
建立反向索引
基于文件建立单词到文档的反向索引，使用 Redis 集合（set）存储。
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import jieba
import codecs
import redis
import uuid


# Word segmentation
def cut_words(file):
    """Read *file* as UTF-8 text and segment it with jieba's search mode.

    Args:
        file: Path of the text file to segment.

    Returns:
        The iterable of tokens produced by ``jieba.cut_for_search``.
    """
    with open(file, 'r', encoding="utf-8") as f:
        text = f.read()
    return jieba.cut_for_search(text)


# Stop-word removal
def drop_Disable_Words(cut_res, stopwords):
    """Keep only the tokens that are worth indexing.

    A token is kept when it is longer than two characters and is neither a
    stop word nor a blank token (ASCII space or the ideographic space
    U+3000).

    Args:
        cut_res: Iterable of tokens, e.g. the result of ``cut_words``.
        stopwords: Iterable of stop words to discard.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan over the stop-word list.
    blocked = set(stopwords)
    # The blank-token check previously compared against the literal text
    # "u3000" (escape backslash lost); the ideographic space is what is meant.
    blocked.update((" ", "\u3000"))
    return [word for word in cut_res if len(word) > 2 and word not in blocked]


# Stop-word loading
def read_stop_word(file_path):
    """Load the stop-word list from a UTF-8 file, one word per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        A list with each line of the file stripped of surrounding whitespace.
    """
    # The context manager closes the handle, which the bare codecs.open()
    # call used to leak.
    with codecs.open(file_path, 'r', encoding='utf8') as handle:
        lines = handle.readlines()
    return [line.strip() for line in lines]


# Inverted-index construction
def index_document(conn, docid, keywords):
    """Add *docid* to the Redis set of every keyword.

    NOTE(review): keys are the bare keywords here, whereas the indexing
    script below stores them under an 'idx:' prefix -- confirm which key
    layout callers expect.

    Args:
        conn: Redis client providing ``pipeline()``.
        docid: Identifier stored in each keyword's set.
        keywords: Iterable of keywords to index the document under.

    Returns:
        The number of replies returned by the pipeline, i.e. one per keyword.
    """
    # A transactional pipeline sends all SADD commands as one atomic batch.
    pipe = conn.pipeline(True)
    for keyword in keywords:
        pipe.sadd(keyword, docid)
    return len(pipe.execute())


def _set_conmon(conn, method, names, ttl=30, execute=True):
    """Unfinished draft; only generates a UUID string and discards it.

    NOTE(review): the name looks like a typo of ``_set_common``; kept so
    that any existing reference keeps resolving. Always returns None.
    """
    _unused_id = str(uuid.uuid4())


# Running this as a script indexes the sample corpus; the guard keeps an
# import of this module from touching the file system or Redis.
if __name__ == "__main__":
    # Source corpus and stop-word list.
    files = ['file1.txt', 'file2.txt']
    stopwords = read_stop_word("stop_word.txt")
    dic = {}  # unused in the code visible here

    # Segment every file and drop the stop words.
    corpus = []
    for file in files:
        cut_res = cut_words(file)
        res = drop_Disable_Words(cut_res, stopwords)
        corpus.append(res)
    print(corpus)

    # NOTE(review): the Redis password is hard-coded in source; consider
    # loading it from configuration or the environment instead.
    pool = redis.ConnectionPool(host='localhost', password='lin@Wen.', port=6379, decode_responses=True)
    conn = redis.Redis(connection_pool=pool)

    # Queue one SADD per (word, file) pair and send them as a single batch.
    pipeline = conn.pipeline(True)
    for doc_name, words in zip(files, corpus):
        for word in words:
            pipeline.sadd('idx:' + word, doc_name)
    print(len(pipeline.execute()))
对单词进行搜索
# Search
def _set_common(conn, method, names, ttl=30, execute=True):
    """Run a Redis set-store command over indexed keys into a temporary key.

    Each name is prefixed with 'idx:' and the result is stored under
    'idx:<uuid>', which is given an expiry of *ttl* seconds.

    Args:
        conn: Redis client. When *execute* is false the commands are issued
            on this object directly, so a caller may pass its own pipeline
            and execute it later.
        method: Name of the store command to call, e.g. 'sinterstore'.
        names: Iterable of un-prefixed key names to combine.
        ttl: Lifetime of the result key in seconds.
        execute: When true, run the commands in a transactional pipeline
            and print the pipeline's replies.

    Returns:
        The UUID string identifying the result key (without the prefix).
    """
    result_id = str(uuid.uuid4())
    # Previously a fresh pipeline was always created, so with execute=False
    # the queued commands were dropped and the returned id named a key that
    # never existed. Issue them on the caller's connection instead.
    target = conn.pipeline(True) if execute else conn
    source_keys = ['idx:' + name for name in names]
    getattr(target, method)('idx:' + result_id, *source_keys)
    target.expire('idx:' + result_id, ttl)
    if execute:
        print(target.execute())
    return result_id


# Intersection
def intersect(conn, items, ttl=30, _execute=True):
    """Store the intersection of the indexed sets for *items*; return its id."""
    return _set_common(conn, 'sinterstore', items, ttl, _execute)


# Union
def union(conn, items, ttl=30, _execute=True):
    """Store the union of the indexed sets for *items*; return its id."""
    return _set_common(conn, 'sunionstore', items, ttl, _execute)


# Difference
def difference(conn, items, ttl=30, _execute=True):
    """Store the difference of the indexed sets for *items*; return its id."""
    return _set_common(conn, 'sdiffstore', items, ttl, _execute)


# Running this as a script performs a sample search; the guard keeps an
# import of this module from connecting to Redis.
if __name__ == "__main__":
    names = ["DirectX", "Unity3D", "STL"]
    # NOTE(review): the Redis password is hard-coded in source; consider
    # loading it from configuration or the environment instead.
    pool = redis.ConnectionPool(host='localhost', password='lin@Wen.', port=6379, decode_responses=True)
    conn = redis.Redis(connection_pool=pool)
    result_id = union(conn, names)
    print(result_id)
    print(conn.smembers('idx:' + result_id))

    # Troubleshooting note:
    # redis.exceptions.ResponseError: WRONGTYPE Operation against a key
    # holding the wrong kind of value
    # Cause: the type of the data stored under the key does not match the
    # command the code uses to read it.
    # print(conn.sunion("idx:DirectX", "idx:STL"))