zoukankan      html  css  js  c++  java
  • wukong引擎源码分析之索引——part 3 文档评分:无非就是将docid对应的fields信息存储起来,供搜索结果rank评分使用

    之前的文章分析过,接收并处理索引请求的代码在segmenter_worker.go里:

    复制代码
    // segmenterWorker loops forever over engine.segmenterChannel. For every
    // request it builds the keyword statistics of the document (keyword text,
    // frequency and start positions), sends them to the indexer channel of the
    // request's shard, and then sends the document's scoring fields to the
    // ranker channel of the same shard.
    //
    // Keywords come from segmenting request.data.Content when the segmenter is
    // enabled and the content is non-empty; otherwise they are taken from the
    // caller-supplied request.data.Tokens. Labels are added as extra keywords
    // without positions.
    func (engine *Engine) segmenterWorker() {
        for {
            request := <-engine.segmenterChannel

            // Keyword text -> start positions collected for that keyword.
            tokensMap := make(map[string][]int)
            numTokens := 0
            if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
                // The document has content: derive the keywords by segmenting it.
                segments := engine.segmenter.Segment([]byte(request.data.Content))
                for _, segment := range segments {
                    token := segment.Token().Text()
                    if !engine.stopTokens.IsStopToken(token) {
                        tokensMap[token] = append(tokensMap[token], segment.Start())
                    }
                }
                // The document length counts every segment, stop tokens included.
                numTokens = len(segments)
            } else {
                // Otherwise use the keywords supplied by the caller.
                for _, t := range request.data.Tokens {
                    if !engine.stopTokens.IsStopToken(t.Text) {
                        tokensMap[t.Text] = t.Locations
                    }
                }
                numTokens = len(request.data.Tokens)
            }

            // Add the document labels, which are not produced by segmentation.
            for _, label := range request.data.Labels {
                // Stop-token filtering applies to labels only when the segmenter is in use.
                if !engine.initOptions.NotUsingSegmenter && engine.stopTokens.IsStopToken(label) {
                    continue
                }
                // Only insert the label when it is not already a keyword; overwriting
                // would discard the positions (and therefore the frequency) that were
                // collected for the same text above.
                if _, exists := tokensMap[label]; !exists {
                    tokensMap[label] = []int{}
                }
            }

            indexerRequest := indexerAddDocumentRequest{
                document: &types.DocumentIndex{
                    DocId:       request.docId,
                    TokenLength: float32(numTokens),
                    Keywords:    make([]types.KeywordIndex, len(tokensMap)),
                },
            }
            iTokens := 0
            for text, starts := range tokensMap {
                indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
                    Text: text,
                    // Pure labels carry no positions, so their frequency is 0 and
                    // they do not take part in the tf-idf computation.
                    Frequency: float32(len(starts)),
                    Starts:    starts,
                }
                iTokens++
            }

            // One buffered channel is shared by the indexer request and the ranker
            // request for this document. NOTE(review): presumably the indexer signals
            // on it once the document is processed and the ranker waits for that
            // signal; confirm against the indexer worker.
            dealDocInfoChan := make(chan bool, 1)

            indexerRequest.dealDocInfoChan = dealDocInfoChan
            engine.indexerAddDocumentChannels[request.shard] <- indexerRequest

            rankerRequest := rankerAddDocRequest{
                docId:           request.docId,
                fields:          request.data.Fields,
                dealDocInfoChan: dealDocInfoChan,
            }
            engine.rankerAddDocChannels[request.shard] <- rankerRequest
        }
    }
    复制代码

    上面代码的作用就是统计词频和单词位置(注意:tag也作为可搜索的单词,不过其词频是0,因此不参与tf-idf计算),并封装为indexerRequest,发送给engine.indexerAddDocumentChannels[request.shard]。

    此外,上面代码末尾构造rankerRequest并发送到engine.rankerAddDocChannels[request.shard]的部分(原文中标红)是在为文档评分做准备,对应的处理代码在engine/ranker_worker.go:

    // rankerAddDocWorker loops forever over the ranker add-document channel of
    // the given shard. Each request is handed to that shard's ranker via AddDoc;
    // when persistent storage is enabled, the resulting document info is then
    // forwarded to the shard's persistent-storage channel as an "info" request.
    func (engine *Engine) rankerAddDocWorker(shard int) {
        for {
            req := <-engine.rankerAddDocChannels[shard]
            info := engine.rankers[shard].AddDoc(req.docId, req.fields, req.dealDocInfoChan)

            // Nothing to persist unless persistent storage is switched on.
            if !engine.initOptions.UsePersistentStorage {
                continue
            }

            engine.persistentStorageIndexDocumentChannels[shard] <- persistentStorageIndexDocumentRequest{
                typ:     "info",
                docId:   req.docId,
                docInfo: info,
            }
        }
    }

    AddDoc无非就是将docid对应的fields信息存储起来,为搜索结果rank评分用!

    // AddDoc stores the scoring fields of a document in the ranker and returns
    // the document's info record.
    //
    // It terminates the program via log.Fatal if the ranker has not been
    // initialized, and blocks until a value can be received from
    // dealDocInfoChan before touching the shard. A document seen for the first
    // time gets a new DocInfo and increments NumDocuments; for a known document
    // only Fields is replaced.
    func (ranker *Ranker) AddDoc(docId uint64, fields interface{}, dealDocInfoChan <-chan bool) *types.DocInfo {
        if !ranker.initialized {
            log.Fatal("排序器尚未初始化")
        }

        // Wait for the indexer to finish handling this document.
        <-dealDocInfoChan

        shard := &ranker.DocInfosShard
        shard.Lock()
        defer shard.Unlock()

        info, known := shard.DocInfos[docId]
        if !known {
            info = new(types.DocInfo)
            shard.DocInfos[docId] = info
            shard.NumDocuments++
        }
        info.Fields = fields
        return info
    }
  • 相关阅读:
    第一个Android应用 扫描宝 欲挑战传统扫描枪
    前端工程师在实现支付功能的时候能做些什么(V客学院技术分享)?
    HBuilder android 打包指南(V客学院技术分享)
    JavaScript 事件处理详解
    关于webpack的path和publicPath。
    svg线条的动画到渐变
    vue目录结构及其对应作用
    数据改变视图未变问题解决(Object.assign)
    ES6语法的简单介绍——拓展运算符
    webpack打包原理
  • 原文地址:https://www.cnblogs.com/bonelee/p/6582369.html
Copyright © 2011-2022 走看看