zoukankan      html  css  js  c++  java
  • indexer.go

    package core

    import (
        "log"
        "math"
        "sort"
        "sync"

        "github.com/huichen/wukong/types"
        "github.com/huichen/wukong/utils"
    )

    // 索引器
    type Indexer struct {
        // 从搜索键到文档列表的反向索引
        // 加了读写锁以保证读写安全
        tableLock struct {
            sync.RWMutex
            table     map[string]*KeywordIndices
            docsState map[uint64]int // nil: 表示无状态记录,0: 存在于索引中,1: 等待删除,2: 等待加入
        }
        addCacheLock struct {
            sync.RWMutex
            addCachePointer int
            addCache        types.DocumentsIndex
        }
        removeCacheLock struct {
            sync.RWMutex
            removeCachePointer int
            removeCache        types.DocumentsId
        }

        initOptions types.IndexerInitOptions
        initialized bool

        // 这实际上是总文档数的一个近似
        numDocuments uint64

        // 所有被索引文本的总关键词数
        totalTokenLength float32

        // 每个文档的关键词长度
        docTokenLengths map[uint64]float32
    }

    // 反向索引表的一行,收集了一个搜索键出现的所有文档,按照DocId从小到大排序。
    type KeywordIndices struct {
        // 下面的切片是否为空,取决于初始化时IndexType的值
        docIds      []uint64  // 全部类型都有
        frequencies []float32 // IndexType == FrequenciesIndex
        locations   [][]int   // IndexType == LocationsIndex
    }

    // 初始化索引器
    func (indexer *Indexer) Init(options types.IndexerInitOptions) {
        if indexer.initialized == true {
            log.Fatal("索引器不能初始化两次")
        }
        options.Init()
        indexer.initOptions = options
        indexer.initialized = true

        indexer.tableLock.table = make(map[string]*KeywordIndices)
        indexer.tableLock.docsState = make(map[uint64]int)
        indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
        indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
        indexer.docTokenLengths = make(map[uint64]float32)
    }

    // 从KeywordIndices中得到第i个文档的DocId
    func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
        return ti.docIds[i]
    }

    // 得到KeywordIndices中文档总数
    func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
        return len(ti.docIds)
    }

    //  ADDCACHE 中加入一个文档
    func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {
        if indexer.initialized == false {
            log.Fatal("索引器尚未初始化")
        }

        indexer.addCacheLock.Lock()
        if document != nil {
            indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document
            indexer.addCacheLock.addCachePointer++
        }
        if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {
            indexer.tableLock.Lock()
            position := 0
            for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {
                docIndex := indexer.addCacheLock.addCache[i]
                if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState <= 1 {
                    // ok && docState == 0 表示存在于索引中,需先删除再添加
                    // ok && docState == 1 表示不一定存在于索引中,等待删除,需先删除再添加
                    if position != i {
                        indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =
                            indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]
                    }
                    if docState == 0 {
                        indexer.removeCacheLock.Lock()
                        indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =
                            docIndex.DocId
                        indexer.removeCacheLock.removeCachePointer++
                        indexer.removeCacheLock.Unlock()
                        indexer.tableLock.docsState[docIndex.DocId] = 1
                        indexer.numDocuments--
                    }
                    position++
                } else if !ok {
                    indexer.tableLock.docsState[docIndex.DocId] = 2
                }
            }

            indexer.tableLock.Unlock()
            if indexer.RemoveDocumentToCache(0, forceUpdate) {
                // 只有当存在于索引表中的文档已被删除,其才可以重新加入到索引表中
                position = 0
            }

            addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]
            indexer.addCacheLock.addCachePointer = position
            indexer.addCacheLock.Unlock()
            sort.Sort(addCachedDocuments)
            indexer.AddDocuments(&addCachedDocuments)
        } else {
            indexer.addCacheLock.Unlock()
        }
    }

    // 向反向索引表中加入 ADDCACHE 中所有文档
    func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
        if indexer.initialized == false {
            log.Fatal("索引器尚未初始化")
        }

        indexer.tableLock.Lock()
        defer indexer.tableLock.Unlock()
        indexPointers := make(map[string]int, len(indexer.tableLock.table))

        // DocId 递增顺序遍历插入文档保证索引移动次数最少
        for i, document := range *documents {
            if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {
                // 如果有重复文档加入,因为稳定排序,只加入最后一个
                continue
            }
            if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {
                // 如果此时 docState 仍为 1,说明该文档需被删除
                // docState 合法状态为 nil & 2,保证一定不会插入已经在索引表中的文档
                continue
            }

            // 更新文档关键词总长度
            if document.TokenLength != 0 {
                indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
                indexer.totalTokenLength += document.TokenLength
            }

            docIdIsNew := true
            for _, keyword := range document.Keywords {
                indices, foundKeyword := indexer.tableLock.table[keyword.Text]
                if !foundKeyword {
                    // 如果没找到该搜索键则加入
                    ti := KeywordIndices{}
                    switch indexer.initOptions.IndexType {
                    case types.LocationsIndex:
                        ti.locations = [][]int{keyword.Starts}
                    case types.FrequenciesIndex:
                        ti.frequencies = []float32{keyword.Frequency}
                    }
                    ti.docIds = []uint64{document.DocId}
                    indexer.tableLock.table[keyword.Text] = &ti
                    continue
                }

                // 查找应该插入的位置,且索引一定不存在
                position, _ := indexer.searchIndex(
                    indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
                indexPointers[keyword.Text] = position
                switch indexer.initOptions.IndexType {
                case types.LocationsIndex:
                    indices.locations = append(indices.locations, []int{})
                    copy(indices.locations[position+1:], indices.locations[position:])
                    indices.locations[position] = keyword.Starts
                case types.FrequenciesIndex:
                    indices.frequencies = append(indices.frequencies, float32(0))
                    copy(indices.frequencies[position+1:], indices.frequencies[position:])
                    indices.frequencies[position] = keyword.Frequency
                }
                indices.docIds = append(indices.docIds, 0)
                copy(indices.docIds[position+1:], indices.docIds[position:])
                indices.docIds[position] = document.DocId
            }

            // 更新文章状态和总数
            if docIdIsNew {
                indexer.tableLock.docsState[document.DocId] = 0
                indexer.numDocuments++
            }
        }
    }

    //  REMOVECACHE 中加入一个待删除文档
    // 返回值表示文档是否在索引表中被删除
    func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
        if indexer.initialized == false {
            log.Fatal("索引器尚未初始化")
        }

        indexer.removeCacheLock.Lock()
        if docId != 0 {
            indexer.tableLock.Lock()
            if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
                indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
                indexer.removeCacheLock.removeCachePointer++
                indexer.tableLock.docsState[docId] = 1
                indexer.numDocuments--
            } else if ok && docState == 2 {
                // 删除一个等待加入的文档
                indexer.tableLock.docsState[docId] = 1
            } else if !ok {
                // 若文档不存在,则无法判断其是否在 addCache 中,需避免这样的操作
            }
            indexer.tableLock.Unlock()
        }

        if indexer.removeCacheLock.removeCachePointer > 0 &&
            (indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||
                forceUpdate) {
            removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]
            indexer.removeCacheLock.removeCachePointer = 0
            indexer.removeCacheLock.Unlock()
            sort.Sort(removeCachedDocuments)
            indexer.RemoveDocuments(&removeCachedDocuments)
            return true
        }
        indexer.removeCacheLock.Unlock()
        return false
    }

    // 向反向索引表中删除 REMOVECACHE 中所有文档
    func (indexer *Indexer) RemoveDocuments(documents *types.DocumentsId) {
        if indexer.initialized == false {
            log.Fatal("索引器尚未初始化")
        }

        indexer.tableLock.Lock()
        defer indexer.tableLock.Unlock()

        // 更新文档关键词总长度,删除文档状态
        for _, docId := range *documents {
            indexer.totalTokenLength -= indexer.docTokenLengths[docId]
            delete(indexer.docTokenLengths, docId)
            delete(indexer.tableLock.docsState, docId)
        }

        for keyword, indices := range indexer.tableLock.table {
            indicesTop, indicesPointer := 0, 0
            documentsPointer := sort.Search(
                len(*documents), func(i int) bool { return (*documents)[i] >= indices.docIds[0] })
            // 双指针扫描,进行批量删除操作
            for documentsPointer < len(*documents) && indicesPointer < indexer.getIndexLength(indices) {
                if indices.docIds[indicesPointer] < (*documents)[documentsPointer] {
                    if indicesTop != indicesPointer {
                        switch indexer.initOptions.IndexType {
                        case types.LocationsIndex:
                            indices.locations[indicesTop] = indices.locations[indicesPointer]
                        case types.FrequenciesIndex:
                            indices.frequencies[indicesTop] = indices.frequencies[indicesPointer]
                        }
                        indices.docIds[indicesTop] = indices.docIds[indicesPointer]
                    }
                    indicesTop++
                    indicesPointer++
                } else if indices.docIds[indicesPointer] == (*documents)[documentsPointer] {
                    indicesPointer++
                    documentsPointer++
                } else {
                    documentsPointer++
                }
            }
            if indicesTop != indicesPointer {
                switch indexer.initOptions.IndexType {
                case types.LocationsIndex:
                    indices.locations = append(
                        indices.locations[:indicesTop], indices.locations[indicesPointer:]...)
                case types.FrequenciesIndex:
                    indices.frequencies = append(
                        indices.frequencies[:indicesTop], indices.frequencies[indicesPointer:]...)
                }
                indices.docIds = append(
                    indices.docIds[:indicesTop], indices.docIds[indicesPointer:]...)
            }
            if len(indices.docIds) == 0 {
                delete(indexer.tableLock.table, keyword)
            }
        }
    }

    // 查找包含全部搜索键(AND操作)的文档
    // 当docIds不为nil时仅从docIds指定的文档中查找
    func (indexer *Indexer) Lookup(
        tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
        if indexer.initialized == false {
            log.Fatal("索引器尚未初始化")
        }

        if indexer.numDocuments == 0 {
            return
        }
        numDocs = 0

        // 合并关键词和标签为搜索键
        keywords := make([]string, len(tokens)+len(labels))
        copy(keywords, tokens)
        copy(keywords[len(tokens):], labels)

        indexer.tableLock.RLock()
        defer indexer.tableLock.RUnlock()
        table := make([]*KeywordIndices, len(keywords))
        for i, keyword := range keywords {
            indices, found := indexer.tableLock.table[keyword]
            if !found {
                // 当反向索引表中无此搜索键时直接返回
                return
            } else {
                // 否则加入反向表中
                table[i] = indices
            }
        }

        // 当没有找到时直接返回
        if len(table) == 0 {
            return
        }

        // 归并查找各个搜索键出现文档的交集
        // 从后向前查保证先输出DocId较大文档
        indexPointers := make([]int, len(table))
        for iTable := 0; iTable < len(table); iTable++ {
            indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
        }
        // 平均文本关键词长度,用于计算BM25
        avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
        for ; indexPointers[0] >= 0; indexPointers[0]-- {
            // 以第一个搜索键出现的文档作为基准,并遍历其他搜索键搜索同一文档
            baseDocId := indexer.getDocId(table[0], indexPointers[0])
            if docIds != nil {
                if _, found := docIds[baseDocId]; !found {
                    continue
                }
            }
            iTable := 1
            found := true
            for ; iTable < len(table); iTable++ {
                // 二分法比简单的顺序归并效率高,也有更高效率的算法,
                // 但顺序归并也许是更好的选择,考虑到将来需要用链表重新实现
                // 以避免反向表添加新文档时的写锁。
                // TODO: 进一步研究不同求交集算法的速度和可扩展性。
                position, foundBaseDocId := indexer.searchIndex(table[iTable],
                    0, indexPointers[iTable], baseDocId)
                if foundBaseDocId {
                    indexPointers[iTable] = position
                } else {
                    if position == 0 {
                        // 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
                        // 继续查找的必要。
                        return
                    } else {
                        // 继续下一indexPointers[0]的查找
                        indexPointers[iTable] = position - 1
                        found = false
                        break
                    }
                }
            }

            if found {
                if docState, ok := indexer.tableLock.docsState[baseDocId]; !ok || docState != 0 {
                    continue
                }
                indexedDoc := types.IndexedDocument{}

                // 当为LocationsIndex时计算关键词紧邻距离
                if indexer.initOptions.IndexType == types.LocationsIndex {
                    // 计算有多少关键词是带有距离信息的
                    numTokensWithLocations := 0
                    for i, t := range table[:len(tokens)] {
                        if len(t.locations[indexPointers[i]]) > 0 {
                            numTokensWithLocations++
                        }
                    }
                    if numTokensWithLocations != len(tokens) {
                        if !countDocsOnly {
                            docs = append(docs, types.IndexedDocument{
                                DocId: baseDocId,
                            })
                        }
                        numDocs++
                        //当某个关键字对应多个文档且有lable关键字存在时,若直接break,将会丢失相当一部分搜索结果
                        continue
                    }

                    // 计算搜索键在文档中的紧邻距离
                    tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
                    indexedDoc.TokenProximity = int32(tokenProximity)
                    indexedDoc.TokenSnippetLocations = tokenLocations

                    // 添加TokenLocations
                    indexedDoc.TokenLocations = make([][]int, len(tokens))
                    for i, t := range table[:len(tokens)] {
                        indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
                    }
                }

                // 当为LocationsIndex或者FrequenciesIndex时计算BM25
                if indexer.initOptions.IndexType == types.LocationsIndex ||
                    indexer.initOptions.IndexType == types.FrequenciesIndex {
                    bm25 := float32(0)
                    d := indexer.docTokenLengths[baseDocId]
                    for i, t := range table[:len(tokens)] {
                        var frequency float32
                        if indexer.initOptions.IndexType == types.LocationsIndex {
                            frequency = float32(len(t.locations[indexPointers[i]]))
                        } else {
                            frequency = t.frequencies[indexPointers[i]]
                        }

                        // 计算BM25
                        if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
                            // 带平滑的idf
                            idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
                            k1 := indexer.initOptions.BM25Parameters.K1
                            b := indexer.initOptions.BM25Parameters.B
                            bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
                        }
                    }
                    indexedDoc.BM25 = float32(bm25)
                }

                indexedDoc.DocId = baseDocId
                if !countDocsOnly {
                    docs = append(docs, indexedDoc)
                }
                numDocs++
            }
        }
        return
    }

    // 二分法查找indices中某文档的索引项
    // 第一个返回参数为找到的位置或需要插入的位置
    // 第二个返回参数标明是否找到
    func (indexer *Indexer) searchIndex(
        indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
        // 特殊情况
        if indexer.getIndexLength(indices) == start {
            return start, false
        }
        if docId < indexer.getDocId(indices, start) {
            return start, false
        } else if docId == indexer.getDocId(indices, start) {
            return start, true
        }
        if docId > indexer.getDocId(indices, end) {
            return end + 1, false
        } else if docId == indexer.getDocId(indices, end) {
            return end, true
        }

        // 二分
        var middle int
        for end-start > 1 {
            middle = (start + end) / 2
            if docId == indexer.getDocId(indices, middle) {
                return middle, true
            } else if docId > indexer.getDocId(indices, middle) {
                start = middle
            } else {
                end = middle
            }
        }
        return end, false
    }

    // 计算搜索键在文本中的紧邻距离
    //
    // 假定第 i 个搜索键首字节出现在文本中的位置为 P_i,长度 L_i
    // 紧邻距离计算公式为
    //
    //     ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
    //
    // 具体由动态规划实现,依次计算前 i  token 在每个出现位置的最优值。
    // 选定的 P_i 通过 tokenLocations 参数传回。
    func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
        minTokenProximity int, tokenLocations []int) {
        minTokenProximity = -1
        tokenLocations = make([]int, len(tokens))

        var (
            currentLocations, nextLocations []int
            currentMinValues, nextMinValues []int
            path                            [][]int
        )

        // 初始化路径数组
        path = make([][]int, len(tokens))
        for i := 1; i < len(path); i++ {
            path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
        }

        // 动态规划
        currentLocations = table[0].locations[indexPointers[0]]
        currentMinValues = make([]int, len(currentLocations))
        for i := 1; i < len(tokens); i++ {
            nextLocations = table[i].locations[indexPointers[i]]
            nextMinValues = make([]int, len(nextLocations))
            for j, _ := range nextMinValues {
                nextMinValues[j] = -1
            }

            var iNext int
            for iCurrent, currentLocation := range currentLocations {
                if currentMinValues[iCurrent] == -1 {
                    continue
                }
                for iNext+1 < len(nextLocations) && nextLocations[iNext+1] < currentLocation {
                    iNext++
                }

                update := func(from int, to int) {
                    if to >= len(nextLocations) {
                        return
                    }
                    value := currentMinValues[from] + utils.AbsInt(nextLocations[to]-currentLocations[from]-len(tokens[i-1]))
                    if nextMinValues[to] == -1 || value < nextMinValues[to] {
                        nextMinValues[to] = value
                        path[i][to] = from
                    }
                }

                // 最优解的状态转移只发生在左右最接近的位置
                update(iCurrent, iNext)
                update(iCurrent, iNext+1)
            }

            currentLocations = nextLocations
            currentMinValues = nextMinValues
        }

        // 找出最优解
        var cursor int
        for i, value := range currentMinValues {
            if value == -1 {
                continue
            }
            if minTokenProximity == -1 || value < minTokenProximity {
                minTokenProximity = value
                cursor = i
            }
        }

        // 从路径倒推出最优解的位置
        for i := len(tokens) - 1; i >= 0; i-- {
            if i != len(tokens)-1 {
                cursor = path[i+1][cursor]
            }
            tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
        }
        return
    }

  • 相关阅读:
    lumen生成key
    github发布版本
    如何使用.gitignore文件删除掉已经提交的文件
    vue中计算属性和侦听属性
    Mac命令行指定特定程序打开文件
    MacOS中创建Sublime Text3快捷方式返回Operation not permitted的原因及解决
    Mac 下 visual studio code 编辑器 设置为中文
    Unix/Linux环境C编程入门教程(14) Mandriva LinuxCCPP开发环境搭建
    Unix/Linux环境C编程入门教程(13) 开发环境搭建VMware软件安装
    Unix/Linux环境C编程入门教程(12) openSUSECCPP以及Linux内核驱动开发环境搭建
  • 原文地址:https://www.cnblogs.com/zhangboyu/p/7461682.html
Copyright © 2011-2022 走看看