zoukankan      html  css  js  c++  java
  • 中文数据增强 同义句 扩展

    def FuncRecursive(len_curr=0, sen_odd=None, sen_curr=None):
        """Append every alternative of the remaining slots to the partial sentences.

        Recursively turns e.g. ``sen_curr=['1']`` with
        ``sen_odd=[['1', '2'], ['1']]`` into ``['111', '121']``.

        :param len_curr: int, number of slots of ``sen_odd`` still to consume;
            must not exceed ``len(sen_odd)``, otherwise ``sen_odd[0]`` raises
            IndexError once the slots run out
        :param sen_odd: list of list of str, alternatives for each remaining
            slot, eg. [['', '是不是'], ['喜欢', '喜爱', ''], ['米饭']]
        :param sen_curr: list of str, partial sentences built so far
        :return: list of str, every partial sentence extended by one
            alternative per consumed slot; ``sen_curr`` itself when
            ``len_curr`` is 0
        """
        # Use None sentinels instead of mutable ``[]`` defaults: the base case
        # returns ``sen_curr`` directly, so a shared default list would leak
        # caller mutations into every later call.
        if sen_odd is None:
            sen_odd = []
        if sen_curr is None:
            sen_curr = []

        len_curr = len_curr - 1
        if len_curr == -1:
            return sen_curr

        # Outer loop over the slot's alternatives, inner loop over the partial
        # sentences: this fixes the order of the generated sentences.
        syn_sentences = [syn_one + sen_odd_one
                         for sen_odd_one in sen_odd[0]
                         for syn_one in sen_curr]

        return FuncRecursive(len_curr=len_curr,
                             sen_odd=sen_odd[1:],
                             sen_curr=syn_sentences)
    
    
    
    
    
    def gen_syn_sentences(org_data):
        """Expand bracketed rule strings into all sentences they describe.

        Each rule is split on ``"]["`` into slots and each slot on ``"|"``
        into alternatives, with any remaining ``[`` / ``]`` characters removed.
        A rule with a single slot contributes its alternatives directly; a
        rule with several slots is expanded through ``FuncRecursive``.

        :param org_data: list, list of rule strings, eg. "[a|b][c]"
        :return: list, generated sentences for all rules, in rule order
        """
        # Phase 1: parse every rule into a list of slots (lists of alternatives).
        parsed_rules = []
        for rule in org_data:
            slots = [
                [option.replace("]", "").replace("[", "")
                 for option in segment.split("|")]
                for segment in rule.split("][")
            ]
            parsed_rules.append(slots)

        # Phase 2: expand each parsed rule and collect the results.
        expanded = []
        for slots in parsed_rules:
            slot_count = len(slots)
            if slot_count != 1:
                expanded.extend(FuncRecursive(len_curr=slot_count - 1,
                                              sen_odd=slots[1:],
                                              sen_curr=slots[0]))
            else:
                # Single slot: nothing to combine, no recursion needed.
                expanded.extend(slots[0])
        return expanded
    if __name__ == "__main__":
        # Demo: expand three sample rules (two multi-slot rules and one plain
        # string) and print every generated sentence.
        org_data = ["[你][喜欢|喜爱|爱][虾米|啥子|什么]", "[1|11][2|22][3|33][44|444]", "大漠帝国"]
        syn_sentences = gen_syn_sentences(org_data)
        print(syn_sentences)

    原文:

    https://blog.csdn.net/rensihui/article/details/94589739

  • 相关阅读:
    深度学习的优化算法
    基于双向的CNN的细粒度物体识别论文翻译
    LSTM公式推导
    结巴分词python脚本
    eval() python 中的
    C++编译原理
    extern,以及在linux头文件中的应用
    iostream源码
    LINUX命令
    apt-get
  • 原文地址:https://www.cnblogs.com/cupleo/p/11896143.html
Copyright © 2011-2022 走看看