zoukankan      html  css  js  c++  java
  • python(十):列表转换成字典

    一、列表转换成字典

            # Load one category name per line from cat.txt (surrounding
            # whitespace stripped), then map every name to its zero-based
            # position in that list.
            self.cat_list = []
            cat_path = os.path.join(self.raw_data, "cat.txt")
            with open(cat_path) as f:
                self.cat_list.extend(raw_name.strip() for raw_name in f)
            self.label_dict = {
                name: index for index, name in enumerate(self.cat_list)
            }

    二、NLP生成字典

    def remove_1a(content):
        """Strip everything except Chinese characters, ASCII letters and digits.

        Runs of characters in U+4E00..U+9FA5, ``a-z``, ``A-Z`` and ``0-9`` are
        kept and concatenated in their original order; punctuation, whitespace
        and any other symbol is dropped.

        Args:
            content: The text to filter.

        Returns:
            The filtered string, or an empty string when nothing matches.
        """
        # The code points need real \u escapes. Without the backslashes the
        # class collapses into the ASCII range '0'-'u', which matches no
        # Chinese character and lets symbols such as '_' or '@' through.
        chinese = r'[\u4e00-\u9fa5a-zA-Z0-9]+'
        return ''.join(re.findall(chinese, content))
    
    
    def read_file(filename):
        """Read labelled samples from a UTF-8 text file.

        Each line is split on a run of two spaces and must produce exactly two
        fields, ``label`` and ``content``. Lines that do not are skipped. The
        content is cleaned with ``remove_1a`` and broken into a list of single
        characters.

        Args:
            filename: Path of the file to read. Undecodable bytes are ignored.

        Returns:
            A ``(contents, labels)`` tuple of parallel lists: ``contents[i]`` is
            the character list of sample ``i`` and ``labels[i]`` is its label.
        """
        contents, labels = [], []
        with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                try:
                    label, content = line.split("  ")
                except ValueError:
                    # Not exactly two fields: skip the malformed line. Only the
                    # unpack is guarded so that real errors (and Ctrl-C) are no
                    # longer silently swallowed.
                    continue
                if content:
                    contents.append(list(remove_1a(content)))
                    labels.append(label)
        return contents, labels
    
    
    def build_vocab(train_dir, vocab_dir, vocab_size=5000):
        """Build a character vocabulary from the training set and save it.

        Characters are counted over every sample returned by ``read_file``. The
        ``vocab_size - 1`` most frequent ones are taken, and of those only the
        characters seen more than twice are kept. ``<PAD>`` is placed first so
        it can be used to pad all texts to the same length.

        Args:
            train_dir: Path of the training file passed to ``read_file``.
            vocab_dir: Path of the vocabulary file to write, one entry per line.
            vocab_size: Upper bound on the vocabulary size, ``<PAD>`` included.

        Returns:
            None. The vocabulary is written to ``vocab_dir``.
        """
        data_train, _ = read_file(train_dir)

        counter = Counter()
        for content in data_train:
            counter.update(content)

        # A comprehension rather than zip(*pairs) unpacking, so an empty result
        # (no character seen more than twice) yields a vocabulary holding only
        # <PAD> instead of raising ValueError.
        words = [
            word
            for word, count in counter.most_common(vocab_size - 1)
            if count > 2
        ]
        # Add a <PAD> entry used to pad all texts to the same length.
        words = ['<PAD>'] + words

        # The context manager guarantees the file is flushed and closed.
        with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
            f.write('\n'.join(words) + '\n')

     

  • 相关阅读:
    「总结」容斥。二.反演原理 3.约数容斥
    「总结」容斥。二.反演原理 2.组合容斥
    「总结」容斥。二.反演原理 1.子集容斥
    「总结」容斥。一.容斥原理
    「考试」num (破800纪念)
    「刷题」 网络
    「考试」 Or
    「考试」weight
    「刷题」GERALD07加强版
    「刷题」Triple
  • 原文地址:https://www.cnblogs.com/zhangxianrong/p/15320911.html
Copyright © 2011-2022 走看看