zoukankan      html  css  js  c++  java
  • python(十):列表转换成字典

    一、列表转换成字典

            # Load the category names (one per line of cat.txt, surrounding
            # whitespace stripped) and map each name to its line index.
            self.cat_list = []
            cat_path = os.path.join(self.raw_data, "cat.txt")
            with open(cat_path) as cat_file:
                self.cat_list.extend(name.strip() for name in cat_file)
            # A name that appears more than once keeps its last index,
            # matching dict(zip(...)) semantics.
            self.label_dict = {
                name: index for index, name in enumerate(self.cat_list)
            }

    二、NLP生成字典

    def remove_1a(content):
        """Return ``content`` reduced to CJK ideographs, ASCII letters and digits.

        Every other character (punctuation, whitespace, symbols, newlines) is
        dropped and the surviving runs are concatenated in their original order.

        NOTE(review): the original comment said letters and digits are removed
        as well, but the pattern has always listed ``a-zA-Z0-9`` as characters
        to keep; the code is treated as authoritative here -- confirm.

        Args:
            content: The text to filter.

        Returns:
            The filtered string; empty if nothing matches.
        """
        # The ``\u`` escapes are essential: without the backslashes the class
        # degenerates into the ASCII range ``0``..``u`` plus a few stray
        # letters, which deletes Chinese text and keeps punctuation like ``_``.
        kept_chars = r'[\u4e00-\u9fa5a-zA-Z0-9]+'
        return ''.join(re.findall(kept_chars, content))
    
    
    def read_file(filename):
        """Read a labelled text file into parallel content and label lists.

        Each line is expected to be ``<label><two spaces><text>``. The text is
        filtered through ``remove_1a`` and split into a list of single
        characters. Lines that do not split into exactly two fields are
        skipped, as are lines whose text field is empty.

        Args:
            filename: Path of the UTF-8 file to read; undecodable bytes are
                ignored.

        Returns:
            A ``(contents, labels)`` tuple where ``contents[i]`` is the list of
            characters kept from line ``i`` and ``labels[i]`` is its label.
        """
        contents, labels = [], []
        with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                # NOTE(review): the separator is two spaces as published; it
                # may have been a tab before the page was rendered -- confirm
                # against the real data files.
                try:
                    label, content = line.split("  ")
                except ValueError:
                    # Wrong number of fields: skip the malformed line rather
                    # than abort the whole read. Only the unpacking error is
                    # caught, so genuine failures are no longer hidden.
                    continue
                if content:
                    contents.append(list(remove_1a(content)))
                    labels.append(label)
        return contents, labels
    
    
    def build_vocab(train_dir, vocab_dir, vocab_size=5000):
        """Build a character vocabulary from the training set and save it.

        Characters are counted across every sample returned by ``read_file``.
        The ``vocab_size - 1`` most frequent ones are considered, and of those
        only characters occurring more than twice are kept. The result is
        written to ``vocab_dir`` one entry per line, preceded by ``<PAD>``.

        Args:
            train_dir: Path of the training data file passed to ``read_file``.
            vocab_dir: Path of the vocabulary file to write (UTF-8).
            vocab_size: Upper bound on the vocabulary size, ``<PAD>`` included.
        """
        data_train, _ = read_file(train_dir)

        counter = Counter()
        for content in data_train:
            counter.update(content)

        # Filtering in a comprehension (instead of unpacking zip(*pairs))
        # keeps this working when no character passes the frequency cut-off;
        # the vocabulary then consists of <PAD> alone.
        words = [
            word
            for word, count in counter.most_common(vocab_size - 1)
            if count > 2
        ]
        # Prepend <PAD> so that all texts can be padded to the same length.
        words = ['<PAD>'] + words
        # Context manager guarantees the file is flushed and closed.
        with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
            f.write('\n'.join(words) + '\n')

     

  • 相关阅读:
    宏的全解
    Mathematica 表达式求值
    nandflash中oob、ecc分析
    ubuntu 12.04下安装openldap,slapd.conf找不到的解决方法
    jetbrains
    看看美国公务员挣多少钱
    css的#和.的区别
    Ubuntu 14.04 LTS下安装Google Chrome浏览器
    Bootstrap 模态框(Modal)插件
    怎么旋转PDF文件的方向并保存成功
  • 原文地址:https://www.cnblogs.com/zhangxianrong/p/15320911.html
Copyright © 2011-2022 走看看