一、列表转换成字典
# Load the category names (one per line of cat.txt, surrounding whitespace
# stripped) and map each name to its position in the list.
self.cat_list = []
with open(os.path.join(self.raw_data, "cat.txt")) as cat_file:
    self.cat_list.extend(raw_line.strip() for raw_line in cat_file)
# A name that appears more than once keeps the index of its last occurrence.
self.label_dict = {name: index for index, name in enumerate(self.cat_list)}
二、NLP生成字典
def remove_1a(content):
    """Keep only Chinese characters, ASCII letters and digits of *content*.

    Characters in the range U+4E00-U+9FA5, ``a-z``, ``A-Z`` and ``0-9`` are
    kept; everything else (punctuation, whitespace, symbols) is dropped and
    the surviving runs are concatenated in their original order.

    Args:
        content: Text to clean.

    Returns:
        The cleaned string; empty when nothing in *content* matches.
    """
    # The \u escapes must be spelled out: without the backslashes the class
    # degenerates into the ASCII range "0-u" plus a few literal characters,
    # which drops every Chinese character and lets punctuation such as "?"
    # through.
    keep_pattern = r'[\u4e00-\u9fa5a-zA-Z0-9]+'
    return ''.join(re.findall(keep_pattern, content))
def read_file(filename):
    """Read a labelled text file into per-character samples and labels.

    Each line is expected to hold a label and a text field separated by a
    single space. The text is passed through ``remove_1a`` and then split
    into a list of single characters. Lines that do not split into exactly
    two fields, or whose text field is empty, are skipped.

    Args:
        filename: Path of the file to read. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A ``(contents, labels)`` tuple of parallel lists: ``contents[i]`` is
        the list of characters of sample ``i`` and ``labels[i]`` is its label.
    """
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Drop the line terminator first; otherwise the text field always
            # contains at least "\n" and the emptiness check below never fires.
            fields = line.rstrip('\n').split(" ")
            # NOTE(review): the separator is a single space, so any line whose
            # text itself contains a space is skipped. Confirm the data files
            # are not actually tab-separated.
            if len(fields) != 2:
                continue
            label, content = fields
            if not content:
                continue
            contents.append(list(remove_1a(content)))
            labels.append(label)
    return contents, labels
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training set and save it.

    The characters of every training sample are counted. The
    ``vocab_size - 1`` most frequent ones are taken, and of those only the
    characters seen more than twice are kept. The result is written to
    *vocab_dir* as one token per line, with ``<PAD>`` on the first line.

    Args:
        train_dir: Path of the training file, read with ``read_file``.
        vocab_dir: Path of the vocabulary file to write (UTF-8).
        vocab_size: Upper bound on the vocabulary size, ``<PAD>`` included.
    """
    data_train, _ = read_file(train_dir)

    counter = Counter()
    for content in data_train:
        counter.update(content)

    # One slot is reserved for <PAD>, hence vocab_size - 1. Rare characters
    # (seen at most twice) are dropped. Building the list directly also copes
    # with the case where nothing survives the filter, in which the previous
    # zip(*pairs) unpacking raised ValueError.
    words = [
        word
        for word, count in counter.most_common(vocab_size - 1)
        if count > 2
    ]

    # Add a <PAD> token used to pad all texts to the same length.
    words = ['<PAD>'] + words

    # Use a context manager so the file is flushed and closed deterministically.
    with open(vocab_dir, mode='w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(words) + '\n')