使用torchtext的一般步骤(原文:https://www.cnblogs.com/cxq1126/p/13466998.html#_label9)
1.使用torchtext默认支持的预训练词向量
默认情况下,会自动下载对应的预训练词向量文件到当前文件夹下的.vector_cache目录下,.vector_cache为默认的词向量文件和缓存文件的目录。
from torchtext import data
from torchtext.vocab import GloVe

TEXT = data.Field(sequential=True)

# `train` is the training dataset built elsewhere.
# Option 1: pass a pretrained-vector object explicitly.
glove_vectors = GloVe(name='6B', dim=300)
TEXT.build_vocab(train, vectors=glove_vectors)
# Option 2: pass the string alias of the same pretrained vectors.
# Either call alone is enough; the second one simply rebuilds the vocab.
TEXT.build_vocab(train, vectors="glove.6B.300d")
2.使用外部预训练好的词向量
从网站中(https://github.com/Embedding/Chinese-Word-Vectors)下载中文词向量sgns.wiki.word
通过name参数可以指定预训练词向量文件的路径(文件名);
默认情况下预训练词向量文件和缓存文件的目录位置都为当前目录下的 .vector_cache 目录。虽然通过name参数指定了预训练词向量文件的位置,但是因为缓存文件的目录没有特殊指定(可以通过cache参数指定),此时在当前目录下仍然需要存在 .vector_cache 目录。
import os

from torchtext.vocab import Vectors

# The cache directory is not overridden here, so the default
# ./.vector_cache directory must exist before the vectors are loaded.
# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs('.vector_cache', exist_ok=True)

# `TEXT` and `train_data` are the field and training dataset built elsewhere.
vectors = Vectors(name='sgns.wiki.word')
TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)
Embedding初始化还是一样
# Vectors aligned with the vocabulary that TEXT.build_vocab produced.
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)  # e.g. torch.Size([1727, 300])

# Overwrite the model's embedding table in place with the pretrained vectors.
embedding_weight = model.src_embed[0].lut.weight
embedding_weight.data.copy_(pretrained_embedding)
print('Embedding初始化')
参考https://blog.csdn.net/leo_95/article/details/87708267
3.篇章级文本分类,将每一篇文档按长度分三段保存,共用一份词表
同一个 text_field 可以同时绑定到多个属性(text1、text2、text3)上,这样三段文本共用同一份词表。
import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors


def _split_tokens(word_tokens, split, overlap):
    """Cut a token list into ``split`` consecutive, equally sized segments.

    Neighbouring segments share ``overlap`` tokens. Tokens left over after
    the integer division are not assigned to any segment.
    """
    # Length of one segment, overlap included.
    len_true = (len(word_tokens) + overlap * (split - 1)) // split
    # Distance between the starts of two neighbouring segments.
    len_rel = len_true - overlap
    return [
        word_tokens[i * len_rel:(i + 1) * len_rel + overlap]
        for i in range(split)
    ]


def read_data(data_path, text_field, label_field, split=3, overlap=0):
    """Read a CSV of labelled documents and split every document into segments.

    Each row is expected to hold an integer label in the first column; all
    remaining columns are joined with spaces to form the document text.

    Args:
        data_path: Path of the CSV file.
        text_field: Field shared by all segments (``text1`` .. ``text<split>``),
            so every segment uses the same vocabulary.
        label_field: Field used for the ``label`` attribute.
        split: Number of segments per document.
        overlap: Number of tokens shared by neighbouring segments.

    Returns:
        ``(examples, fields)``: the list of ``Example`` objects and the
        ``(name, field)`` list describing their attributes.
    """
    fields = [('text' + str(i), text_field) for i in range(1, split + 1)]
    fields.append(('label', label_field))

    examples = []
    # newline='' is what the csv module expects for files it reads.
    # Assumes the CSV is UTF-8 encoded; adjust the encoding if it is not.
    with open(data_path, newline='', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for line in reader:
            if not line:
                # Blank rows carry neither label nor text.
                continue
            word_tokens = word_tokenize(" ".join(line[1:]))
            document_encode = _split_tokens(word_tokens, split, overlap)
            document_encode.append(int(line[0]))
            examples.append(tdata.Example.fromlist(document_encode, fields))
    return examples, fields


def data_doc_iter(train_path, test_path, text_field, label_field, batch_size,
                  embedding_dim=50, split=3, overlap=0):
    """Build train/test iterators over segmented documents with one shared vocab.

    Args:
        train_path: CSV file with the training documents.
        test_path: CSV file with the test documents.
        text_field: Field shared by all text segments.
        label_field: Field for the labels.
        batch_size: Number of documents per batch.
        embedding_dim: Dimension of the GloVe 6B vectors to load (e.g. 50 or 300).
        split: Number of segments per document.
        overlap: Number of tokens shared by neighbouring segments.

    Returns:
        ``(train_iter, test_iter, vocabulary)`` where ``vocabulary`` is the
        vocabulary built on the training set for ``text_field``.
    """
    train_examples, train_fields = read_data(
        train_path, text_field, label_field, split=split, overlap=overlap)
    test_examples, test_fields = read_data(
        test_path, text_field, label_field, split=split, overlap=overlap)

    train_dataset = tdata.Dataset(train_examples, train_fields)
    test_dataset = tdata.Dataset(test_examples, test_fields)

    # Vocabularies are built from the training set only.
    text_field.build_vocab(train_dataset, vectors=GloVe(name='6B', dim=embedding_dim))
    label_field.build_vocab(train_dataset)

    # Keep the file order: no shuffling, no sorting, a single pass.
    iterator_options = dict(batch_size=batch_size, shuffle=False, sort=False,
                            sort_within_batch=False, repeat=False)
    train_iter = tdata.Iterator(train_dataset, **iterator_options)
    test_iter = tdata.Iterator(test_dataset, **iterator_options)
    return train_iter, test_iter, text_field.vocab
调用如下:
import torch

# word_tokenize is passed directly; wrapping it in a lambda adds nothing.
text_field = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=512, batch_first=True)
label_field = tdata.LabelField(dtype=torch.int)
train_iter, test_iter, vocabulary = data_doc_iter(
    "./data/IMDB_new/train_shuffle.csv", "./data/IMDB_new/test_new.csv",
    text_field, label_field, batch_size=8)

# Every batch exposes one attribute per document segment plus the label.
for batch in train_iter:
    print(batch.text1.shape)
    print(batch.text2.shape)
    print(batch.text3.shape)
    print(batch.label)
4.篇章级文本分类,将每一篇文档按长度分三段保存,每一份文档的词表不同
即所有文档的第一份使用第一份词表,所有文档的第二份使用第二份词表,所有文档的第三份使用第三份词表。
每一份词表是从.vector_cache中的glove.6B.50d.txt中的词随机抽取一半当作新的词向量得到的,分别保存为cibiao1.txt,cibiao2.txt和cibiao3.txt,以此构建随机特征子空间。
import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors


def _split_tokens(word_tokens, split, overlap):
    """Cut a token list into ``split`` consecutive, equally sized segments.

    Neighbouring segments share ``overlap`` tokens. Tokens left over after
    the integer division are not assigned to any segment.
    """
    # Length of one segment, overlap included.
    len_true = (len(word_tokens) + overlap * (split - 1)) // split
    # Distance between the starts of two neighbouring segments.
    len_rel = len_true - overlap
    return [
        word_tokens[i * len_rel:(i + 1) * len_rel + overlap]
        for i in range(split)
    ]


def _make_iterator(dataset, batch_size):
    """Create a single-pass iterator that keeps the dataset order."""
    # No shuffling or sorting, so the iterators of the different segments
    # yield the same documents in the same order.
    return tdata.Iterator(dataset, batch_size=batch_size, shuffle=False, sort=False,
                          sort_within_batch=False, repeat=False)


def read_split_data(data_path, text_fields, label_fields, split=3, overlap=0):
    """Read a CSV of labelled documents into one example list per segment.

    Each row is expected to hold an integer label in the first column; all
    remaining columns are joined with spaces to form the document text.
    Segment ``i`` of every document is stored with ``text_fields[i]`` and
    ``label_fields[i]``, so each segment position gets its own vocabulary.

    Args:
        data_path: Path of the CSV file.
        text_fields: One text field per segment (at least ``split`` entries).
        label_fields: One label field per segment (at least ``split`` entries).
        split: Number of segments per document.
        overlap: Number of tokens shared by neighbouring segments.

    Returns:
        A flat tuple with the ``split`` example lists followed by the
        ``split`` field lists; for ``split=3`` this is
        ``(examp1, examp2, examp3, field1, field2, field3)``.
    """
    fields = [
        [('text', text_fields[i]), ('label', label_fields[i])]
        for i in range(split)
    ]
    examples = [[] for _ in range(split)]

    # newline='' is what the csv module expects for files it reads.
    # Assumes the CSV is UTF-8 encoded; adjust the encoding if it is not.
    with open(data_path, newline='', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for line in reader:
            if not line:
                # Blank rows carry neither label nor text.
                continue
            label = int(line[0])
            word_tokens = word_tokenize(" ".join(line[1:]))
            segments = _split_tokens(word_tokens, split, overlap)
            for i, segment in enumerate(segments):
                examples[i].append(tdata.Example.fromlist([segment, label], fields[i]))

    return (*examples, *fields)


def data_docsplit_iter(train_path, test_path, text_fields, label_fields, batch_size,
                       vector_files=('cibiao1.txt', 'cibiao2.txt', 'cibiao3.txt')):
    """Build per-segment train/test iterators, each with its own vocabulary.

    Args:
        train_path: CSV file with the training documents.
        test_path: CSV file with the test documents.
        text_fields: One text field per segment.
        label_fields: One label field per segment.
        batch_size: Number of documents per batch.
        vector_files: Word-vector file for each segment; its length sets the
            number of segments.

    Returns:
        A flat tuple: the train iterators, then the test iterators, then the
        text vocabularies, each group ordered by segment. With the default
        three vector files this is ``(train_iter1, train_iter2, train_iter3,
        test_iter1, test_iter2, test_iter3, vocabulary1, vocabulary2,
        vocabulary3)``.
    """
    split = len(vector_files)
    train_parts = read_split_data(train_path, text_fields, label_fields, split=split)
    test_parts = read_split_data(test_path, text_fields, label_fields, split=split)

    train_datasets = [
        tdata.Dataset(examples, fields)
        for examples, fields in zip(train_parts[:split], train_parts[split:])
    ]
    test_datasets = [
        tdata.Dataset(examples, fields)
        for examples, fields in zip(test_parts[:split], test_parts[split:])
    ]

    # Vocabularies are built from the training data only, one per segment,
    # each backed by its own word-vector file.
    for i, (dataset, vector_file) in enumerate(zip(train_datasets, vector_files)):
        text_fields[i].build_vocab(dataset, vectors=Vectors(name=vector_file))
        label_fields[i].build_vocab(dataset)

    train_iters = [_make_iterator(dataset, batch_size) for dataset in train_datasets]
    test_iters = [_make_iterator(dataset, batch_size) for dataset in test_datasets]
    vocabularies = [text_fields[i].vocab for i in range(split)]

    return (*train_iters, *test_iters, *vocabularies)
调用如下:
import torch

SENTENCE_LIMIT_SIZE = 512
DATAPATH = './data/IMDB_new/'

# One independent text/label field pair per document segment, so that every
# segment position builds its own vocabulary.
text_fields, label_fields = [], []
for _ in range(3):
    text_fields.append(tdata.Field(tokenize=word_tokenize, lower=True,
                                   fix_length=SENTENCE_LIMIT_SIZE, batch_first=True))
    label_fields.append(tdata.LabelField(dtype=torch.int))

(train_iter1, train_iter2, train_iter3,
 test_iter1, test_iter2, test_iter3,
 vocabulary1, vocabulary2, vocabulary3) = data_docsplit_iter(
    DATAPATH + "train_shuffle.csv", DATAPATH + "test_new.csv",
    text_fields, label_fields, batch_size=4)

print('vocabulary1.vectors.shape = ', vocabulary1.vectors.shape)
print('vocabulary2.vectors.shape = ', vocabulary2.vectors.shape)
print('vocabulary3.vectors.shape = ', vocabulary3.vectors.shape)

# Walk the three segment iterators in lockstep and inspect only the first batch.
for batch in zip(train_iter1, train_iter2, train_iter3):
    print(batch[0].text)
    print(batch[0].label)
    print(batch[1].text)
    print(batch[1].label)
    break