from torchtext import data
from torchtext.vocab import GloVe

# NOTE(review): `data.Field` is the legacy torchtext API; newer releases moved
# it to `torchtext.legacy.data` and later removed it -- confirm the installed
# torchtext version before running this snippet.
TEXT = data.Field(sequential=True)

# Two ways of attaching the pretrained GloVe 6B / 300-d vectors while building
# the vocabulary from `train` (a dataset defined elsewhere): pass a `GloVe`
# object, or pass the string alias. Either call on its own is sufficient.
glove_vectors = GloVe(name='6B', dim=300)
TEXT.build_vocab(train, vectors=glove_vectors)
TEXT.build_vocab(train, vectors="glove.6B.300d")
默认情况下,预训练词向量文件和缓存文件都存放在当前目录下的 .vector_cache 目录中。虽然通过 name 参数指定了预训练词向量文件所在的位置,但由于没有通过 cache 参数另行指定缓存目录,当前目录下仍然需要存在 .vector_cache 目录。
import os

# `Vectors` keeps its cache under ./.vector_cache unless told otherwise, so the
# directory has to exist even though the vector file itself is given by `name`.
# `exist_ok=True` creates it only when missing, without the check-then-create
# race of a separate `os.path.exists` test.
os.makedirs('.vector_cache', exist_ok=True)

# Load the custom pretrained vectors and attach them while building a
# vocabulary capped at 10000 entries from `train_data` (defined elsewhere).
vectors = Vectors(name='sgns.wiki.word')
TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)
# Initialise the model's source embedding table with the pretrained vectors
# that `build_vocab` attached to the vocabulary.
pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)  # e.g. torch.Size([1727, 300])

# Copy in place so the embedding layer keeps its existing parameter tensor.
embedding_weight = model.src_embed[0].lut.weight
embedding_weight.data.copy_(pretrained_embedding)
print('Embedding初始化')
import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors


def read_data(data_path, text_field, label_field, split=3, overlap=0):
    """Read a labelled CSV and cut every document into ``split`` token chunks.

    Column 0 of each row is parsed as the integer label; all remaining
    columns are joined with spaces and tokenised with ``word_tokenize``.
    The token list is then sliced into ``split`` consecutive chunks of equal
    length, neighbouring chunks sharing ``overlap`` tokens. Because the chunk
    length is floored, up to ``split - 1`` trailing tokens are dropped.

    Args:
        data_path: Path of the CSV file to read.
        text_field: Field shared by all ``text1`` .. ``text<split>`` columns.
        label_field: Field used for the ``label`` column.
        split: Number of chunks per document.
        overlap: Number of tokens shared by two neighbouring chunks.

    Returns:
        ``(examples, fields)`` where ``examples`` is a list of
        ``tdata.Example`` and ``fields`` is the matching list of
        ``(name, field)`` pairs, ready to be passed to ``tdata.Dataset``.
    """
    fields = [('text' + str(i), text_field) for i in range(1, split + 1)]
    fields.append(('label', label_field))

    examples = []

    # newline='' is what the csv module requires so that line breaks inside
    # quoted cells are parsed correctly.
    with open(data_path, newline='') as csv_file:
        for line in csv.reader(csv_file, quotechar='"'):
            if not line:
                # csv.reader yields [] for blank lines; they carry no label.
                continue

            word_tokens = word_tokenize(" ".join(line[1:]))

            # Length of one chunk, and the distance between chunk starts.
            chunk_len = (len(word_tokens) + overlap * (split - 1)) // split
            stride = chunk_len - overlap

            row = [
                word_tokens[i * stride:(i + 1) * stride + overlap]
                for i in range(split)
            ]
            row.append(int(line[0]))
            examples.append(tdata.Example.fromlist(row, fields))
    return examples, fields


def data_doc_iter(train_path, test_path, text_field, label_field, batch_size, embedding_dim=50):
    """Build train/test iterators over chunked documents plus the text vocab.

    Both vocabularies are built from the training set only; the text
    vocabulary gets GloVe 6B vectors of size ``embedding_dim`` attached.

    Args:
        train_path: CSV file with the training documents.
        test_path: CSV file with the test documents.
        text_field: Field used for every text chunk.
        label_field: Field used for the label.
        batch_size: Batch size of both iterators.
        embedding_dim: Dimension of the GloVe 6B vectors to load.

    Returns:
        ``(train_iter, test_iter, vocabulary)`` where ``vocabulary`` is
        ``text_field.vocab``.
    """
    train_examples, train_fields = read_data(train_path, text_field, label_field)
    test_examples, test_fields = read_data(test_path, text_field, label_field)

    train_dataset = tdata.Dataset(train_examples, train_fields)
    test_dataset = tdata.Dataset(test_examples, test_fields)

    # Build the vocabularies from the training data.
    text_field.build_vocab(train_dataset, vectors=GloVe(name='6B', dim=embedding_dim))
    label_field.build_vocab(train_dataset)

    # Keep the file order: no shuffling, no sorting, a single pass per epoch.
    iterator_options = dict(
        batch_size=batch_size,
        shuffle=False,
        sort=False,
        sort_within_batch=False,
        repeat=False,
    )
    train_iter = tdata.Iterator(train_dataset, **iterator_options)
    test_iter = tdata.Iterator(test_dataset, **iterator_options)
    return train_iter, test_iter, text_field.vocab
import torch

# One text field shared by all three chunks; `fix_length=512` fixes the
# per-chunk sequence length and `batch_first=True` puts the batch dimension
# first in the produced tensors.
text_field = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=512, batch_first=True)
label_field = tdata.LabelField(dtype=torch.int)
train_iter, test_iter, vocabulary = data_doc_iter(
    "./data/IMDB_new/train_shuffle.csv",
    "./data/IMDB_new/test_new.csv",
    text_field,
    label_field,
    batch_size=8,
)

# Each batch exposes the three chunks as text1/text2/text3 plus the label.
for batch in train_iter:
    print(batch.text1.shape)
    print(batch.text2.shape)
    print(batch.text3.shape)
    print(batch.label)
import csv

from nltk.tokenize import word_tokenize
from torchtext import data as tdata
from torchtext.vocab import GloVe
from torchtext.vocab import Vectors


def read_split_data(data_path, text_fields, label_fields, split=3, overlap=0):
    """Read a labelled CSV and return the first three chunks as three datasets.

    Column 0 of each row is parsed as the integer label; the remaining
    columns are joined with spaces and tokenised with ``word_tokenize``.
    The tokens are sliced into ``split`` equal-length chunks (neighbours
    sharing ``overlap`` tokens), and chunk ``k`` (k = 0, 1, 2) becomes an
    example of part ``k`` built with ``text_fields[k]`` / ``label_fields[k]``.
    The same label is attached to all three parts. Only the first three
    chunks are used, so ``split`` must be at least 3.

    Args:
        data_path: Path of the CSV file to read.
        text_fields: Sequence with (at least) three text fields.
        label_fields: Sequence with (at least) three label fields.
        split: Number of chunks each document is cut into.
        overlap: Number of tokens shared by two neighbouring chunks.

    Returns:
        ``(examp1, examp2, examp3, field1, field2, field3)``: the example
        list and the ``[('text', ...), ('label', ...)]`` field list of each
        of the three parts.
    """
    field1 = [('text', text_fields[0]), ('label', label_fields[0])]
    field2 = [('text', text_fields[1]), ('label', label_fields[1])]
    field3 = [('text', text_fields[2]), ('label', label_fields[2])]

    # Three accumulators, one per part (the original unpacked four lists into
    # three names, which raised ValueError before any data was read).
    examp1, examp2, examp3 = [], [], []
    parts = ((examp1, field1), (examp2, field2), (examp3, field3))

    # newline='' is what the csv module requires so that line breaks inside
    # quoted cells are parsed correctly.
    with open(data_path, newline='') as csv_file:
        for line in csv.reader(csv_file, quotechar='"'):
            if not line:
                # csv.reader yields [] for blank lines; they carry no label.
                continue

            word_tokens = word_tokenize(" ".join(line[1:]))

            # Length of one chunk, and the distance between chunk starts.
            chunk_len = (len(word_tokens) + overlap * (split - 1)) // split
            stride = chunk_len - overlap
            chunks = [
                word_tokens[i * stride:(i + 1) * stride + overlap]
                for i in range(split)
            ]

            label = int(line[0])
            for k, (part_examples, part_fields) in enumerate(parts):
                part_examples.append(
                    tdata.Example.fromlist([chunks[k], label], part_fields)
                )

    return examp1, examp2, examp3, field1, field2, field3


def data_docsplit_iter(train_path, test_path, text_fields, label_fields, batch_size):
    """Build per-part train/test iterators and vocabularies for split documents.

    Every part gets its own dataset, its own vocabulary (built from that
    part's training data) and its own pretrained vector file
    (``cibiao1.txt`` .. ``cibiao3.txt``).

    Args:
        train_path: CSV file with the training documents.
        test_path: CSV file with the test documents.
        text_fields: Sequence with three text fields, one per part.
        label_fields: Sequence with three label fields, one per part.
        batch_size: Batch size of all six iterators.

    Returns:
        ``(train_iter1, train_iter2, train_iter3, test_iter1, test_iter2,
        test_iter3, vocabulary1, vocabulary2, vocabulary3)``.
    """
    train1_examp, train2_examp, train3_examp, field1, field2, field3 = read_split_data(
        train_path, text_fields, label_fields)
    test1_examp, test2_examp, test3_examp, tfield1, tfield2, tfield3 = read_split_data(
        test_path, text_fields, label_fields)

    def make_iter(dataset):
        # Keep the file order so the three parts stay aligned batch by batch.
        return tdata.Iterator(dataset, batch_size=batch_size, shuffle=False,
                              sort=False, sort_within_batch=False, repeat=False)

    # Build the vocabularies from the training data of each part.
    train_sets = [
        tdata.Dataset(train1_examp, field1),
        tdata.Dataset(train2_examp, field2),
        tdata.Dataset(train3_examp, field3),
    ]

    part_vectors = [
        Vectors(name='cibiao1.txt'),
        Vectors(name='cibiao2.txt'),
        Vectors(name='cibiao3.txt'),
    ]

    for text_field, train_set, vectors in zip(text_fields, train_sets, part_vectors):
        text_field.build_vocab(train_set, vectors=vectors)

    for label_field, train_set in zip(label_fields, train_sets):
        label_field.build_vocab(train_set)

    test_sets = [
        tdata.Dataset(test1_examp, tfield1),
        tdata.Dataset(test2_examp, tfield2),
        tdata.Dataset(test3_examp, tfield3),
    ]

    train_iter1, train_iter2, train_iter3 = [make_iter(ds) for ds in train_sets]
    test_iter1, test_iter2, test_iter3 = [make_iter(ds) for ds in test_sets]

    vocabulary1, vocabulary2, vocabulary3 = text_fields[0].vocab, text_fields[1].vocab, text_fields[2].vocab
    return train_iter1, train_iter2, train_iter3, test_iter1, test_iter2, test_iter3, vocabulary1, vocabulary2, vocabulary3
import torch

SENTENCE_LIMIT_SIZE = 512
DATAPATH = './data/IMDB_new/'

# One independent text/label field pair per document part, so that each part
# ends up with its own vocabulary.
text_fields = [
    tdata.Field(tokenize=word_tokenize, lower=True, fix_length=SENTENCE_LIMIT_SIZE, batch_first=True)
    for _ in range(3)
]
label_fields = [tdata.LabelField(dtype=torch.int) for _ in range(3)]

(train_iter1, train_iter2, train_iter3,
 test_iter1, test_iter2, test_iter3,
 vocabulary1, vocabulary2, vocabulary3) = data_docsplit_iter(
    DATAPATH + "train_shuffle.csv",
    DATAPATH + "test_new.csv",
    text_fields,
    label_fields,
    batch_size=4,
)

# Print the shape of every part's vector table (the first print used to dump
# the whole tensor although its label announces the shape).
print('vocabulary1.vectors.shape = ', vocabulary1.vectors.shape)
print('vocabulary2.vectors.shape = ', vocabulary2.vectors.shape)
print('vocabulary3.vectors.shape = ', vocabulary3.vectors.shape)

# Walk the three training iterators in lockstep and show only the first batch.
for batch in zip(train_iter1, train_iter2, train_iter3):
    print(batch[0].text)
    print(batch[0].label)
    print(batch[1].text)
    print(batch[1].label)
    break