zoukankan      html  css  js  c++  java
  • Pytorch-torchtext的使用

    使用torchtext的一般步骤见:https://www.cnblogs.com/cxq1126/p/13466998.html#_label9

    1.使用torchtext默认支持的预训练词向量

    默认情况下,会自动下载对应的预训练词向量文件到当前文件夹下的.vector_cache目录下,.vector_cache为默认的词向量文件和缓存文件的目录。

    1 from torchtext.vocab import GloVe
    2 from torchtext import data
    3 TEXT = data.Field(sequential=True)
    4 
    5 TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))  # option 1: pass a GloVe vectors object (`train` is a dataset defined elsewhere)
    6 TEXT.build_vocab(train, vectors="glove.6B.300d")  # option 2: select the vectors by alias string; presumably equivalent to option 1 - use one or the other

    2.使用外部预训练好的词向量

    从网站中(https://github.com/Embedding/Chinese-Word-Vectors)下载中文词向量sgns.wiki.word

    通过name参数指定预训练词向量文件的文件名(或路径),缓存文件所在的目录则由cache参数指定;
    默认情况下预训练词向量文件和缓存文件的目录都是当前目录下的 .vector_cache 目录。即使通过name参数直接给出了预训练词向量文件的位置,只要没有另外指定cache参数,缓存文件仍然会写入 .vector_cache 目录,因此当前目录下仍然需要存在 .vector_cache 目录。

    import os

    from torchtext.vocab import Vectors

    # Vectors() is given no `cache` argument here, so the default cache
    # directory ./.vector_cache has to exist before it is called.
    # makedirs(..., exist_ok=True) creates it only when missing, in one call,
    # without a separate exists() check.
    os.makedirs('.vector_cache', exist_ok=True)
    vectors = Vectors(name='sgns.wiki.word')
    # Keep at most 10000 tokens and attach the external vectors to the vocab.
    TEXT.build_vocab(train_data, max_size=10000, vectors=vectors)

    Embedding初始化还是一样

    1 pretrained_embedding = TEXT.vocab.vectors  # vectors attached to the vocab by build_vocab(..., vectors=...)
    2 print('pretrained_embedding:', pretrained_embedding.shape)  # e.g. torch.Size([1727, 300]) in the author's run
    3 model.src_embed[0].lut.weight.data.copy_(pretrained_embedding)  # in-place copy into the model's embedding weight; assumes both shapes match - TODO confirm
    4 print('Embedding初始化')

    参考:https://blog.csdn.net/leo_95/article/details/87708267

    3.篇章级文本分类,将每一篇文档按长度分三段保存,共用一份词表

    同一个text_field可以同时绑定到多个字段名(text1、text2、text3),这样三段文本共用一份词表。

     1 from nltk.tokenize import word_tokenize
     2 from torchtext import data as tdata
     3 from torchtext.vocab import GloVe
     4 from torchtext.vocab import Vectors
     5 
     def read_data(data_path, text_field, label_field, split=3, overlap=0):
         """Read a labelled CSV file and build one multi-segment example per row.

         Column 0 of every row is the integer label. All remaining columns are
         joined with spaces into a single document, tokenized with
         `word_tokenize`, and cut into `split` chunks that are stored under the
         field names text1 ... text<split>. Every text field name is bound to
         the same `text_field` object.

         Args:
             data_path: path of the CSV file to read.
             text_field: field object shared by all text segments.
             label_field: field object used for the label column.
             split: number of segments each document is cut into.
             overlap: number of tokens shared by two neighbouring segments.

         Returns:
             (examples, fields): the list of `tdata.Example` objects and the
             list of (name, field) pairs they were built with.
         """
         import csv  # local import: the listing's import header does not bring in csv

         fields = [('text' + str(i), text_field) for i in range(1, split + 1)]
         fields.append(('label', label_field))

         examples = []
         # newline='' is what the csv module expects for files handed to csv.reader.
         with open(data_path, newline='') as csv_file:
             for row in csv.reader(csv_file, quotechar='"'):
                 if not row:
                     # csv.reader yields [] for blank lines; there is no label to read.
                     continue

                 # Tokenize once per row, after all text columns are joined.
                 word_tokens = word_tokenize(' '.join(row[1:]))

                 # Length of one segment, rounded down, so up to split - 1
                 # trailing tokens of the document are left out.
                 chunk_len = (len(word_tokens) + overlap * (split - 1)) // split
                 stride = chunk_len - overlap
                 document_encode = [
                     word_tokens[i * stride:(i + 1) * stride + overlap]
                     for i in range(split)
                 ]

                 document_encode.append(int(row[0]))
                 examples.append(tdata.Example.fromlist(document_encode, fields))
         return examples, fields
    34 
    def data_doc_iter(train_path, test_path, text_field, label_field, batch_size, embedding_dim=50):
        """Build train/test iterators over three-segment documents.

        Both CSV files are parsed with `read_data`. The vocabularies of
        `text_field` and `label_field` are built from the training set only,
        with GloVe '6B' vectors of size `embedding_dim` attached to the text
        vocabulary.

        Returns:
            (train_iter, test_iter, vocabulary), where `vocabulary` is
            `text_field.vocab`.
        """
        datasets = []
        for path in (train_path, test_path):
            examples, fields = read_data(path, text_field, label_field)
            datasets.append(tdata.Dataset(examples, fields))
        train_dataset, test_dataset = datasets

        # Vocabularies come from the training data only.
        glove = GloVe(name='6B', dim=embedding_dim)
        text_field.build_vocab(train_dataset, vectors=glove)
        label_field.build_vocab(train_dataset)

        # Same iterator settings for both datasets: fixed order, single pass.
        iterator_options = dict(
            batch_size=batch_size,
            shuffle=False,
            sort=False,
            sort_within_batch=False,
            repeat=False,
        )
        train_iter = tdata.Iterator(train_dataset, **iterator_options)
        test_iter = tdata.Iterator(test_dataset, **iterator_options)

        return train_iter, test_iter, text_field.vocab

    调用如下:

     import torch  # torch.int is used below but torch is not in the listing's import header

     # One text field shared by text1/text2/text3; word_tokenize is passed
     # directly instead of being wrapped in a lambda.
     text_field = tdata.Field(tokenize=word_tokenize, lower=True, fix_length=512, batch_first=True)
     label_field = tdata.LabelField(dtype=torch.int)
     train_iter, test_iter, vocabulary = data_doc_iter(
         "./data/IMDB_new/train_shuffle.csv",
         "./data/IMDB_new/test_new.csv",
         text_field,
         label_field,
         batch_size=8,
     )

     # Each batch exposes the three document segments as separate attributes.
     for batch in train_iter:
         print(batch.text1.shape)
         print(batch.text2.shape)
         print(batch.text3.shape)
         print(batch.label)

    4.篇章级文本分类,将每一篇文档按长度分三段保存,每一份文档的词表不同

    即所有文档的第一份使用第一份词表,所有文档的第二份使用第二份词表,所有文档的第三份使用第三份词表。

    每一份词表都是从 .vector_cache 目录下的 glove.6B.50d.txt 中随机抽取一半的词及其词向量得到的,分别保存为 cibiao1.txt、cibiao2.txt 和 cibiao3.txt,以此构建随机特征子空间。

     1 from nltk.tokenize import word_tokenize
     2 from torchtext import data as tdata
     3 from torchtext.vocab import GloVe
     4 from torchtext.vocab import Vectors
     5 
     def read_split_data(data_path, text_fields, label_fields, split=3, overlap=0):
         """Read a labelled CSV file and build three parallel example lists.

         Column 0 of every row is the integer label. All remaining columns are
         joined with spaces into a single document, tokenized with
         `word_tokenize`, and cut into `split` chunks. Chunk k (k = 0, 1, 2)
         becomes an example with fields ('text', 'label') bound to
         `text_fields[k]` and `label_fields[k]`. Only the first three chunks
         are used, so `split` must be at least 3.

         Args:
             data_path: path of the CSV file to read.
             text_fields: sequence of at least three text field objects.
             label_fields: sequence of at least three label field objects.
             split: number of segments each document is cut into.
             overlap: number of tokens shared by two neighbouring segments.

         Returns:
             (examp1, examp2, examp3, field1, field2, field3): the three
             example lists followed by their (name, field) pair lists.
         """
         import csv  # local import: the listing's import header does not bring in csv

         field1 = [('text', text_fields[0]), ('label', label_fields[0])]
         field2 = [('text', text_fields[1]), ('label', label_fields[1])]
         field3 = [('text', text_fields[2]), ('label', label_fields[2])]

         # Three lists for three names; a fourth [] here raises ValueError.
         examp1, examp2, examp3 = [], [], []

         # newline='' is what the csv module expects for files handed to csv.reader.
         with open(data_path, newline='') as csv_file:
             for row in csv.reader(csv_file, quotechar='"'):
                 if not row:
                     # csv.reader yields [] for blank lines; there is no label to read.
                     continue

                 # Tokenize once per row, after all text columns are joined.
                 word_tokens = word_tokenize(' '.join(row[1:]))

                 # Length of one segment, rounded down, so up to split - 1
                 # trailing tokens of the document are left out.
                 chunk_len = (len(word_tokens) + overlap * (split - 1)) // split
                 stride = chunk_len - overlap
                 document_encode = [
                     word_tokens[i * stride:(i + 1) * stride + overlap]
                     for i in range(split)
                 ]

                 label = int(row[0])

                 # Every segment of a document carries the document's label.
                 examp1.append(tdata.Example.fromlist([document_encode[0], label], field1))
                 examp2.append(tdata.Example.fromlist([document_encode[1], label], field2))
                 examp3.append(tdata.Example.fromlist([document_encode[2], label], field3))

         return examp1, examp2, examp3, field1, field2, field3
    50 
    51 
    def data_docsplit_iter(train_path, test_path, text_fields, label_fields, batch_size):
        """Build three train and three test iterators, one pair per document segment.

        Both CSV files are parsed with `read_split_data`. Segment k gets its
        own text vocabulary, built from the k-th training dataset with the
        vectors loaded from cibiao<k>.txt. The label vocabularies are built
        from the training datasets as well.

        Returns:
            A 9-tuple: the three train iterators, the three test iterators,
            then the vocabularies of text_fields[0], [1] and [2].
        """
        train_parts = read_split_data(train_path, text_fields, label_fields)
        test_parts = read_split_data(test_path, text_fields, label_fields)

        # read_split_data returns (examples x 3, fields x 3); pair them up.
        train_sets = [
            tdata.Dataset(examples, fields)
            for examples, fields in zip(train_parts[:3], train_parts[3:])
        ]

        # One vectors file per segment.
        embeddings = [
            Vectors(name=file_name)
            for file_name in ('cibiao1.txt', 'cibiao2.txt', 'cibiao3.txt')
        ]

        # Vocabularies are built from the training data only.
        for field, dataset, vectors in zip(text_fields, train_sets, embeddings):
            field.build_vocab(dataset, vectors=vectors)
        for field, dataset in zip(label_fields, train_sets):
            field.build_vocab(dataset)

        test_sets = [
            tdata.Dataset(examples, fields)
            for examples, fields in zip(test_parts[:3], test_parts[3:])
        ]

        # Same iterator settings everywhere: fixed order, single pass.
        iterator_options = dict(
            batch_size=batch_size,
            shuffle=False,
            sort=False,
            sort_within_batch=False,
            repeat=False,
        )
        train_iters = [tdata.Iterator(dataset, **iterator_options) for dataset in train_sets]
        test_iters = [tdata.Iterator(dataset, **iterator_options) for dataset in test_sets]

        vocabularies = [text_fields[k].vocab for k in range(3)]
        return (*train_iters, *test_iters, *vocabularies)

    调用如下:

     import torch  # torch.int is used below but torch is not in the listing's import header

     SENTENCE_LIMIT_SIZE = 512
     DATAPATH = './data/IMDB_new/'

     # One independent text/label field pair per document segment, so that
     # every segment gets its own vocabulary.
     text_fields, label_fields = [], []
     for _ in range(3):
         text_fields.append(tdata.Field(tokenize=word_tokenize, lower=True, fix_length=SENTENCE_LIMIT_SIZE, batch_first=True))
         label_fields.append(tdata.LabelField(dtype=torch.int))

     (train_iter1, train_iter2, train_iter3,
      test_iter1, test_iter2, test_iter3,
      vocabulary1, vocabulary2, vocabulary3) = data_docsplit_iter(
         DATAPATH + "train_shuffle.csv", DATAPATH + "test_new.csv",
         text_fields, label_fields, batch_size=4)

     # Print the shape for all three vocabularies; the first line used to
     # print the whole tensor under a ".shape" label.
     print('vocabulary1.vectors.shape = ', vocabulary1.vectors.shape)
     print('vocabulary2.vectors.shape = ', vocabulary2.vectors.shape)
     print('vocabulary3.vectors.shape = ', vocabulary3.vectors.shape)

     # Walk the three training iterators in lockstep and show the first batch only.
     for batch in zip(train_iter1, train_iter2, train_iter3):
         print(batch[0].text)
         print(batch[0].label)
         print(batch[1].text)
         print(batch[1].label)
         break
  • 相关阅读:
    qemu+chroot构建arm aarch64虚拟机
    <转>Linux环境下段错误的产生原因及调试方法小结
    <转>PCA的数学原理
    博客分类整理
    detectron2 配置记录
    如何读取部分的预训练模型
    重新配置语义分割实验环境遇到的坑
    pytorch 调整tensor的维度位置
    seg代码配置的踩坑记录
    Alienware R8外星人台式机安装双系统(WIN10+Ubuntu)的总结
  • 原文地址:https://www.cnblogs.com/cxq1126/p/14392401.html
Copyright © 2011-2022 走看看