  • Preprocessing methods

    import re
    import collections
    import numpy as np
    train_path = 'data/train.txt'
    embedding_file = 'model/token_vec_300.bin'
    stop_words_path = 'data/stop_words.txt'
    temporary_variable_path = 'data/variable'
    embedding_file_path = 'data/embedding/token_vec_300.bin'
    

    Return the lists of x and y

    def loadData(train_path):
        y_list = []
        x_left_list = []
        x_right_list = []
        for line in open(train_path, encoding='utf-8'):
            line = line.rstrip().split('\t')
            if len(line) >= 3:
                seg_left = line[0]
                seg_right = line[1]
                label = line[2]
                x_left_list.append(seg_left)
                x_right_list.append(seg_right)
                y_list.append(label)
        return x_left_list, x_right_list, y_list

    x_left_list, x_right_list, y_list = loadData(train_path)
    

    Regex: keep only Chinese characters

    def cleanData(data_list):
        clear_data = []
        for sentence in data_list:
            clear_sentence = re.sub("[^\u4e00-\u9fa5]+", "", sentence)
            clear_data.append(clear_sentence)
        return clear_data

    x_left_clear = cleanData(x_left_list)
    
    def getCharFromSentence(sentence_list):
        char_list = []
        for sentence in sentence_list:
            char_sentence = list(sentence)  # split the sentence into individual characters
            char_list.append(char_sentence)
        print('cleaning finished!!!')
        print('last sentence:{}'.format(char_list[-1]))
        return char_list
    
    x_left_char = getCharFromSentence(x_left_clear)
    
    def getStopWords(stop_words_path):
        stop_words = []
        with open(stop_words_path,'r',encoding='utf-8') as f:
            lines = f.readlines()
            for i in lines:
                word = i.strip()
                if word:
                    stop_words.append(word)
        print('loading stopwords finished!!!')
        print(stop_words[-10:])
        return stop_words
    stop_words_list = getStopWords(stop_words_path)
    ## This method should only be applied after segmentation into chars/words
    def removeStopWords(sentence_list, stop_words_list):
        stop_words_set = set(stop_words_list)  # set membership check is O(1) vs O(n) for a list
        data = []
        for sentence in sentence_list:
            data.append([word for word in sentence if word not in stop_words_set])
        return data
    x_left_char = removeStopWords(x_left_char,stop_words_list)
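
    The right-hand sentences presumably need the same cleaning steps; a sketch mirroring the left-side calls (an assumption, since the post only processes the left side):

    x_right_clear = cleanData(x_right_list)
    x_right_char = getCharFromSentence(x_right_clear)
    x_right_char = removeStopWords(x_right_char, stop_words_list)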
    

    Get the maximum sentence length

    def getMaxLength(train_data):
        len_list = [len(i) for i in train_data]
        len_array_list = np.array(len_list)
        max_length = int(len_array_list.max())  # fallback if no cutoff below covers 95%
        for i in range(10, 100):  # rough search range for the cutoff
            num = np.sum(len_array_list < i)
            rate = num / len(len_array_list)
            if rate > 0.95:  # shortest length that covers 95% of the sentences
                max_length = i
                print("max_tokens={}".format(max_length))
                break
        return max_length
    
    # Pass in the list of sentences after char/word segmentation
    def buildWordDict(all_text_list):
        word_list = []
        for sentence in all_text_list:
            word_list.extend(sentence)
        vocabulary_list = sorted(set(word_list))  # sort for a reproducible vocabulary
        word2index = collections.OrderedDict()
        for index, word in enumerate(vocabulary_list):
            word2index[word] = index + 1  # index 0 is reserved for out-of-vocabulary words
        return word2index

    def buildIndex2Word(word2index):
        index2word = dict(zip(word2index.values(), word2index.keys()))  # invert the dict
        return index2word
    
    word2index = buildWordDict(x_left_char)
    index2word = buildIndex2Word(word2index)
    

    Convert sentences to numbers; the input is a list of sentences

    def tokenizeSentence(sentence_list, word2index):
        x_tokens = []
        for sentence in sentence_list:
            number_list = []
            for word in sentence:
                try:
                    number_list.append(word2index[word])
                except KeyError:
                    number_list.append(0)  # out-of-vocabulary words map to index 0
            x_tokens.append(number_list)
        return x_tokens
    

    Convert numbers back to sentences

    def tokenizeNumber(number_list, index2word):
        sentence = []
        for number in number_list:
            new_text = []
            for index in number:
                try:
                    new_text.append(index2word[index])
                except KeyError:
                    new_text.append(' ')  # unknown indices map to a space
            sentence.append(new_text)
        return sentence
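
    A quick round-trip sanity check on a toy example (hypothetical sentences, used only for illustration): tokenizing and then de-tokenizing should recover the input when every character is in the vocabulary.

    demo_sentences = [['今', '天', '天', '气'], ['你', '好']]
    demo_word2index = buildWordDict(demo_sentences)
    demo_index2word = buildIndex2Word(demo_word2index)
    demo_tokens = tokenizeSentence(demo_sentences, demo_word2index)
    assert tokenizeNumber(demo_tokens, demo_index2word) == demo_sentences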
    
    tokens_List = tokenizeSentence(x_left_char,word2index)
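
    The token lists vary in length; before feeding a model they are typically padded or truncated to the max_length found by getMaxLength above (which the post defines but never calls). A minimal sketch, assuming a TensorFlow/Keras setup, which the post does not state explicitly:

    from tensorflow.keras.preprocessing.sequence import pad_sequences  # assumption: Keras is available

    max_length = getMaxLength(x_left_char)
    # pad shorter sequences with 0 (the OOV/padding index) and cut longer ones to max_length
    x_left_pad = pad_sequences(tokens_List, maxlen=max_length, padding='post', truncating='post')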
    
    def loadEmbeddingsFile(embedding_file_path):
        embeddings_dict = {}
        with open(embedding_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.strip().split(' ')
                if len(values) < 301:  # a valid line is the word plus its 300-dim vector
                    continue
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_dict[word] = coefs
        print('Found %s word vectors.' % len(embeddings_dict))
        return embeddings_dict
    
    # Get the word-to-vector mapping for this corpus
    def getEmbeddingsDict(embeddings_dict, word2index):
        word2vector_dict = {}
        for word in word2index.keys():
            try:
                word2vector_dict[word] = embeddings_dict[word]
            except KeyError:
                print(word)  # word has no pretrained vector
        return word2vector_dict

    embeddings_dict = loadEmbeddingsFile(embedding_file_path)
    word2vector_dict = getEmbeddingsDict(embeddings_dict, word2index)
    
    def getEmbeddingWeights(word2index, word2vector_dict, embedding_dim):
        n_symbols = len(word2index) + 1  # total index count; index 0 is reserved for OOV words, hence +1
        embedding_weights = np.zeros((n_symbols, embedding_dim))
        for word, index in word2index.items():  # starting from index 1, assign each word its vector
            embedding_vector = word2vector_dict.get(word)
            if embedding_vector is not None:
                embedding_weights[index, :] = embedding_vector
        return embedding_weights

    embedding_weights = getEmbeddingWeights(word2index, word2vector_dict, 300)
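
    embedding_weights now has the shape of an embedding lookup table: (vocabulary size + 1, 300). A minimal sketch of loading it into a Keras Embedding layer, under the same TensorFlow/Keras assumption as above:

    from tensorflow.keras.layers import Embedding  # assumption: Keras is available

    embedding_layer = Embedding(
        input_dim=embedding_weights.shape[0],  # n_symbols: vocabulary size + 1 (index 0 = OOV/padding)
        output_dim=300,                        # embedding_dim used above
        weights=[embedding_weights],           # initialize with the pretrained vectors
        trainable=False)                       # keep the pretrained vectors fixed

    Freezing the layer (trainable=False) is the usual choice when the pretrained vectors are trusted; setting it to True lets the model fine-tune them during training.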
    