  • NLP (18): Training character-level word2vec and FastText, and word-level word2vec

    I. Data

    word2vec training data format: a plain text file with one sentence per line, already tokenized, tokens separated by spaces.

    (1) For character-level training, it is enough to separate the Chinese characters with spaces. Splitting by character:

                    line_str = line.replace(" ", "")
                    cn = " ".join(line_str)

    (2) Splitting by word

    There are many segmenters to choose from: jieba, Peking University's pkuseg, HIT's LTP, and so on; a jieba sketch is shown below.
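
    A minimal word-segmentation sketch with jieba (the training script below uses LTP via the same replace-then-segment pattern; the exact token boundaries depend on the segmenter):

                    import jieba

                    line = "不是可以自动还款吗?"
                    tokens = jieba.lcut(line)  # e.g. ['不是', '可以', '自动', '还款', '吗', '?']
                    print(" ".join(tokens))    # 不是 可以 自动 还款 吗 ?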

    1. Character-based; sample file:

    不 一 定 。 
    不 一 定 不 一 定 。 
    不 一 样 。 
    不 卖 钱 。 
    不 可 以 。 我 还 没 开 始 用 呢 , 
    不 同 的 地 点 今 天 你 会 经 过 哪 里 可 以 买 ? 
    不 听 。 
    不 在 。 
    不 太 信 不 在 。 

    2. Word-based (segmented); sample file:

    不 我这个 , 那 我那个 。
    不是 一万 多 了 ? 怎么 变成 两万 多 ?
    不是 不是 你 去 可以 去 移动 去 查 一下路途中 他们 绝对 不 是 徐世东 好 吗 ?
    不是 不是 我 现在 现在 这个 号码 注册 的 四五折熊 图片 ?
    不是 从前 两 年 说过 了 吗 ?
    不是 你 能 听 我 说话 吗 ? 你 别老自己 跟着 吧 , 说 行不行 啊 。
    不是 原来 它会 自动 还款 怎么办 ? 最近 都 没 没有 跳出来 。
    不是 可以 自动 还款 吗 ?
    不是 啊 , 这样 没有 啊 。

    II. Code

    import os

    import gensim
    import pandas as pd
    from gensim.models import Word2Vec
    from gensim.models import fasttext
    from ltp import LTP
    from tqdm import tqdm

    # load the LTP model once; it is used for word segmentation below
    ltp = LTP()
    
    # segment, _ = ltp.seg(["他叫汤姆去拿外衣。"])
    # # [['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
    
    class TrainWord2Vec(object):
        def __init__(self):
            parent_path = os.path.split(os.path.realpath(__file__))[0]
            self.root = parent_path[:parent_path.find("pre_process")]  # project root directory
            # 15,000 Bank of Communications (jiaohang) dialogues
            self.jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "all_text.csv")
            # 2,000 "meaningless" utterances
            self.meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless.txt")
            # 60,000 sentences from the original training set
            self.semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "all.csv")
            # character-level model paths
            self.char_word2vec = os.path.join(self.root, "checkpoints", "word2vec", "char_word2vec.model")
            self.char_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "char_fasttext.model")
            # character-split meaningless data; the _path suffix keeps it from
            # shadowing the char_meaningless() method below
            self.char_meaningless_path = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
            # word-level model paths
            self.word_word2vec = os.path.join(self.root, "checkpoints", "word2vec", "word_word2vec.model")
            self.word_fasttext = os.path.join(self.root, "checkpoints", "word2vec", "word_fasttext.model")
    
        def char_meaningless(self):
            # split each meaningless utterance into space-separated characters
            with open(self.char_meaningless_path, "w", encoding="utf8") as fout:
                with open(self.meaningless, encoding="utf8") as f:
                    for line in f.readlines():
                        line_str = line.replace(" ", "")
                        cn = " ".join(line_str)
                        fout.write(cn)
    
        def char_jiaohang(self):
            char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
            with open(char_jiaohang, "w", encoding="utf8") as fout:
                dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
                for line in dataList:
                    line_str = line.replace(" ", "")
                    cn = " ".join(line_str)
                    fout.write(cn + "\n")
    
        def char_semantic(self):
            char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
            with open(char_semantic, "w", encoding="utf8") as fout:
                dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
                for line in dataList:
                    line_str = line.replace(" ", "")
                    cn = " ".join(line_str)
                    fout.write(cn + "\n")
    
        def all_char_file(self):
            char_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_char.txt")
            char_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_char.txt")
            char_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_char.txt")
            r_lines = []
            with open(char_meaningless, "r", encoding="utf8") as f1:
                r_lines = r_lines + f1.readlines()
            with open(char_jiaohang, "r", encoding="utf8") as f2:
                r_lines = r_lines + f2.readlines()
            with open(char_semantic, "r", encoding="utf8") as f3:
                r_lines = r_lines + f3.readlines()
            out = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
            with open(out, "w", encoding="utf8") as f4:
                for line in r_lines:
                    f4.write(line)
    
        def train_char_meaningless_word2vec(self):
            # despite the name, this trains on char_all.txt, the merge of all three sources
            all_text = os.path.join(self.root, "datas", "word2vec_data", "char_all.txt")
            sentences = gensim.models.word2vec.LineSentence(all_text)
            model = Word2Vec(sentences, hs=0, min_count=5, window=5, vector_size=128)
            # context window size: window=5
            # ignore terms appearing fewer than min_count=5 times
            # CBOW or skip-gram? sg=0 (the default) selects CBOW
            # hierarchical softmax or negative sampling? hs=0 selects negative sampling
            # number of negative samples: negative=5 (typically 5-20)
            # smoothing exponent of the negative-sampling distribution: ns_exponent=0.75
            # downsampling threshold for high-frequency words: sample=0.001
            model.save(self.char_word2vec)
            print("wv:", model.wv.most_similar(""))
            print("wv:", model.wv[""])
    
            model1 = fasttext.FastText(sentences, hs=0, min_count=5, window=5, vector_size=128)
            model1.save(self.char_fasttext)
            print("ft:", model1.wv.most_similar(""))
            print("ft:", model1.wv[""])
    
        def word_meaningless(self):
            word_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_word.txt")
            with open(word_meaningless, "w", encoding="utf8") as fout:
                with open(self.meaningless, encoding="utf8") as f:
                    for line in tqdm(f.readlines(), mininterval=1, smoothing=0.1):
                        line_str = line.replace(" ", "")
                        segment, _ = ltp.seg([line_str])
                        segment = " ".join(segment[0])
                        fout.write(segment + "\n")
    
        def word_jiaohang(self):
            word_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_word.txt")
            with open(word_jiaohang, "w", encoding="utf8") as fout:
                dataList = pd.read_csv(self.jiaohang, sep="\t")["texts"].tolist()
                for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                    line_str = line.replace(" ", "")
                    segment, _ = ltp.seg([line_str])
                    segment = " ".join(segment[0])
                    fout.write(segment + "\n")
    
        def word_semantic(self):
            word_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_word.txt")
            with open(word_semantic, "w", encoding="utf8") as fout:
                dataList = pd.read_csv(self.semantic, sep="\t")["sentence"].tolist()
                for line in tqdm(dataList, mininterval=1, smoothing=0.1):
                    line_str = line.replace(" ", "")
                    segment, _ = ltp.seg([line_str])
                    segment = " ".join(segment[0])
                    fout.write(segment + "\n")
    
        def all_word_file(self):
            word_meaningless = os.path.join(self.root, "datas", "word2vec_data", "meaningless", "meansless_word.txt")
            word_jiaohang = os.path.join(self.root, "datas", "word2vec_data", "jiaohang", "jiaohang_word.txt")
            word_semantic = os.path.join(self.root, "datas", "word2vec_data", "semantic", "semantic_word.txt")
            r_lines = []
            with open(word_meaningless, "r", encoding="utf8") as f1:
                r_lines = r_lines + f1.readlines()
            with open(word_jiaohang, "r", encoding="utf8") as f2:
                r_lines = r_lines + f2.readlines()
            with open(word_semantic, "r", encoding="utf8") as f3:
                r_lines = r_lines + f3.readlines()
            out = os.path.join(self.root, "datas", "word2vec_data", "word_all.txt")
            with open(out, "w", encoding="utf8") as f4:
                for line in r_lines:
                    f4.write(line)
    
        def train_word_meaningless_word2vec(self):
            # likewise trains on word_all.txt, the merge of all three word-segmented sources
            all_text = os.path.join(self.root, "datas", "word2vec_data", "word_all.txt")
            sentences = gensim.models.word2vec.LineSentence(all_text)
            model = Word2Vec(sentences, hs=0, min_count=5, window=5, vector_size=128)
            # same hyperparameters as in train_char_meaningless_word2vec above
            model.save(self.word_word2vec)
            print("wv:", model.wv.most_similar("了解"))
            print("wv:", model.wv["时候"])
    
            model1 = fasttext.FastText(sentences, hs=0, min_count=5, window=5, vector_size=128)
            model1.save(self.word_fasttext)
            print("ft:", model1.wv.most_similar("了解"))
            print("ft:", model1.wv["时候"])
    
        def main(self):
            # assumes the word_* segmentation steps and all_word_file() have already
            # been run to produce word_all.txt
            self.train_word_meaningless_word2vec()
    
    
    
    if __name__ == '__main__':
        TrainWord2Vec().main()
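
    Once trained, the saved models can be reloaded for downstream use. A minimal sketch, assuming the checkpoint layout from the class above; "还款" and the out-of-vocabulary "还款啦" are example queries, not part of the original script:

    from gensim.models import Word2Vec, FastText

    # reload the word-level checkpoints saved by the script above
    w2v = Word2Vec.load("checkpoints/word2vec/word_word2vec.model")
    ft = FastText.load("checkpoints/word2vec/word_fasttext.model")

    print(w2v.wv.most_similar("还款", topn=5))  # nearest neighbours of a word
    print(ft.wv["还款"].shape)                  # a 128-dimensional vector

    # FastText composes vectors from character n-grams, so it can embed
    # out-of-vocabulary words, where plain word2vec would raise a KeyError
    print("还款啦" in ft.wv.key_to_index)  # likely False
    print(ft.wv["还款啦"].shape)           # still works: (128,)

    To train skip-gram with hierarchical softmax instead of the default CBOW with negative sampling, pass sg=1 and hs=1 to the same constructors.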
  • Original post: https://www.cnblogs.com/zhangxianrong/p/14803253.html