zoukankan      html  css  js  c++  java
  • 一些好用的代码

    按标点切分语料

    # Placeholder settings for the punctuation splitter; the paths are blank in
    # the original post and must be filled in before running.
    src = ''  # input corpus path (passed to fun as file1)
    tgt = ''  # output path (passed to fun as file2)
    temp = ",.!?;"  # characters after which fun inserts a space

    def fun(file1,file2,temp):
      """Copy file1 to file2, inserting a space after each character found in temp.

      Args:
        file1: path of the UTF-8 text file to read.
        file2: path of the UTF-8 text file to write (overwritten).
        temp: string of characters (punctuation marks) to split on.

      A space character is always copied unchanged, even when temp contains
      a space, so it is never doubled by this function.
      """
      with open(file1,'r',encoding='utf-8') as fl1, open(file2,'w',encoding='utf-8') as fl2:
        # Stream line by line instead of loading the whole corpus into memory.
        for line in fl1:
          for word in line:
            if word in temp and word != ' ':
              fl2.write(word+' ')
            else:
              fl2.write(word)
    # Run the splitter; src and tgt must be set to real paths first.
    fun(src,tgt,temp)
            

    查找语料中的外语

    ### 本代码处理的是带有目标语言句子的源语言句子,将其定位之后再交换句子并生成新的文件

    ### 本代码通过英文文档来定位中文文档
    import langid
    import tensorflow as tf
    import codecs
    from langdetect import detect ## detect()输出探测出的语言类型
    from langdetect import detect_langs ## detect()输出探测出的所有语言类型及其所占的比例

    # Placeholder paths; blank in the original post, fill in before running.
    src = '' ## English corpus (scanned by fun2)
    tgt = '' ## Chinese corpus (split by fun3)
    file1 = ''  # output path passed to fun3 as file1
    file2 = ''  # output path passed to fun3 as file2

    def fun1(seq):
      """Return the first element of langid.classify(seq), i.e. the language code."""
      return langid.classify(seq)[0]

    def fun(seq):
      """Return whatever langdetect's detect() reports for seq."""
      return detect(seq)

    ### 该函数的功能是定位外语句子
    def fun2(src):
      """Return the 1-based numbers of the lines in src detected as Chinese.

      Each line is classified with langdetect (fun); if that raises, langid
      (fun1) is used instead.

      Args:
        src: path of the UTF-8 file to scan.

      Returns:
        List of line numbers, in ascending order, whose detected language
        code starts with 'zh'.

      NOTE(review): tf.gfile is the TensorFlow 1.x name; under TF 2.x the
      equivalent is tf.io.gfile -- confirm against the installed version.
      """
      temp = []
      with codecs.getreader('utf-8')(tf.gfile.GFile(src,'rb')) as fl:
        for k, line in enumerate(fl.readlines(), 1):
          try:
            temp1 = fun(line)  # normal case: langdetect
          except Exception:
            # langdetect raises on text it cannot analyse (e.g. blank lines);
            # fall back to langid, which always returns a label.
            temp1 = fun1(line)
          # langdetect reports Chinese as 'zh-cn'/'zh-tw' while langid reports
          # 'zh', so compare on the prefix to accept both detectors' output.
          if temp1.startswith('zh'):
            temp.append(k)
      return temp

    ### 该函数的功能是交换句子
    ### src(英文):file1是切分后的英文句子,file2是切分后的中文句子
    ### tgt(中文):file1是切分后的中文句子,file2是切分后的英文句子
    def fun3(temp,src,tgt,file1,file2):
      """Split the lines of tgt into two files by line number.

      Args:
        temp: iterable of 1-based line numbers to divert.
        src: unused here; kept so existing callers keep working. Open src
          instead of tgt below to split the other side of the corpus.
        tgt: path of the UTF-8 file to read.
        file1: output path for lines whose number is NOT in temp.
        file2: output path for lines whose number IS in temp.
      """
      flagged = set(temp)  # O(1) membership instead of scanning a list per line
      # Context managers close all three files even if a read/write fails.
      with open(tgt,'r',encoding='utf-8') as s_file, \
           open(file1,'w',encoding='utf-8') as fl1, \
           open(file2,'w',encoding='utf-8') as fl2:
        for num, line in enumerate(s_file, 1):
          if num in flagged:
            fl2.write(line)
          else:
            fl1.write(line)


    if __name__ == "__main__":
      # Collect the flagged line numbers from src, then split tgt into
      # file1/file2 according to those numbers.
      temp = fun2(src)
      fun3(temp,src,tgt,file1,file2)

     分词

    import jieba

    # Placeholder paths, blank in the original post. Nothing in the visible
    # snippet passes them to cut -- presumably meant as its input/output.
    src = ''
    tgt = ''

    def cut(file1,file2):
      """Segment each line of file1 with jieba and write the tokens, space-joined, to file2.

      Uses jieba's precise mode (cut_all=False). Alternatives are
      jieba.cut(line, cut_all=True) for full mode and
      jieba.cut_for_search(line) for search-engine mode.
      """
      with open(file1,'r',encoding='utf-8') as reader, open(file2,'w',encoding='utf-8') as writer:
        for sentence in reader:
          tokens = jieba.cut(sentence,cut_all = False)
          writer.write(' '.join(tokens))

    还原句子

    ### 将分好词的结果文件还原成句子
    # Placeholder paths; blank in the original post, fill in before running.
    file = ''  # segmented input file (passed to fun as file)
    tgt_file = ''  # restored output file (passed to fun as file2)

    def fun(file,file2):
      """Write a copy of file to file2 with every space character removed.

      This undoes space-separated word segmentation. Note that ALL spaces
      are removed, including those between words that were originally
      separated by spaces.

      Args:
        file: path of the UTF-8 segmented text file to read.
        file2: path of the UTF-8 output file (overwritten).
      """
      # Both handles are managed by `with` so they close even on error.
      with open(file,'r',encoding='utf-8') as fl1, open(file2,'w',encoding='utf-8') as fl2:
        for line in fl1:
          fl2.write(line.replace(" ",''))

    # Run the restore step; file and tgt_file must be set to real paths first.
    fun(file,tgt_file)

     随机生成测试集

    ### 本代码的功能是随机抽取测试集,并将文本中除去测试集之后余下的部分生成训练集
    import numpy as np
    import random

    # Placeholder paths; blank in the original post, fill in before running.
    src_en = ''  # English source corpus (new_file's file1)
    src_ch = ''  # Chinese source corpus (new_file's file2)
    cut_num = 3000 ## number of sentences to draw

    tgt_train_en = ''  # English training output (new_file's file3)
    tgt_train_ch = ''  # Chinese training output (new_file's file5)
    tgt_dev_en = ''  # English dev/test output (new_file's file4)
    tgt_dev_ch = ''  # Chinese dev/test output (new_file's file6)

    ## 生成随机数
    def random_num(count=None, upper=25000):
      """Return distinct random line numbers from 1..upper in ascending order.

      Args:
        count: how many numbers to draw; defaults to the module-level
          cut_num. Clamped to the range 0..upper.
        upper: largest line number that may be drawn (inclusive).

      Returns:
        Sorted list of `count` unique integers. Its length is also printed.
      """
      if count is None:
        count = cut_num
      count = max(0, min(count, upper))
      # sample() draws without replacement, so the requested number of
      # indices is always returned; a randint loop that skips duplicates
      # yields fewer than requested.
      temp = sorted(random.sample(range(1, upper + 1), count))
      print(len(temp)) ## number actually drawn
      return temp

    ## src-en(1)/src-zh(2)/train-en(3)/dev-en(4)/train-zh(5)/dev-zh(6)
    def new_file(file1,file2,file3,file4,file5,file6):
      """Split a parallel corpus into train and dev files using one random draw.

      The same set of line numbers (from random_num()) is applied to both
      source files, so the two sides stay aligned line for line.

      Args:
        file1: English source corpus (read).
        file2: Chinese source corpus (read).
        file3: English training output (written).
        file4: English dev output (written).
        file5: Chinese training output (written).
        file6: Chinese dev output (written).
      """
      picked = set(random_num())

      def fun(src_path, train_path, dev_path):
        # Outputs are opened for writing; context managers close every
        # handle even if a read/write fails.
        with open(src_path,'r',encoding='utf-8') as f1, \
             open(train_path,'w',encoding='utf-8') as f2, \
             open(dev_path,'w',encoding='utf-8') as f3:
          for num, line1 in enumerate(f1, 1):
            # Every line goes to exactly one output, including lines that
            # come after the last sampled number.
            if num in picked:
              f3.write(line1)
            else:
              f2.write(line1)

      fun(file1,file3,file4)
      fun(file2,file5,file6)
    # Run the split; all six paths must be set to real files first.
    new_file(src_en,src_ch,tgt_train_en,tgt_dev_en,tgt_train_ch,tgt_dev_ch)
      
  • 相关阅读:
    ObjectDataSource用法之六(刪除)
    ObjectDataSourc用法之七(新增)
    C# 装箱和拆箱
    Android SD卡中压缩包解压(ZIP文件)
    Android 调用系统的拨号服务实现 电话拨打功能
    Android 判断SD卡存不存在
    android中IdleHandler的使用
    android使用遥控器模拟鼠标拖拽操作
    Android SD卡 文件或目录拷贝、复制、粘贴
    C#在线获取歌词(转)
  • 原文地址:https://www.cnblogs.com/hanouba/p/11637737.html
Copyright © 2011-2022 走看看