zoukankan      html  css  js  c++  java
  • 一些好用的代码

    ##### Regex-based text cleaning #####

    # First pass: delete every run of characters that is NOT an ASCII letter,
    # one of the listed punctuation marks, whitespace (\s) or a CJK ideograph
    # in U+4E00-U+9FA5. The `\s` and `\u4e00-\u9fa5` escapes need their
    # backslashes: without them the class degrades to the literal characters
    # "s", "u", "4", ... and a bogus "0-u" range.
    _KEEP_RULE = re.compile(
        r'[^a-zA-Z.,;《》?!“”‘’@#¥%…&×()——+【】{};;●,。&~、|\s::'
        r'\u4e00-\u9fa5]+'
    )

    # Second pass: delete these characters wherever they still occur.
    _DROP_RULE = re.compile(r'[<>:;!.。, ]+')


    def re_fun(seq):
        """Return ``seq`` with unwanted characters removed.

        The cleaning runs in two passes:

        1. Everything outside the allow-list in ``_KEEP_RULE`` is deleted
           (digits, for example, are not on the list and are removed).
        2. The characters ``< > : ; ! . 。 ,`` and the ASCII space are deleted.

        Whitespace other than the ASCII space (e.g. a trailing newline) is
        left in place.

        NOTE(review): because every ASCII space is removed, a caller that
        later splits the result on ' ' (as ``vocab_fun`` does) gets a single
        token per line -- confirm this is intended.

        Args:
            seq: The text to clean.

        Returns:
            The cleaned text.
        """
        seq = _KEEP_RULE.sub('', seq)
        return _DROP_RULE.sub('', seq)

    #############################################################################################################################################

    ##### Build the vocabulary #####
    def vocab_fun(filename):
        """Count how often each token occurs in a UTF-8 encoded text file.

        Each line is cleaned with ``re_fun``, stripped of surrounding
        whitespace and split on single ASCII spaces; every resulting piece
        (including an empty string for a line that cleans down to nothing)
        is counted.

        Args:
            filename: Path handed to ``tf.gfile.GFile`` and opened in binary
                mode; the bytes are decoded as UTF-8.

        Returns:
            A ``Counter`` mapping token -> number of occurrences.
        """
        vocab = ct.Counter()
        utf8_reader = codecs.getreader('utf-8')
        with utf8_reader(tf.gfile.GFile(filename, 'rb')) as file:
            # Stream line by line rather than loading the whole corpus with
            # readlines(); the result is the same, the peak memory is not.
            for line in file:
                tokens = re_fun(line).strip().split(' ')
                # Counter.update counts every element of the iterable, so
                # one call per line replaces one call per word.
                vocab.update(tokens)
        return vocab

    ##########################################################################################################################################

    ##### File paths #####
    # NOTE(review): the directory literal below contains no path separators;
    # it looks like backslashes were lost somewhere -- confirm the real
    # location before relying on it.
    dir_path = "D:mathine_learningpre_estidataset"

    # Corpus file names: `tgt` is the ".de" file, `src` the ".en" file.
    tgt = 'europarl-v7.de-en.de'
    src = 'europarl-v7.de-en.en'

    # Full paths: only the final component of each name is joined onto
    # dir_path, so any directory part in `src` / `tgt` would be discarded.
    train_src = os.path.join(dir_path, os.path.basename(src))
    train_tgt = os.path.join(dir_path, os.path.basename(tgt))

    #########################################################################################################################################

    ##### Compute the BLEU score #####
    from nltk.translate.bleu_score import corpus_bleu

    # NOTE(review): src_file and tgt_file are not defined in this snippet;
    # they must be set to the two text files to compare before this runs.
    # Lines from src_file are used as references, lines from tgt_file as
    # hypotheses, paired up line by line.
    temp1 = []  # one entry per sentence: a list holding its single reference
    temp2 = []  # one entry per sentence: the hypothesis token list

    # The with-block closes both files even if reading or tokenising fails.
    with open(src_file, 'r', encoding='utf-8') as src_seq, \
            open(tgt_file, 'r', encoding='utf-8') as tgt_seq:
        for line1, line2 in zip(src_seq, tgt_seq):
            # split() with no argument also drops the trailing newline, which
            # would otherwise stay glued to the last token of every line.
            # corpus_bleu expects a *list of references* per hypothesis, so
            # the single reference is wrapped in its own list; passing the
            # bare token list would make each token string a "reference".
            temp1.append([line1.split()])
            temp2.append(line2.split())

    a = corpus_bleu(temp1, temp2)
    print(a)
  • 相关阅读:
    U10783 名字被和谐了
    P1151 子数整数
    P2756 飞行员配对方案问题
    P3227 [HNOI2013]切糕
    BZOJ 2127: happiness(最小割解决集合划分)
    linux脚本初体验
    130902 周赛
    Citrix 服务器虚拟化之三十二 XenConvert
    数据字典统一管理,动态下拉框
    [置顶] oracle 数据库表中转换成java代码
  • 原文地址:https://www.cnblogs.com/hanouba/p/11544867.html
Copyright © 2011-2022 走看看