zoukankan      html  css  js  c++  java
  • 常用Python文件

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jun 28 18:42:33 2017


    """
    import re
    import numpy as np


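    '''
    Extracts word-alignment relations from a bilingual parallel corpus aligned by GIZA++.
    For every sentence pair a target-by-source matrix is built: matrixTS[t][s] is set to 1
    when target word t and source word s are aligned, and is 0 otherwise (indices start at 0).
    The NULL entry at the start of each alignment line is skipped, so it gets no column.

    '''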
    def alig_pairs(filepath):
        """Extract 0/1 word-alignment matrices from a GIZA++ alignment file.

        The file is consumed as consecutive three-line records:

        1. a header line (its content is ignored),
        2. the target sentence, split on whitespace,
        3. the source sentence, in which every word is followed by
           ``({ i j ... })`` listing the 1-based positions of the target
           words it is aligned to. The first such entry (the NULL word) is
           skipped and gets no column in the matrix.

        Blank lines where a header is expected are skipped, so a trailing
        empty line at the end of the file is harmless.

        Args:
            filepath: Path of the alignment file to read.

        Returns:
            A list with one ``numpy`` array per record, of shape
            ``(target_length, source_length)``. Entry ``[t][s]`` is 1 when
            target word ``t`` and source word ``s`` are aligned, else 0.
            A source line with no alignment groups yields zero columns.
        """
        # The parentheses and braces of "({ ... })" are literal characters and
        # must be escaped; the single group captures the list of indices, so
        # split() yields [word, indices, word, indices, ..., ''].
        pattern = re.compile(r' \(\{([0-9 ]*)\}\) ?')
        matrices = []
        with open(filepath, 'r') as f:
            while True:
                header = f.readline()
                if not header:
                    break
                if not header.strip():
                    # Stray blank line between/after records.
                    continue
                target = f.readline().strip().split()
                source = f.readline().strip()

                parts = pattern.split(source)
                # Odd positions hold the index lists; drop the first one,
                # which belongs to the NULL word.
                index_lists = parts[1::2][1:]

                matrix = np.zeros((len(target), len(index_lists)))
                for col, indices in enumerate(index_lists):
                    for t_ind in indices.split():
                        # Indices in the file are 1-based.
                        matrix[int(t_ind) - 1][col] = 1
                matrices.append(matrix)
        return matrices
    #alig_pairs('test.txt')
    #alig_pairs('117-06-28.183340.lmt.A3.final')
    -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    # -*- coding: utf-8 -*-
    import codecs
    def _read_swapped_sentences(path):
        """Read a UTF-8, blank-line-separated token file into sentences.

        Every line that splits into exactly four whitespace-separated fields
        becomes one entry: its 2nd and 3rd fields are swapped and the four
        fields are re-joined with single spaces. An empty line closes the
        current sentence. Lines with any other number of fields are ignored.

        Returns:
            A list of sentences, each a list of the re-joined entry strings.
        """
        sentences = []
        current = []
        with codecs.open(path, 'r', encoding='utf-8') as handle:
            for line in handle:
                fields = line.strip().split()
                if len(fields) == 4:
                    fields[1], fields[2] = fields[2], fields[1]
                    current.append(" ".join(fields))
                elif not fields:
                    sentences.append(current)
                    current = []
        # Keep the last sentence even when the file does not end with a
        # blank separator line.
        if current:
            sentences.append(current)
        return sentences


    def get_matrix():
        """Build one labelled grid per Chinese/English sentence pair.

        Reads ``result/result_cn`` and ``result/result_en`` (relative to the
        current working directory). For sentence pair ``i`` the grid has
        ``len(chinese) + 1`` rows and ``len(english) + 1`` columns, all cells
        initialised to the string ``"0"``; row 0 (from column 1) carries the
        English entries and column 0 (from row 1) carries the Chinese entries.
        Grids therefore differ in size from sentence to sentence.

        Returns:
            A tuple ``(matrix, chinese_sentence)``: the list of grids and the
            list of Chinese sentences. When the two files contain a different
            number of sentences, ``'error'`` is printed and ``matrix`` is
            empty.
        """
        chinese_sentence = _read_swapped_sentences("result/result_cn")
        english_sentence = _read_swapped_sentences('result/result_en')

        matrix = []
        if len(english_sentence) != len(chinese_sentence):
            print('error')
            return matrix, chinese_sentence

        for chinese_words, english_words in zip(chinese_sentence, english_sentence):
            grid = [["0"] * (len(english_words) + 1)
                    for _ in range(len(chinese_words) + 1)]
            # Header row: English entries, leaving the corner cell as "0".
            grid[0][1:] = english_words
            # Header column: Chinese entries.
            for row, word in enumerate(chinese_words, start=1):
                grid[row][0] = word
            matrix.append(grid)

        return matrix, chinese_sentence
    #matrix,_ = get_matrix()            
    -----------------------------------------------------------------------------------------------------------------------------------------------------------------------

    #-*-coding:utf-8-*-
    import os  
    import string  
       
    def count(filepath):
        """Print line statistics for a text file.

        Prints, in order: the file path, the number of blank
        (whitespace-only) lines, the number of lines whose first
        non-whitespace character is ``#``, and the total number of lines.

        Args:
            filepath: Path of the file to analyse.
        """
        total = 0  # total number of lines
        countPound = 0  # number of comment lines
        countBlank = 0  # number of blank lines
        with open(filepath, 'r') as handle:
            for li in handle:
                total += 1
                # strip() returns a new string; test that one so indented
                # comment lines are recognised too.
                stripped = li.strip()
                if not stripped:
                    countBlank += 1
                elif stripped.startswith('#'):
                    countPound += 1
        print(filepath)
        print("countBlank:%d" % countBlank)
        print("countPound:%d" % countPound)
        print("total:%d" % total)
     
     
    # Print the line statistics of the file 'result_cn' (path relative to the
    # current working directory). Runs as soon as this script is executed.
    count('result_cn')

    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    #-*-coding:utf-8-*-
    def bijiao():
        """Compare ``lmt.txt`` and ``lh.txt`` line by line.

        Both files are read from the current working directory. Lines are
        compared as raw strings, including their line terminators. When one
        file is longer than the other, every surplus line counts as a
        difference.

        Returns:
            A list of the 1-based line numbers at which the files differ;
            empty when the files are identical.
        """
        count = 0  # current line number
        dif = []   # line numbers that differ
        with open('lmt.txt', 'r') as f1, open('lh.txt', 'r') as f2:
            for a in f1:
                # readline() returns '' once f2 is exhausted, which never
                # equals a real line, so surplus lines of f1 are reported.
                b = f2.readline()
                count += 1
                if a != b:
                    dif.append(count)
            # Anything left in f2 has no counterpart in f1.
            for _ in f2:
                count += 1
                dif.append(count)
        return dif
    # Compare the two files and report the line numbers that differ.
    c = bijiao()
    if not c:
        # bijiao() returns a list; an empty list means no line differs.
        print('两个文件一样!')
    else:
        print('有%d处不同' % len(c))
        for each in c:
            print('%d行不一样' % each)

  • 相关阅读:
    opencv+python实时人脸检测、磨皮
    opencv人脸检测
    均值模糊、中值模糊、自定义模糊
    双边滤波
    表面模糊
    水纹滤镜
    爬取https网站
    字符串、数组、切片、map
    tcpdump抓包和Wireshark解包
    iptables详解
  • 原文地址:https://www.cnblogs.com/maowuyu-xb/p/7236769.html
Copyright © 2011-2022 走看看