zoukankan      html  css  js  c++  java
  • corpus处理---

    实现的效果如上所示:

    (1)总共几千个文件,遍历文件的关键代码:

    # Excerpt: walk every entry of src_path and read each regular file line
    # by line, with a same-named output file opened under target_path.
    files = os.listdir(src_path)  # names of all entries in the source folder
    for file in files:
        source_file = os.path.join(src_path, file)
        # Test the full path: a bare name would be resolved against the
        # current working directory rather than src_path.
        if os.path.isdir(source_file):
            continue
        # target_path is concatenated as-is, so it must end with a separator.
        # The with-statement closes both files after each iteration.
        with open(source_file) as f, open(target_path + file, 'w') as target_file:
            for line in f:
                line = line.rstrip('\n')  # drop the trailing newline only
    View Code

    (2)正则表达式:

    # Remove every (...), {...} and [...] group (non-greedy) before writing
    # the line. The raw string keeps the regex backslashes literal instead of
    # relying on Python passing unknown escapes such as '\(' through unchanged.
    target_file.write(re.sub(r'\(.*?\)|\{.*?}|\[.*?]', '', line))

    (3) 拆分,排序的问题

     1 import itertools
     2 import  os
     3 
     def split_str(count):
         """Expand grouped alignment tokens into individual ``left-right`` pairs.

         Each token looks like ``'7,8-4,5'``: two comma-separated integer lists
         joined by ``-``. Every left index is paired with every right index
         (Cartesian product), so that token yields ``7-4``, ``7-5``, ``8-4``
         and ``8-5``.

         Tokens are consumed from last to first, so the pairs of the last token
         come first in the result. Empty tokens, tokens without ``-`` and tokens
         with an empty side (e.g. ``'3-'`` or ``'-4'``) contribute nothing.

         Args:
             count: list of token strings. The list is not modified.

         Returns:
             A new list of ``'left-right'`` strings.

         Raises:
             ValueError: if an index inside a token is not an integer.
         """
         str_param = []
         for token in reversed(count):
             sides = token.split('-')
             # Skip tokens that have no counterpart on one side. Checking the
             # length first keeps a token without '-' from raising IndexError.
             if len(sides) < 2 or sides[0] == '' or sides[1] == '':
                 continue
             # int() validates each index and normalises it (e.g. '07' -> 7).
             left = [int(x) for x in sides[0].split(',')]
             right = [int(x) for x in sides[1].split(',')]
             str_param.extend(
                 str(a) + '-' + str(b) for a, b in itertools.product(left, right)
             )
         return str_param
    22 
    def deal_Str(Str):
        """Split an alignment token into its left and right integer lists.

        Example: ``'7,8,9-4,5,6,7'`` becomes ``([7, 8, 9], [4, 5, 6, 7])``.
        Only the first two ``-``-separated fields are used.

        Args:
            Str: token of the form ``'<ints>-<ints>'``, each side being a
                comma-separated list of integers.

        Returns:
            A ``(left, right)`` tuple of integer lists.

        Raises:
            ValueError: if the ``-`` separator is missing or an index is not
                an integer.
        """
        sides = Str.split('-')
        if len(sides) < 2:
            # Report malformed input the same way int() does for bad digits,
            # instead of leaking an IndexError from sides[1].
            raise ValueError("expected 'left-right', got %r" % (Str,))
        left = [int(x) for x in sides[0].split(',')]
        right = [int(x) for x in sides[1].split(',')]
        return left, right
    30 
    def sort_str(str_list):
        """Sort ``'left-right'`` pair strings in place by their left index.

        The left index is compared as an integer, so ``'9-2'`` sorts before
        ``'10-1'``. The sort is stable: pairs sharing a left index keep their
        relative order.

        Args:
            str_list: list of ``'left-right'`` strings; it is reordered in place.

        Returns:
            The same list object, now sorted.

        Raises:
            ValueError: if the part before the first ``-`` is not an integer.
        """
        # list.sort is stable like the adjacent-swap sort it replaces, but
        # parses each key once instead of on every comparison.
        str_list.sort(key=lambda pair: int(pair.split('-')[0]))
        return str_list
    43 
    44 
    # Driver: rewrite every file in src_path into target_path, expanding and
    # sorting the alignment pairs on each line.

    # Raw strings keep the Windows backslashes literal.
    src_path = r'D:\wordalign\WA\ctb_aligned_chuli'
    # A raw string cannot end in a backslash (and a plain '...\' would escape
    # the closing quote), so the trailing separator is appended separately.
    # The output file name is concatenated directly onto this prefix.
    target_path = r'D:\wordalign\WA\ctb_aligned_last_chuli' + '\\'
    files = os.listdir(src_path)  # names of all entries in the source folder
    for file in files:
        source_file = os.path.join(src_path, file)
        # Test the full path: a bare name would be resolved against the
        # current working directory rather than src_path.
        if os.path.isdir(source_file):
            continue
        # The with-statement closes both files after every iteration, not just
        # the last pair, and nothing is closed if the folder is empty.
        with open(source_file) as f, open(target_path + file, 'w') as target_file:
            for line in f:
                line = line.rstrip('\n')
                if line != 'rejected':
                    count = line.split(' ')
                    arr = sort_str(split_str(count))
                    # Every pair is followed by one space, including the last.
                    temp = ''.join(st + ' ' for st in arr)
                    target_file.write(temp + '\n')
                else:
                    # 'rejected' marker lines are copied through unchanged.
                    target_file.write(line + '\n')
    View Code
  • 相关阅读:
    git 常用命令
    flask汇总
    flask自定义转换器
    css,js,jquery的载入方式和属性控制
    python import xx和from xx import x 中的坑
    CPython中的GIL
    python装饰器的参数传递
    python函数中的参数(关键字参数,默认参数,位置参数,不定长参数)
    python 闭包
    Java数据类型和对象的引用
  • 原文地址:https://www.cnblogs.com/Shaylin/p/9925937.html
Copyright © 2011-2022 走看看