# -*- coding:utf-8 -*-
'''
Extraction starts from 11c.

Reads the message index ``EDMDI1`` and the per-message ``<TAG>_D`` files
found under ``ss`` and produces, for every message tag:

* ``new/<TAG>_w.txt``      - one CSV row per line of the structure table,
* ``new/<TAG>_wnote.txt``  - one quoted note per described position,
* ``new/b1<year>.csv``     - the two files above joined line by line.
'''
import re
import numpy as np
import os

year = '17A'
ss = "./data/edmd/"

# NOTE(review): the input file names below hard-code the ".17A" suffix while
# ``year`` is a separate constant; presumably they are meant to stay in sync.

# Index line: three-character prefix ("   ", "X  " or "W  ") followed by a
# six-letter message tag.
_TAG_RE = re.compile(r"^(?:\s{3}|X\s{2}|W\s{2})([A-Z]{6})\s.+\n")

# Plain segment line, e.g.
#   00010   UNH Message header            M   1
# Groups: position code, segment tag, status (C/M), repeat count.
_SEGMENT_RE = re.compile(
    r"^(\d{4,5})\s{3}([A-Z]{3}).+([CM])\s{3}(\d*)\s{1,}\|{0,}\n")

# Group header line, e.g.
#   00050       ---- Segment group 1  ------------------ C   9----------------+
# Groups: position code, group number, status (C/M), repeat count.
_GROUP_HEADER_RE = re.compile(
    r"^(\d{4,5}).+Segment\sgroup\s(\d*).+([CM])\s{3}(\d*).+\n")

# Segment line that closes one or more groups (no "Segment group" text,
# ends with +, +|, +||, ++, ...), e.g.
#   00280   RNG Range details             C   1---------------+|
# Groups: position code, segment tag, status (C/M), repeat count.
_CLOSING_SEGMENT_RE = re.compile(
    r"^(\d{4,5})\s{3}([A-Z]{3}).+([CM])\s{3}(\d*)-+\+{1,10}\|{0,20}\n")

# --- Patterns below only drive the nesting (hierarchy) state. ---------------
# NOTE(review): these require a 5-digit position code although the row
# patterns above accept 4 or 5 digits; kept as in the original.

# Segment line closing exactly one group (a single "+").
_CLOSE_ONE_RE = re.compile(
    r"^\d{5}\s{3}[A-Z]{3}.+[CM]\s{3}\d*-+\+\|{0,10}\n")

# Group header opening a group; the number of "|" after the "+" (0..6) is
# the nesting level of the new group.  Groups: group number, bars.
_OPEN_GROUP_RE = re.compile(
    r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*-+\+(\|{0,6})\n")

# Line closing several groups at once (ends with ++, +++, ... up to seven
# "+"); the number of "+" is the number of groups closed.
_CLOSE_MANY_RE = re.compile(
    r"^\d{5}.+[CM]\s{3}\d*-+(\+{2,7})\|{0,20}\n")

# --- Patterns for the note (description) pass. ------------------------------
_NOTE_INDENTS = (3, 6, 9, 12, 15, 18, 21, 24, 27, 30)

# A note starts on a line made of a position code, exactly `width`
# whitespace characters and then text.
_NOTE_START_RES = [
    (width, re.compile(r"^(\d{4,5})\s{" + str(width) + r"}[^ ].+\n"))
    for width in _NOTE_INDENTS
]

# Note text lines are indented by `width + 5` whitespace characters
# (presumably 5 is the width of the position code - TODO confirm).  Width 0
# is the state before any note has started.
_NOTE_BODY_RES = dict(
    (width, re.compile(r"^\s{" + str(width + 5) + r"}([^ ].+)\n"))
    for width in (0,) + _NOTE_INDENTS
)

# The note pass stops here so that lines of the structure table that follows
# are not mistaken for notes.
_STRUCTURE_START_RE = re.compile(
    r"(?:4.3\s{4}Message\sstructure)|(?:Pos\s+Tag\sName\s+S\s+R)")


def _rename_to_txt(original, renamed):
    """Rename *original* to *renamed* if *original* is still present.

    Returns True when *renamed* exists afterwards, i.e. when there is a file
    to read (either just renamed or left over from an earlier run).
    """
    if os.path.exists(original):
        try:
            os.rename(original, renamed)
        except OSError:
            # Best effort, as before: e.g. Windows refuses to rename onto an
            # existing file, in which case the existing copy is used.
            pass
    return os.path.exists(renamed)


def _output_dir():
    """Return ``<ss>/new``, creating it when missing."""
    out_dir = os.path.join(ss, "new")
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    return out_dir


def _deepest_open_level(levels):
    """Return the index just before the first 0 in *levels* (-1 if none open)."""
    for index, value in enumerate(levels):
        if value == 0:
            return index - 1
    return len(levels) - 1


def _parent_label(levels):
    """Return ``SG<n>`` for the innermost open group, ``SG0`` at top level."""
    deepest = _deepest_open_level(levels)
    enclosing = levels[deepest] if deepest != -1 else 0
    if enclosing == 0:
        return "SG0"
    return "SG" + str(enclosing)


def _close_levels(levels, count):
    """Mark the *count* innermost open levels of *levels* as closed (0)."""
    deepest = _deepest_open_level(levels)
    # Clamp at 0 so closing more groups than are open cannot wrap around to
    # the end of the list through negative indices.
    for index in range(max(0, deepest - count + 1), deepest + 1):
        levels[index] = 0


def _format_row(code, parent, name, tag, status, repeat):
    """Build one CSV row: code_pos,parent,name,year,message tag,S,R."""
    return "%s,%s,%s,%s,%s,%s,%s" % (
        code, parent, name, year, tag, status, repeat)


def _structure_rows(lines, tag):
    """Turn the structure-table lines of one message into CSV rows.

    *lines* is an iterable of text lines (each ending in a newline), *tag* is
    the message tag written into every row.  Returns the rows as a list of
    strings without line terminators.
    """
    # levels[i] holds the number (as text) of the group currently open at
    # nesting depth i, or 0 when nothing is open at that depth.
    levels = [0] * 10
    rows = []
    for line in lines:
        # Rows for group headers and group-closing segments use the nesting
        # state as it was before this line changes it.
        header = _GROUP_HEADER_RE.match(line)
        if header:
            code, number, status, repeat = header.groups()
            # Write the whole group number at once ("SG10", not "SG1SG0").
            own = "SG" + number if number else ""
            rows.append(_format_row(
                code, _parent_label(levels), own, tag, status, repeat))

        closing = _CLOSING_SEGMENT_RE.match(line)
        if closing:
            code, name, status, repeat = closing.groups()
            rows.append(_format_row(
                code, _parent_label(levels), name, tag, status, repeat))

        # --- update the nesting state ---
        if _CLOSE_ONE_RE.match(line):
            _close_levels(levels, 1)

        opened = _OPEN_GROUP_RE.match(line)
        if opened:
            depth = len(opened.group(2))
            # NOTE(review): a header arriving while this depth is still
            # marked open resets it to 0 instead of replacing the number;
            # kept exactly as in the original.
            if levels[depth] == 0:
                levels[depth] = opened.group(1)
            else:
                levels[depth] = 0

        closed = _CLOSE_MANY_RE.match(line)
        if closed:
            _close_levels(levels, len(closed.group(1)))

        segment = _SEGMENT_RE.match(line)
        if segment:
            code, name, status, repeat = segment.groups()
            rows.append(_format_row(
                code, _parent_label(levels), name, tag, status, repeat))
    return rows


def _note_text(lines):
    """Collect the notes of one message file into the ``_wnote`` text.

    Every note-start line begins a new quoted output line; the indented text
    lines that follow are appended to it, separated by single spaces.
    Reading stops at the heading of the structure table.  The first output
    line has its ``"`` replaced by the header ``code_pos,note``.
    """
    pieces = []
    indent = 0
    for line in lines:
        for width, start_re in _NOTE_START_RES:
            if start_re.match(line):
                indent = width
                # Close the previous quoted note and open the next one.
                pieces.append('"\n"')
                break
        body = _NOTE_BODY_RES[indent].match(line)
        if body:
            pieces.append(body.group(1) + " ")
        if _STRUCTURE_START_RE.search(line):
            break
    pieces.append('"')

    first, newline, rest = "".join(pieces).partition("\n")
    if '"' in first:
        first = first.replace('"', "code_pos,note")
    return first + newline + rest


def get_tag():
    """Return the six-letter message tags listed in the ``EDMDI1`` index.

    The index is renamed from ``EDMDI1.17A`` to ``EDMDI1.txt`` on first use.
    The resulting list is also printed.

    Raises:
        IOError/OSError: if no ``EDMDI1.txt`` is available to read.
    """
    index_txt = os.path.join(ss, "EDMDI1.txt")
    _rename_to_txt(os.path.join(ss, "EDMDI1.17A"), index_txt)

    list_tag = []
    with open(index_txt) as index_file:
        for line in index_file:
            found = _TAG_RE.match(line)
            if found:
                list_tag.append(found.group(1))
    print(list_tag)
    return list_tag


def trmd_b1_nonote(list_tag):
    """Write ``new/<TAG>_w.txt`` (structure rows) for every tag in *list_tag*.

    Each ``<TAG>_D.17A`` file is renamed to ``<TAG>.txt`` on first use.  A tag
    whose file was already renamed by an earlier run is processed again; a
    tag with no input file at all is skipped.  Every row is preceded by a
    newline, so the output starts with an empty line.
    """
    out_dir = _output_dir()
    for tag in list_tag:
        source_txt = os.path.join(ss, "%s.txt" % tag)
        if not _rename_to_txt(os.path.join(ss, "%s_D.17A" % tag), source_txt):
            continue
        with open(source_txt) as source:
            rows = _structure_rows(source, tag)
        with open(os.path.join(out_dir, "%s_w.txt" % tag), "w") as target:
            target.write("".join("\n" + row for row in rows))


def trmd_b1_note(list_tag):
    """Write ``new/<TAG>_wnote.txt`` (quoted notes) for every tag in *list_tag*.

    Reads ``<TAG>.txt``, so it expects the rename done by
    ``trmd_b1_nonote`` to have happened.

    Raises:
        IOError/OSError: if ``<TAG>.txt`` cannot be opened.
    """
    out_dir = _output_dir()
    for tag in list_tag:
        with open(os.path.join(ss, "%s.txt" % tag)) as source:
            text = _note_text(source)
        with open(os.path.join(out_dir, "%s_wnote.txt" % tag), "w") as target:
            target.write(text)


def join(list_tag):
    """Append ``<row>,<note>`` lines for every tag to ``new/b1<year>.csv``.

    Line *i* of ``<TAG>_w.txt`` is paired with line *i* of
    ``<TAG>_wnote.txt``; the number of output lines per tag equals the number
    of note lines.

    NOTE(review): the CSV is opened in append mode, so running the script
    twice duplicates its content.

    Raises:
        IndexError: if the note file has more lines than the row file.
    """
    out_dir = os.path.join(ss, "new")
    for tag in list_tag:
        with open(os.path.join(out_dir, "%s_w.txt" % tag)) as rows_file:
            rows = list(rows_file)
        with open(os.path.join(out_dir, "%s_wnote.txt" % tag)) as notes_file, \
                open(os.path.join(out_dir, "b1%s.csv" % year), "a") as merged:
            for index, note in enumerate(notes_file):
                merged.write(
                    "%s,%s\n" % (rows[index].strip('\n'), note.strip('\n')))


if __name__ == '__main__':
    list_tag = get_tag()
    trmd_b1_nonote(list_tag)
    trmd_b1_note(list_tag)
    join(list_tag)

# Special cases: (none listed)