# Python web crawler: scrapes the EDIFACT message reference pages,
# directory editions 97A through 04B.

      1 import urllib
      2 import urllib.request
      3 import bs4
      4 from bs4 import BeautifulSoup as bs
      5 import re
      6 import os 
      7 
      8 # year = '97A'
      9 # ss="./data/%s/"%year
     10 '''
     11 适应网页爬取95B-96B
     12 
     13 '''
     14 
     15 
     16 '''
     17 解决网页请求失败
     18 resp = None
     19 while (resp == None):
     20     try:
     21         resp = urllib.request.urlopen("http://baidu.com 
     22 
     23 ")
     24     except:
     25         pass
     26 
     27 '''
     28 def b0_trmd(year,ss):
     29     if not os.path.exists(ss):
     30         os.makedirs(ss)
     31     # os.makedirs(ss)
     32     p1=r"^([A-Z]{6})"
     33 
     34     url = "http://www.stylusstudio.com/edifact/D%s/messages.htm"%year
     35     resp=None
     36     while(resp==None):
     37         try:
     38             resp = urllib.request.urlopen(url)
     39         except:
     40             pass
     41     data = resp.read().decode('cp852')
     42     soup = bs(data, 'html.parser')    
     43     segment11= soup.find_all('table')# ResultSet
     44     segment1=segment11[0].find_all('td')[1:]#表示第几个table,此时表示进去html网页中的第7个table,[1:],<class 'list'>
     45     # segment2= soup.find_all('table')
     46     # print(type(segment1))#
     47     f2=open(ss+'./trmd1%s.txt'%year,'a',encoding='utf-8')
     48     f3=open(ss+'./b0%s.txt'%year,'a',encoding='utf-8')
     49     f4=open(ss+'./trmd%s.txt'%year,'a',encoding='utf-8')
     50     pattern1=re.compile(p1)
     51     tag_list=[]
     52     for item in segment1:
     53             # print(item.string)#如果一个标签里面没有标签了,那么 .string 就会返回标签里面的内容。如果标签里面只有唯一的一个标签了,那么 .string 也会返回最里面的内容。
     54             str1=item.get_text()
     55             # if str1.strip()=="":用于判断字符串是否含空格
     56             #     break
     57             if item.string==None:
     58                 # print("hhusssssssssssssssssssss")
     59                 break
     60             matcher1=re.findall(pattern1,str1)
     61             if matcher1:
     62                 
     63                 f3.write(matcher1[0]+','+year+'
    ')
     64                 tag_list.append(matcher1[0])
     65                 f4.write(matcher1[0]+',')
     66             else:
     67                 f4.write(str1+'
    ')
     68 
     69 
     70             # print(type(str1))
     71             # test1(str1)
     72             # print(str1)#以文本方式呈现
     73 
     74             # print(item.get_text())#获取具体标签内部内容
     75             # print([text for text in item.stripped_strings] )#以列表方式呈现
     76 
     77             # str2=str([text for text in item.stripped_strings])
     78             # #print(type(str1[0][0]))
     79             f2.writelines(str1+'
    ')
     80     f2.close()
     81     return tag_list
     82 def test1(code_tag,year,ss):
     83 
     84     url = "http://www.stylusstudio.com/edifact/D%s/%s.htm"%(year,code_tag)
     85     resp=None
     86     while(resp==None):
     87         try:
     88             resp = urllib.request.urlopen(url)
     89         except:
     90             pass
     91     data = resp.read().decode('UTF-8')
     92     soup = bs(data, 'html.parser')    
     93     segment11= soup.find_all('table')
     94     segment1=segment11[6].find_all('tr')#表示第几个table,此时表示进去html网页中的第7个table
     95 
     96  
     97     f2=open(ss+'./text1%s%s.txt'%(year,code_tag),'a',encoding='cp852')
     98     for item in segment1:
     99 
    100             # #print(item)
    101             '''
    102             <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
    103             <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
    104             <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a> 
    105             Damage</td><td align="right"><span class="FrameDetailFont"> ×1 
    106             </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>
    107             '''
    108             str12=item.get_text()
    109             # #print(str12)#以文本方式呈现
    110             # #print(type(str12))
    111             '''
    112             │─│─├─DAM Damage ×1 (M)
    113             '''
    114             # #print(item.td.span.get_text())#获取具体标签内部内容
    115             # #print([text for text in item.stripped_strings] )#以列表方式呈现
    116             '''
    117             ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
    118             '''
    119             '''
    120             soup.get_text("|")#u'
    I linked to |example.com|
    '进一步,通过strip去除掉文本每个位的头尾空白。
    121 
    122             soup.get_text("|", strip=True)#u'I linked to|example.com'
    123             '''
    124             str1=str([text for text in item.stripped_strings])
    125             # #print(type(str1[0][0]))
    126             f2.writelines(str12+'
    ')
    127 
    128     f2.close()
    129 def test2(code_tag,year,ss):
    130     # p1=r"^(?:├─|└─)(.+)
    "
    131     p1=r"^W{2}(w.+)
    "#
    132     # p1=r"^W{2}(SegmentsGroupsw.+)
    "#segement为第一层
    133     # p2=r"^(?:│─├─|│─└─)(.+)
    "
    134     p2=r"^W{4}(w.+)
    "
    135     # p3=r"^(?:│───├─|│───└─|│─│─├─|│─│─└─)(.+)
    "
    136     p3=r"^W{6}(w.+)
    "
    137     # p4=r"^(?:)(.+)
    "
    138 
    139     p4=r"^W{8}(w.+)
    "
    140     p5=r"^W{10}(w.+)
    "
    141     p6=r"^W{12}(w.+)
    "
    142     p7=r"^W{14}(w.+)
    "
    143     p8=r"^W{16}(w.+)
    "
    144 
    145     p9=r"SegmentsGroups(?:([0-9]|[0-9][0-9]))"
    146     # p10="Segment Group "
    147 
    148 
    149 
    150     pattern1=re.compile(p1)
    151     pattern2=re.compile(p2)
    152     pattern3=re.compile(p3)
    153     pattern4=re.compile(p4)
    154 
    155     pattern5=re.compile(p5)
    156     pattern6=re.compile(p6)
    157     pattern7=re.compile(p7)
    158     pattern8=re.compile(p8)
    159     pattern9=re.compile(p9) 
    160     # pattern10=re.compile(p10)
    161 
    162     f1=open(ss+'./text1%s%s.txt'%(year,code_tag),'r',encoding='cp852')
    163     f2=open(ss+'./text2%s%s.txt'%(year,code_tag),'a',encoding='utf-8')    
    164     # c=int()
    165     # d=int()
    166     listp=[0,0,0,0,0,0,0,0]#用于记录父节点
    167     for line in f1.readlines():
    168 
    169         matcher1=re.findall(pattern1,line)
    170         matcher2=re.findall(pattern2,line)
    171         matcher3=re.findall(pattern3,line)
    172         matcher4=re.findall(pattern4,line)
    173 
    174         matcher5=re.findall(pattern5,line)
    175         matcher6=re.findall(pattern6,line)
    176         matcher7=re.findall(pattern7,line)
    177         matcher8=re.findall(pattern8,line)
    178         matcher9=re.findall(pattern9,line)
    179         # #print(type(matcher1))
    180 
    181         if matcher1:
    182 
    183             a='SG'+str(listp[0])+' '+matcher1[0]+'
    '
    184             f2.write(a)
    185             if matcher9:
    186                 listp[1]=matcher9[0]
    187         if matcher2:
    188 
    189             b='SG'+str(listp[1])+' '+matcher2[0]+'
    '
    190             f2.write(b)
    191             if matcher9:
    192                 listp[2]=matcher9[0]
    193         if matcher3:
    194 
    195             c='SG'+str(listp[2])+' '+matcher3[0]+'
    '
    196             f2.write(c)
    197             #print(c)
    198             if matcher9:
    199                 listp[3]=matcher9[0]
    200         if matcher4:
    201             d='SG'+str(listp[3])+' '+matcher4[0]+'
    '
    202             f2.write(d)
    203             #print(d)
    204             if matcher9:
    205                 listp[4]=matcher9[0]
    206         if matcher5:
    207             e='SG'+str(listp[4])+' '+matcher5[0]+'
    '
    208             f2.write(e)
    209             #print(d)
    210             if matcher9:
    211                 listp[5]=matcher9[0]
    212         if matcher6:
    213             f='SG'+str(listp[5])+' '+matcher6[0]+'
    '
    214             f2.write(f)
    215             #print(d)
    216             if matcher9:
    217                 listp[6]=matcher9[0]
    218         if matcher7:
    219             g='SG'+str(listp[6])+' '+matcher7[0]+'
    '
    220             f2.write(g)
    221             #print(d)
    222             if matcher9:
    223                 listp[7]=matcher9[0]
    224         if matcher8:
    225             h='SG'+str(listp[7])+' '+matcher8[0]+'
    '
    226             f2.write(h)
    227             #print(d)
    228             if matcher9:
    229                 listp[8]=matcher9[0]
    230     f2.close()
    231     f1.close()
    232     f3=open(ss+'./text3%s%s.txt'%(year,code_tag),'w',encoding='utf-8')
    233     f4=open(ss+'./text2%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
    234     for line1 in f4.readlines():
    235         #print(line1)
    236         # f3.write(line1.replace(" "," "))
    237         f3.write(line1.replace("Segment Group ","SG"))
    238     f4.close()
    239     f3.close()
    240 def test3(code_tag,year,ss):
    241     f5=open(ss+'./text4%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
    242     f6=open(ss+'./text3%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
    243     p10=r"(^w{3})s(w{3}).+×([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})s((w))$"
    244     pattern10=re.compile(p10)
    245     i=0
    246     for line2 in f6.readlines():
    247         i=i+1
    248         matcher10=re.findall(pattern10,line2)
    249         # print(matcher10)
    250         # print(type(matcher10))
    251         if matcher10:
    252             f5.write(str(matcher10[0])+'
    ')
    253 
    254     f5.close()
    255     f6.close()
    256     # print(i)
    257     return i
    258 def test4(code_tag,year,ss):
    259     url = "http://www.stylusstudio.com/edifact/D%s/%s.htm"%(year,code_tag)
    260     resp=None
    261     while(resp==None):
    262         try:
    263             resp = urllib.request.urlopen(url)
    264         except:
    265             pass
    266     data = resp.read().decode('UTF-8')
    267     soup = bs(data, 'html.parser')    
    268     segment11= soup.find_all('p')
    269     # segment1=segment11[1].find_all('p')#表示第几个table,此时表示进去html网页中的第7个table
    270     # #print(segment1)    
    271     f2=open(ss+'./text5%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
    272     for item in segment11:
    273         str12=item.get_text()
    274         #print(str12)#以文本方式呈现
    275         #print(type(str12))
    276         '''
    277         │─│─├─DAM Damage ×1 (M)
    278         '''
    279         # #print(item.td.span.get_text())#获取具体标签内部内容
    280         #print([text for text in item.stripped_strings] )#以列表方式呈现
    281         '''
    282         ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
    283         '''
    284         '''
    285         soup.get_text("|")#u'
    I linked to |example.com|
    '进一步,通过strip去除掉文本每个位的头尾空白。
    286 
    287         soup.get_text("|", strip=True)#u'I linked to|example.com'
    288         '''
    289         str1=str([text for text in item.stripped_strings])
    290         #print(type(str1[0][0]))
    291         f2.writelines(str12+'
    ')
    292 
    293     f2.close()
    294  
    295     # f2=open('./text1.txt','a',encoding='cp852')
    296     # for item in segment1:    
    297 def test5(code_tag,num,year,ss):
    298     f7=open(ss+'./text6%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
    299     f8=open(ss+'./text5%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
    300     p1=r"(^Asservicessegment.+
    )"
    301     # p2=r"((?:Asw|^Date|^This|^Document|^Ins|^Requirementss|^Dimensions|^The|^Ifs|^Through|^Instructions|^For|^An).+
    )"
    302     p2=r"(^(?!Information.+:|Note|Itsissrecommendedsthatswhere|IDsshouldsbesspecified|Allsotherssegments|Asgroupsofssegmentssthatscontainssaslinesitemsandsitssrelatedsinformation.+shouldsbesconsigned.).+
    )"
    303     pattern1=re.compile(p1)
    304     pattern2=re.compile(p2)
    305     # pattern3=re.compile(p3)
    306     # pattern4=re.compile(p4)
    307     flag=0
    308     i=num
    309     for line3 in f8.readlines():
    310         matcher1=re.findall(pattern1,line3)
    311         matcher2=re.findall(pattern2,line3)
    312         # matcher3=re.findall(pattern3,line3)
    313         # matcher4=re.findall(pattern4,line3)
    314 
    315         # #print(matcher10)
    316         if matcher1 and flag==0:
    317             f7.write(matcher1[0])
    318             flag=1
    319             i=i-1
    320             if i==0:
    321                 break
    322             continue
    323         if (matcher2 and (flag==1 or flag==2)):
    324             f7.write(matcher2[0])
    325             flag=2
    326             i=i-1
    327             continue
    328     f7.close()
    329     f8.close()
    330 
    331 def join(code_tag,year,ss):
    332 
    333 
    334     f1 =open(ss+'text6%s%s.txt'%(year,code_tag),'r',encoding='utf-8') 
    335     f2= open(ss+'text4%s%s.txt'%(year,code_tag),'r',encoding='utf-8')
    336 
    337 
    338     list_note=[]
    339     for line1 in f1:
    340         list_note.append(line1)
    341     f1.close()
    342     p11=r"^W{2}(w{3}).+
    "
    343     p12=r"^W{2}w{3}W{2}sW(w{3}).+
    "
    344     p13=r"^W{2}w{3}W{2}sWw{3}W{2}sW([0-9]|[0-9]{2}|[0-9]{3}|[0-9]{4}|[0-9]{5})W.+
    "
    345     p14=r"W{2}w{3}W{2}sWw{3}W{2}sW.+(C|M)"
    346     # print(list_note)
    347     f2_w= open(ss+'b1%s%s.txt'%(year,code_tag),'a',encoding='utf-8')
    348     f3_w= open(ss+'b1%s.csv'%year,'a',encoding='utf-8')  
    349     # for i in range(len(list_note)):
    350     i=0
    351     pattern11=re.compile(p11)
    352     pattern12=re.compile(p12)    
    353     pattern13=re.compile(p13)
    354     pattern14=re.compile(p14)        
    355     # f2_r = open(ss+'/new/%s_w.txt'%list_tag[i])
    356     pos=[
    357     
    358     '0010','0020','0030','0040','0050','0060','0070','0080','0090','0100','0110','0120','0130','0140','0150','0160','0170','0180','0190','0200',
    359     '0210','0220','0230','0240','0250','0260','0270','0280','0290','0300','0310','0320','0330','0340','0350','0360','0370','0380','0390','0400',
    360     '0410','0420','0430','0440','0450','0460','0470','0480','0490','0500','0510','0520','0530','0540','0550','0560','0570','0580','0590','0600',
    361     '0610','0620','0630','0640','0650','0660','0670','0680','0690','0700','0710','0720','0730','0740','0750','0760','0770','0780','0790','0800',
    362     '0810','0820','0830','0840','0850','0860','0870','0880','0890','0900','0910','0920','0930','0940','0950','0960','0970','0980','0990','1000',
    363     '1010','1020','1030','1040','1050','1060','1070','1080','1090','1100','1110','1120','1130','1140','1150','1160','1170','1180','1190','1200',
    364     '1210','1220','1230','1240','1250','1260','1270','1280','1290','1300','1310','1320','1330','1340','1350','1360','1370','1380','1390','1400',
    365     '1410','1420','1430','1440','1450','1460','1470','1480','1490','1500','1510','1520','1530','1540','1550','1560','1570','1580','1590','1600',
    366     '1610','1620','1630','1640','1650','1660','1670','1680','1690','1700','1710','1720','1730','1740','1750','1760','1770','1780','1790','1800',
    367     '1810','1820','1830','1840','1850','1860','1870','1880','1890','1900','1910','1920','1930','1940','1950','1960','1970','1980','1990','2000',
    368     '2010','2020','2030','2040','2050','2060','2070','2080','2090','2100','2110','2120','2130','2140','2150','2160','2170','2180','2190','2200',
    369     '2210','2220','2230','2240','2250','2260','2270','2280','2290','2300','2310','2320','2330','2340','2350','2360','2370','2380','2390','2400',
    370     '2410','2420','2430','2440','2450','2460','2470','2480','2490','2500','2510','2520','2530','2540','2550','2560','2570','2580','2590','2600',
    371     '2610','2620','2630','2640','2650','2660','2670','2680','2690','2700','2710','2720','2730','2740','2750','2760','2770','2780','2790','2800',
    372     '2810','2820','2830','2840','2850','2860','2870','2880','2890','2900','2910','2920','2930','2940','2950','2960','2970','2980','2990','3000',
    373     '3010','3020','3030','3040','3050','3060','3070','3080','3090','3100','3110','3120','3130','3140','3150','3160','3170','3180','3190','3200',
    374     '3210','3220','3230','3240','3250','3260','3270','3280','3290','3300','3310','3320','3330','3340','3350','3360','3370','3380','3390','3400',
    375     '3410','3420','3430','3440','3450','3460','3470','3480','3490','3500','3510','3520','3530','3540','3550','3560','3570','3580','3590','3600',
    376     '3610','3620','3630','3640','3650','3660','3670','3680','3690','3700','3710','3720','3730','3740','3750','3760','3770','3780','3790','3800',
    377     '3810','3820','3830','3840','3850'
    378 
    379     ]
    380     for line2 in f2:
    381         matcher11=re.findall(pattern11,line2)
    382         matcher12=re.findall(pattern12,line2)
    383         matcher13=re.findall(pattern13,line2)
    384         matcher14=re.findall(pattern14,line2)
    385         # print(matcher11[0])
    386         # print(matcher12[0])
    387         # print(matcher13[0])
    388         # print(matcher14[0])
    389         # print(matcher11[0])
    390         # a=list(line2)
    391         # print(a)
    392         # b=str(a)
    393         # print(b)
    394         # print(line2.split(','))
    395         try:
    396             str11="%s,%s,%s,%s,%s,%s,%s,"%s"
    "%(pos[i],code_tag,matcher12[0],matcher11[0],year,matcher14[0],matcher13[0],list_note[i].strip('
    '))
    397         
    398             i=i+1
    399             # print(i)
    400             # print(str11)
    401             f2_w.write(str11)
    402             f3_w.write(str11)
    403         except:
    404             print("---error---")
    405             break
    406 
    407     f2_w.close() 
    408     f2.close()
    409 
    410 def test():#用户爬取网页,保存到本地
    411     filename='./codeco.txt'
    412     url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
    413     resp = urllib.request.urlopen(url)
    414     data = resp.read().decode('UTF-8')
    415     # f1=open(filename,'w')
    416     # f1.write(data)
    417     # #print(type(data))
    418     # #print(data)
    419     f2=open('./text.txt','a')
    420     soup = bs(data, 'html.parser')    
    421 
    422     # sw=soup.find_all('table',border=0,width="100%")
    423     # #print(sw[0])
    424     segment1= soup.find_all('h4')
    425 
    426     segment2= soup.find_all('p')
    427     # #print(type(segment))
    428     #print(segment1)
    429     #print(segment2)
    430     nowplaying_list = [] 
    431     for item in segment1:
    432             #print(item)
    433             # #print(item.name)
    434             # #print(item.attrs)
    435             # #print(type(item))
    436             #print(item.get_text())
    437             #print([text for text in item.stripped_strings] )
    438             f2.writelines(str([text for text in item.stripped_strings])+'
    ')    
    439             # nowplaying_dict = {}        
    440             # nowplaying_dict['id'] = item['a']       
    441             # for tag_img_item in item.find_all('img'):            
    442             #     nowplaying_dict['name'] = tag_img_item['alt']            
    443             #     nowplaying_list.append(nowplaying_dict)
    444     # result= segment[0].find_all('h4')
    445     # #print(result)
    446 
    447     for item in segment2:
    448 
    449             #print(item)
    450             #print(item.get_text())
    451             f2.writelines(str([text for text in item.stripped_strings] )+'
    ')  
    452     f2.close()
    453     # data={}
    454     # data['word']='Jecvay Notes'
    455      
    456     # url_values=urllib.parse.urlencode(data)
    457     # url="http://www.baidu.com/s?"
    458     # full_url=url+url_values
    459      
    460     # data=urllib.request.urlopen(full_url).read()
    461     # data=data.decode('UTF-8')
    462     # #print(data)
    463 if __name__=='__main__':
    464     # '97A','97B','98A','98B','99A','99B'
    465     year1=['00A','00B','01A','01B','01C','02A','02B','03A','03B','04A','04B']
    466     for j in range(len(year1)):
    467 
    468         year=year1[j]
    469         ss="./data/%s/"%year
    470         tag=b0_trmd(year,ss)
    471         print(tag)
    472         for i in range(len(tag)):
    473             test1(tag[i],year,ss)
    474             test2(tag[i],year,ss)
    475             num=test3(tag[i],year,ss)
    476             test4(tag[i],year,ss)
    477             test5(tag[i],num,year,ss)
    478             join(tag[i],year,ss)
    479             print("------%s-----ok"%i)
    480     # str1='APERAK'
    481     # join(str1)
# Source: https://www.cnblogs.com/smuxiaolei/p/7427663.html
# Related reading (links from the original blog page):
#   - Dell server BIOS interface
#   - Windows Server common features (part 1)
#   - Windows Server 2016 / Windows 10: domain admin account permission issues
#   - (repost) Java concurrency: CopyOnWriteArrayList
#   - (repost) ThreadLocal
#   - (repost) Java volatile keyword
#   - (repost) Java fail-fast mechanism
#   - (repost) Java object locks, class locks, private locks
#   - (repost) Java synchronized keyword
#   - JavaScript arrays