Code:
# -*- coding:utf-8 -*-
from lxml import etree
import re
import requests  # import the requests package
import 爬取法律法规.SQL as SQL  # local database helper module ("crawl laws and regulations")

def Get_urls(start, end):
    hrefs_link = []
    hrefs_title = []
    for i in range(start, end):
        url = 'https://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA/page/' + str(i) + '.shtml'
        print(url)
        strhtml = requests.get(url, timeout=7)
        tree = etree.HTML(strhtml.text)
        # collect the href and link text of every law listed on this page
        hreff = tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a//@href')
        for hh in hreff:
            hrefs_link.append(hh)
        hreff_text = tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a//text()')
        for hh in hreff_text:
            hrefs_title.append(hh)
    return hrefs_title, hrefs_link
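
# Example (hypothetical page range): titles, links = Get_urls(1, 3) fetches
# listing pages 1 and 2 and returns two parallel lists, so titles[0] is the
# link text belonging to links[0].
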
if __name__ == "__main__":
    hrefs_title, hrefs_link = Get_urls(500, 534)

    for num_i in range(len(hrefs_link)):
        print(num_i)
        try:
            href_url = 'https://www.chinacourt.org' + hrefs_link[num_i]
            href_title = hrefs_title[num_i]
            # skip repealed laws ("失效") and presidential orders ("主席令")
            if ("失效" in href_title) or ("主席令" in href_title):
                continue

            print(href_url)
            print(href_title)

            # if the law is already stored, delete it so the fresh crawl replaces it
            values = SQL.select_db(href_title)
            if len(values) > 0:
                SQL.delete_db(href_title)

            strhtml = requests.get(href_url, timeout=(7, 7))  # GET the detail page
            tree = etree.HTML(strhtml.text)
            text = tree.xpath('//div[@class="content_text"]//text()')
            text[0] = re.sub(r'\xa0', '', text[0])

            name_zhang = ""   # current chapter (章) heading
            name_tiao = ""    # current article (条) number
            name_info = ""
            info_value = []   # fragments of the article currently being assembled
            whole_value = []  # one joined string per finished article
            for val in text:
                # strip non-breaking spaces, full-width spaces, BOMs and other junk
                val = re.sub(r'([\xa0\n\xae\s\u3000\ue004\ue003\ufeff\ufffd])', '', val)
                check_zhang = re.findall(r"(^第[一二三四五六七八九十百千]+章)", val)
                check_tiao = re.findall(r"(^第[一二三四五六七八九十百千]+条)", val)
                check_jie = re.findall(r"(^第[一二三四五六七八九十百千]+节)", val)
                if len(check_jie) > 0:  # section (节) headings are skipped
                    continue
                if len(check_zhang) > 0:  # chapter (章) heading
                    lsis = val.split("章")
                    name_zhang = lsis[0] + "章"
                elif len(check_tiao) > 0:  # article (条) heading
                    # a new article begins, so flush the previous one
                    if len(info_value) > 0:
                        whole_value.append(''.join(info_value))
                        info_value = []
                    lsis = val.split("条")
                    name_tiao = lsis[0]
                    name_info = name_zhang + "_" + name_tiao + "条" + ":"
                    value = name_info + ''.join(lsis[1:])
                    info_value.append(value)
                elif len(info_value) > 0:  # continuation text inside an article
                    lsis = val.split(" ")
                    lsi = [i for i in lsis if i != '']
                    if len(lsi) == 1:
                        lsis = val.split("\u3000")
                        lsi = [i for i in lsis if i != '']
                    info_value.append(''.join(lsi))
            if len(info_value) > 0:  # flush the last article of the document
                whole_value.append(''.join(info_value))

            for value in whole_value:
                # split each joined article back into chapter, article number and body
                name_zhang = re.findall("(^第.+?章)_", value)
                name_tiao = re.findall("_(第.+?条):", value)
                name_info = re.findall(":(.+?)$", value)
                if (len(name_tiao) == 0) or (len(name_info) == 0):
                    continue
                if len(name_zhang) == 0:
                    name_zhang = [""]
                SQL.insert(href_title, name_zhang[0], name_tiao[0], name_info[0])
        except Exception as r:
            print('unknown error: %s' % r)
            continue
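
The 爬取法律法规.SQL module is not shown above. Below is a minimal sketch of the interface the script assumes, backed by sqlite3; the database file, table name, and column names are illustrative assumptions, and only the three functions the script actually calls (select_db, delete_db, insert) come from the code:

# sketch.py -- hypothetical stand-in for 爬取法律法规.SQL
import sqlite3

DB_PATH = 'laws.db'  # assumed database file

def _conn():
    # open a connection and make sure the (assumed) table exists;
    # connections are left to the GC here for brevity
    conn = sqlite3.connect(DB_PATH)
    conn.execute('CREATE TABLE IF NOT EXISTS laws ('
                 'title TEXT, chapter TEXT, article TEXT, content TEXT)')
    return conn

def select_db(title):
    # return every row already stored for this law title
    with _conn() as conn:
        return conn.execute('SELECT * FROM laws WHERE title = ?',
                            (title,)).fetchall()

def delete_db(title):
    # remove stale rows so a re-crawl can replace them
    with _conn() as conn:
        conn.execute('DELETE FROM laws WHERE title = ?', (title,))

def insert(title, chapter, article, content):
    # store one article of one law
    with _conn() as conn:
        conn.execute('INSERT INTO laws VALUES (?, ?, ?, ?)',
                     (title, chapter, article, content))

With an interface like this, re-running the crawler over an overlapping page range simply overwrites the rows of any law it has already seen.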