zoukankan html css js c++ java

爬取法律法规代码（可直接使用）

代码：

 1 # # -*- coding:utf-8 -*-
 2 from lxml import etree
 3 import re
 4 import requests        #导入requests包
 5 import 爬取法律法规.SQL as SQL
 6 
 7 def Get_urls(start,end):
 8     hrefs_link = []
 9     hrefs_title = []
10     for i in range(start,end):
11         #i=1
12         url = 'https://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA/page/' + str(i) + '.shtml'
13         print(url)
14         strhtml = requests.get(url, timeout=7)
15         tree = etree.HTML(strhtml.text)
16         hreff=tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a//@href')
17         for hh in hreff:
18             hrefs_link.append(hh)
19         hreff_text = tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a//text()')
20         for hh in hreff_text:
21             hrefs_title.append(hh)
22     return hrefs_title,hrefs_link
# Characters stripped from every text node before parsing: non-breaking
# space, newline, tab, U+00AE, any whitespace (this includes the ideographic
# space U+3000), two private-use glyphs, the BOM and the replacement char.
_NOISE_RE = re.compile(r'[\xa0\n\t\xae\s\u3000\ue004\ue003\ufeff\ufffd]')
# Headings written with Chinese numerals: chapter (章), article (条), section (节).
_CHAPTER_RE = re.compile(r"^第[一二三四五六七八九十百千]+章")
_ARTICLE_RE = re.compile(r"^第[一二三四五六七八九十百千]+条")
_SECTION_RE = re.compile(r"^第[一二三四五六七八九十百千]+节")


def parse_articles(text_nodes):
    """Split the text nodes of a law page into per-article records.

    Each node is cleaned with ``_NOISE_RE`` and then classified by its
    leading heading:

    * section headings (``第…节``) are ignored;
    * chapter headings (``第…章``) set the chapter for following articles;
    * article headings (``第…条``) start a new article, the rest of the line
      being the beginning of its body;
    * any other line is appended to the body of the article in progress
      (lines before the first article are dropped).

    Args:
        text_nodes: Iterable of strings, in document order.

    Returns:
        A list of ``(chapter, article, body)`` string tuples in document
        order.  ``chapter`` is ``""`` for articles that precede any chapter
        heading.  Articles whose body is empty are omitted.
    """
    articles = []
    chapter = ""
    current = None   # (chapter, article heading) of the article in progress
    fragments = []   # body pieces of the article in progress

    def flush():
        # Store the article in progress, if there is one with a non-empty body.
        body = "".join(fragments)
        if current is not None and body:
            articles.append((current[0], current[1], body))

    for node in text_nodes:
        line = _NOISE_RE.sub("", node)
        if _SECTION_RE.match(line):
            continue
        chapter_match = _CHAPTER_RE.match(line)
        if chapter_match:
            chapter = chapter_match.group(0)
            continue
        article_match = _ARTICLE_RE.match(line)
        if article_match:
            flush()
            current = (chapter, article_match.group(0))
            # Slice after the heading instead of splitting on every "条",
            # so cross-references such as "第二条" inside the body survive.
            fragments = [line[article_match.end():]]
        elif current is not None:
            # Continuation line.  All whitespace is already stripped, so the
            # cleaned line is appended as is.
            fragments.append(line)
    # The final article has no following heading to trigger a flush.
    flush()
    return articles


if __name__ =="__main__":
    hrefs_title,hrefs_link=Get_urls(500,534)

    for num_i, (href_title, href_path) in enumerate(zip(hrefs_title, hrefs_link)):
        print(num_i)
        # Skip laws marked as no longer in force ("失效") and presidential
        # orders ("主席令").
        if ("失效" in href_title) or ("主席令" in href_title):
            continue

        href_url = 'https://www.chinacourt.org' + href_path
        print(href_url)
        print(href_title)

        # Best effort per law: one failing page must not stop the crawl.
        try:
            strhtml = requests.get(href_url, timeout=(7,7))  # (connect, read) timeouts
            tree = etree.HTML(strhtml.text)
            text = tree.xpath('//div[@class="content_text"]//text()')
            articles = parse_articles(text)
            if not articles:
                print('未解析到条文,跳过 %s' % (href_title))
                continue

            # Replace previously stored rows only after the page was fetched
            # and parsed, so a network or parse failure keeps the old data.
            # NOTE(review): assumes SQL.select_db returns the rows stored
            # under this title -- confirm against the SQL module.
            if len(SQL.select_db(href_title)) > 0:
                SQL.delete_db(href_title)

            for chapter, article, body in articles:
                SQL.insert(href_title, chapter, article, body)
        except Exception as r:
            print('未知错误 %s' %(r))
            continue

查看全文

相关阅读:
基础
 条件语句/变量和基本数据类型
 编程语言介绍
 asp.net中log4net使用方法
 web布到服务器上出错
 《转》IEnumerable、IEnumerator两个接口的认识
 异步ADO.NET
Session的使用
 AJAX参数及各种HTTP状态值
 简易的抓取别人网站内容

原文地址：https://www.cnblogs.com/smartisn/p/14426584.html