zoukankan      html  css  js  c++  java
  • 爬取法律法规代码(可直接使用)

    代码:

     1 # # -*- coding:utf-8 -*-
     2 from lxml import etree
     3 import re
     4 import requests        #导入requests包
     5 import 爬取法律法规.SQL as SQL
     6 
     7 def Get_urls(start,end):
     8     hrefs_link = []
     9     hrefs_title = []
    10     for i in range(start,end):
    11         #i=1
    12         url = 'https://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA/page/' + str(i) + '.shtml'
    13         print(url)
    14         strhtml = requests.get(url, timeout=7)
    15         tree = etree.HTML(strhtml.text)
    16         hreff=tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a//@href')
    17         for hh in hreff:
    18             hrefs_link.append(hh)
    19         hreff_text = tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a//text()')
    20         for hh in hreff_text:
    21             hrefs_title.append(hh)
    22     return hrefs_title,hrefs_link
    23 if __name__ =="__main__":
    24     hrefs_title,hrefs_link=Get_urls(500,534)
    25 
    26     for num_i in range(len(hrefs_link)):
    27         #num_i=15
    28         print(num_i)
    29         try:
    30             href_url='https://www.chinacourt.org'+hrefs_link[num_i]
    31             href_title=hrefs_title[num_i]
    32             if ("失效" in href_title) or ("主席令" in href_title):
    33                 continue
    34 
    35             print(href_url)
    36             print(href_title)
    37 
    38             values = SQL.select_db(href_title)
    39             if (len(values) > 0):
    40                 SQL.delete_db(href_title)
    41 
    42             #/html/body/div[2]/div/div[2]/div
    43             strhtml = requests.get(href_url, timeout=(7,7))  # Get方式获取网页数据
    44             tree = etree.HTML(strhtml.text)
    45             text=tree.xpath('//div[@class="content_text"]//text()')
    46             text[0]=re.sub(r'xa0','',text[0])
    47 
    48             name_zhang=""
    49             name_tiao=""
    50             name_info=""
    51             info_value=[]
    52             whole_value=[]
    53             Value=[]
    54             for val in text:
    55                 val=re.sub(r'([xa0
    	xaesu3000ue004ue003ufeffufffd])','',val)
    56                 Value.append(val)
    57             #    print(Value)
    58                 Value=[]
    59                 check_zhang = re.findall(r"(^第[一二三四五六七八九十百千]+章)", val)
    60                 check_tiao = re.findall(r"(^第[一二三四五六七八九十百千]+条)", val)
    61                 check_jie = re.findall(r"(^第[一二三四五六七八九十百千]+节)", val)
    62                 if (len(check_jie) > 0):  #
    63                     continue
    64                 if (len(check_zhang)>0):#
    65                     lsis=val.split("")
    66                     name_zhang=lsis[0]+""
    67                 elif(len(check_tiao)>0):#
    68                     if(len(info_value)>0):
    69                         whole_value.append(''.join(info_value))
    70                     info_value = []
    71                     lsis=val.split("")
    72                     name_tiao=lsis[0]
    73                     name_info=name_zhang+"_"+name_tiao+""+":"
    74                     value=name_info+''.join(lsis[1:])
    75                     info_value.append(value)
    76                 elif(len(info_value)>0):#条中解释
    77                     lsis = val.split((" "))
    78                     lsi = [i for i in lsis if i != '']
    79                     if (len(lsi) == 1):
    80                         lsis = val.split("u3000")
    81                         lsi = [i for i in lsis if i != '']
    82                     info_value.append(''.join(lsi))
    83 
    84             for value in whole_value:
    85                 print("KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK")
    86                 name_zhang=re.findall("(^第.+?章)_",value)
    87                 name_tiao=re.findall("_(第.+?条):",value)
    88                 name_info=re.findall(":(.+?)$",value)
    89                 if  (len(name_tiao)==0) or (len(name_info)==0):
    90                     continue
    91                 if (len(name_zhang)==0):
    92                     name_zhang=[""]
    93                 SQL.insert(href_title,name_zhang[0],name_tiao[0],name_info[0])
    94         except Exception as r:
    95             print('未知错误 %s' %(r))
    96             continue
  • 相关阅读:
    大道至简阅读笔记08-完结
    (转)MySQL的JDBC驱动源码解析
    Win7如何开启Telnet服务
    idea 设置jetty进程jvm参数
    (转)如何在maven环境中设置JVM参数
    (转)IntelliJ Idea 常用快捷键列表 for win
    cglib Demo
    MAVEN 配置阿里巴巴镜像
    MyBatis入门基础(一)
    JAVA常见面试题之Forward和Redirect的区别
  • 原文地址:https://www.cnblogs.com/smartisn/p/14426584.html
Copyright © 2011-2022 走看看