  • Scraping the latest administrative division codes from the Ministry of Civil Affairs website with Python: requests, BeautifulSoup, lxml

    Written with Python 3.6.

    The Ministry of Civil Affairs site can use different page structures for different years' data. This caused a lot of stumbling, and it is also why the code kept growing longer.

    If this code stops working at some point, check carefully whether the page structure has changed again. A quick way to do that is to probe one detail page before running the whole scraper, as in the sketch below.
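
    The following is a minimal probe sketch, reusing one of the detail-page URLs and the selectors from the script below; the layout checks mirror what f3() does, and if both checks come up empty the site has likely changed again:

    import requests
    from bs4 import BeautifulSoup

    # one of the yearly detail pages used later in the script
    url = 'http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'
    soup = BeautifulSoup(requests.get(url, timeout=30).text, 'lxml')

    # the stub page either redirects via an inline script or links to the data page
    has_redirect = any('window.location.href' in s.get_text() for s in soup.select('script'))
    anchors = soup.select('div.artext > div > p > a') or soup.select('div#zoom > a')
    print('redirect script:', has_redirect, '| data-page links:', len(anchors))
    # if both are missing, the page structure has changed and f3() below needs updating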

    # -*- coding: utf-8 -*-
    """
    Created on Wed Jul 10 14:40:41 2019

    @author: Administrator
    """

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    import time

    url1 = 'http://www.mca.gov.cn/article/sj/xzqh//1980/'
    headers = {'content-type': 'application/json',
               'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

    # 1. Collect all the links =========================================================
    def f1(url1):
        'All links for the 1980-2018 PRC administrative division codes'
        # send the request with the url and header parameters
        response = requests.get(url1, headers=headers, timeout=200, verify=False)
        soup = BeautifulSoup(response.text, 'lxml')  # parse the page source into a BeautifulSoup object
        _tmp1 = soup.select('td.arlisttd')
        end_1 = []
        for i in _tmp1:
            _a = i.select('a')[0].get('href')
            _b = i.select('a')[0].get('title')[:4]  # first 4 chars of the title, i.e. the year
            end_1.append(['http://www.mca.gov.cn' + _a, _b])
        return end_1

    # the listing is paginated; '', '?2' and '?3' are its three pages
    end_2 = []
    for i in ['', '?2', '?3']:
        end_2 = end_2 + f1(url1 + i)


    def f2(url1='http://www.mca.gov.cn/article/sj/xzqh/2019/'):
        'Links for the 2019 PRC administrative division codes'
        response = requests.get(url1, headers=headers, timeout=200, verify=False)
        soup = BeautifulSoup(response.text, 'lxml')
        _tmp1 = soup.select('td.arlisttd')
        end_1 = []
        for i in _tmp1:
            _a = i.select('a')[0].get('href')
            _b = i.select('a')[0].get('title')[:7]  # first 7 chars, e.g. '2019年5月'
            end_1.append(['http://www.mca.gov.cn' + _a, _b])
        return end_1

    end_2 = end_2 + f2()
    # 2. Scrape the data ===============================================================
    def f3(url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201903/20190300014989.shtml'):
        'Fetch one yearly page and return its [code, name] rows'
        #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854922.shtml'
        #url1='http://www.mca.gov.cn/article/sj/xzqh/1980/201507/20150715854918.shtml'
        response = requests.get(url1, headers=headers, timeout=200, verify=False)
        soup = BeautifulSoup(response.text, 'lxml')
        # some stub pages redirect to the real data page via an inline script;
        # note: the fifth <script> tag is assumed to hold the redirect
        _txt = soup.select('script')[4].get_text().strip().replace('window.location.href="', '').strip('";')
        if _txt[-4:] == 'html':
            print('script!')
            url2 = _txt
        else:
            # otherwise the data page is linked from the article body
            _tmp1 = soup.select('div.artext > div > p > a')
            if len(_tmp1) == 0:
                _tmp1 = soup.select('div#zoom > a')
            url2 = _tmp1[0].get('href')
        print(url2)
        time.sleep(0.5)
        response = requests.get(url2, headers=headers, timeout=200, verify=False)
        soup = BeautifulSoup(response.text, 'lxml')  # parse the page source into a BeautifulSoup object
        # data pages come in two layouts: rows with height="19" or height="20"
        _tmp1 = soup.select('table > tr[height="19"]')
        end_1 = []
        if len(_tmp1) > 5:
            for i in _tmp1:
                _a = i.select('td')[1].get_text().strip()
                if len(_a) > 15:  # on some pages the last row is a remark; skip it
                    continue
                else:
                    _b = i.select('td')[2].get_text().strip()
                    end_1.append([_a, _b])
        else:
            _tmp1 = soup.select('table > tr[height="20"]')
            for i in _tmp1:
                _a = i.select('td')[0].get_text().strip()
                if len(_a) > 15 or _a == '行政区划代码':  # skip remark rows and the header row
                    continue
                else:
                    _b = i.select('td')[1].get_text().strip()
                    end_1.append([_a, _b])

        return end_1

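    # f3() returns rows like [['110000', '北京市'], ['120000', '天津市'], ...]
    # (division code first, region name second; the values shown are illustrative)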
    # loop over every link and scrape its data
    end_3 = []
    for j in range(len(end_2)):
        item = end_2[j]
        if '19' in item[1] or '20' in item[1]:  # keep only entries whose title looks like a year (19xx/20xx)
            print(j, item[0], item[1])
            tmp2 = f3(item[0])
            print('.')
            end_3.extend([[item[1]] + i for i in tmp2])  # prepend the year to every [code, name] row
            time.sleep(0.1)

    df_result = pd.DataFrame(end_3)
    df_result.to_excel('地区编码.xlsx', index=False)


    '''
    devtools selector notes:
    #3 2019年5月份县以上行政区划代码_3852 > table > tbody > tr:nth-child(4)
    #list_content > div.list_right > div > ul > table > tbody > tr:nth-child(1) > td.arlisttd > a
    '''
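
    One practical hardening step: the script makes dozens of plain-HTTP requests with verify=False, which risks transient failures and floods the console with InsecureRequestWarning. A small retry wrapper is one way to address this. The sketch below is not part of the original script; the fetch() helper and its retry counts are my own additions, and it reuses the headers dict defined above:

    import time
    import requests
    import urllib3

    # silence the InsecureRequestWarning triggered by verify=False
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def fetch(url, retries=3, pause=1.0):
        'GET a page with simple retries; returns the page text or raises.'
        for attempt in range(retries):
            try:
                # reuses the headers dict defined in the script above
                resp = requests.get(url, headers=headers, timeout=30, verify=False)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException as err:
                print('retry', attempt + 1, 'for', url, '->', err)
                time.sleep(pause)
        raise RuntimeError('failed to fetch ' + url)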
  • Original article: https://www.cnblogs.com/andylhc/p/11490563.html