zoukankan      html  css  js  c++  java
  • 爬取国家统计局2020年行政区划分数据

    参考:https://blog.csdn.net/qlx119/article/details/105289974

    在MySQL中创建tab_citys数据表:

    DROP TABLE IF EXISTS `tab_citys`;
    CREATE TABLE `tab_citys` (
      `id` int(11) NOT NULL AUTO_INCREMENT,
      `parent_id` int(11) DEFAULT NULL,
      `city_name_zh` varchar(20) NOT NULL,
      `city_name_en` varchar(20) DEFAULT NULL,
      `city_level` int(11) NOT NULL,
      `city_code` char(12) NOT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;

    创建xzqh.py的python脚本:

      1 #!/usr/bin/python
      2 # -*- coding: UTF-8 -*-
      3 #   功能:  获取省市县数据
      4 #   版本:v1.1
      5 import importlib
      6 import sys
      7 import pymysql
      8 importlib.reload(sys)
      9 import requests
     10 import lxml.etree as etree
     11 import os
     12 class chinese_city():
     13     # 初始化函数
     14     def __init__(self):
     15         self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
     16         self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
     17         self.conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="root", db="xzqh", charset='utf8')
     18         self.cur = self.conn.cursor()
     19         self.trdic = {
     20             1: '//tr[@class="provincetr"]',
     21             2: '//tr[@class="citytr"]',
     22             3: '//tr[@class="countytr"]',
     23             4: '//tr[@class="towntr"]',
     24             5: '//tr[@class="villagetr"]'
     25         }
     26     def __del__(self):
     27         if self.cur:
     28             self.cur.close()
     29         if self.conn:
     30             self.conn.close()
     31  
     32     def crawl_page(self,url):
     33         ''' 爬行政区划代码公布页 '''
     34         # print(f"crawling...{url}")
     35         headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
     36                    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
     37         i = 0
     38         while i < 3:
     39             try:
     40                 html = requests.get(url, headers=headers, timeout=20)
     41                 html.encoding = 'gbk'  # 这里添加一行
     42                 # print(html.status_code)
     43                 text = html.text
     44                 return text
     45             except requests.exceptions.RequestException:
     46                 i += 1
     47                 print('超时'+url)
     48  
     49     #解析省页,返回list
     50     def parseProvince(self):
     51         html = self.crawl_page(self.baseUrl)
     52         tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
     53         nodes = tree.xpath('//tr[@class="provincetr"]')
     54         id = 1
     55         values = []
     56         for node in nodes:
     57             items = node.xpath('./td')
     58             for item in items:
     59                 value = {}
     60                 nexturl = item.xpath('./a/@href')
     61                 province = item.xpath('./a/text()')
     62                 print(province)
     63                 value['url'] = self.base + "".join(nexturl)
     64                 value['name'] = "".join(province)
     65                 value['code'] = 0
     66                 value['pid'] = 0
     67                 value['id'] = id
     68                 value['level'] = 1
     69                 print(repr(value['name']))
     70                 id = id + 1
     71                 last_id = self.insert_to_db(value)
     72                 value['id'] = last_id
     73                 values.append(value)
     74                 print(value)
     75         return values
     76  
     77     #根据trid 解析子页
     78     def parse(self,trid, pid, url):
     79         if url.strip() == '':
     80             return None
     81         # url_prefix+url
     82         html = self.crawl_page(url)
     83         tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
     84         
     85         if trid==3:
     86             nodes = tree.xpath(self.trdic.get(trid))
     87             if len(nodes)==0:
     88                 nodes = tree.xpath(self.trdic.get(4))
     89                 print('有镇的市:'+url)
     90         else:
     91             nodes = tree.xpath(self.trdic.get(trid))
     92  
     93  
     94         path = os.path.basename(url)
     95         base_url = url.replace(path, '')
     96         id = 1
     97         values = []
     98         # 多个城市
     99         for node in nodes:
    100             value = {}
    101             nexturl = node.xpath('./td[1]/a/@href')
    102             if len(nexturl) == 0:
    103                 nexturl = ''
    104             code = node.xpath('./td[1]/a/text()')
    105             if len(code) == 0:
    106                 code = node.xpath('./td[1]/text()')
    107             name = node.xpath('./td[2]/a/text()')
    108             if len(name) == 0:
    109                 name = node.xpath('./td[2]/text()')
    110             value['code'] = "".join(code)
    111             urltemp = "".join(nexturl)
    112             if len(urltemp) != 0:
    113                 value['url'] = base_url + "".join(nexturl)
    114             else:
    115                 value['url'] = ''
    116             value['name'] = "".join(name)
    117             print(repr(value['name']))
    118             print(value['url'])
    119             value['id'] = id
    120             value['pid'] = pid
    121             value['level'] = trid
    122             id = id + 1
    123             last_id = self.insert_to_db(value)
    124             value['id'] = last_id
    125             values.append(value)
    126             print(value)
    127         return values
    128  
    129     #解析社区页
    130     def parseVillager(self,trid, pid, url):
    131         html = self.crawl_page(url)
    132         tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
    133         nodes = tree.xpath(self.trdic.get(trid))
    134         id = 1
    135         values = []
    136         # 多个城市
    137         for node in nodes:
    138             value = {}
    139             nexturl = node.xpath('./td[1]/a/@href')
    140             code = node.xpath('./td[1]/text()')
    141             vcode = node.xpath('./td[2]/text()')
    142             name = node.xpath('./td[3]/text()')
    143             value['code'] = "".join(code)
    144             value['url'] = "".join(nexturl)
    145             value['name'] = "".join(name)
    146             print(repr(value['name']))
    147             value['id'] = id
    148             value['pid'] = pid
    149             value['level'] = trid
    150             values.append(value)
    151             id = id + 1
    152             last_id = self.insert_to_db(value)
    153             value['id'] = last_id
    154             values.append(value)
    155             print(value)
    156  
    157         return values
    158  
    159     #插入数据库
    160     def insert_to_db(self,taobao):
    161         # return 0
    162         param = []
    163         lastid = 0
    164         try:
    165             sql = 'INSERT INTO tab_citys values(%s,%s,%s,%s,%s, %s)'
    166             param = (0, taobao.get("pid"), taobao.get("name"), '', taobao.get("level"), taobao.get("code"))
    167             self.cur.execute(sql, param)
    168             lastid = self.cur.lastrowid
    169             self.conn.commit()
    170         except Exception as e:
    171             print(e)
    172             self.conn.rollback()
    173         return lastid
    174  
    175     #从头执行解析
    176     def parseChineseCity(self):
    177         values = self.parseProvince()
    178         for value in values:
    179             citys = self.parse(2, value['id'], value['url'])
    180             if not citys is None:
    181                 for city in citys:
    182                     countys = self.parse(3, city['id'], city['url'])
    183                     #这个下面是获取 乡镇和居委会数据 如果不需要删除就可以了
    184                     if not countys is None:
    185                         for county in countys:
    186                             towns = self.parse(4, county['id'], county['url'])
    187                             if towns is not None:
    188                                 for town in towns:
    189                                     villagers = self.parseVillager(5, town['id'], town['url'])
    190  
    191 if __name__ == '__main__':
    192     chinese_city = chinese_city()
    193     chinese_city.parseChineseCity()

    如果提示缺少相应的库,可以使用pip进行安装:

    pip install pymysql

    pip install lxml

    运行脚本:

    python ./xzqh.py

    祝您成功!

  • 相关阅读:
    每日日报2020.9.30 1905
    每日日报2020.10.7 1905
    每日日报2020.10.2 1905
    每日日报2020.9.28 1905
    程序员修炼之道:从小工到专家 九月读书心得 1905
    每日日报2020.9.27 1905
    每日日报2020.9.29 1905
    每日日报2020.10.6 1905
    每日日报2020.10.5 1905
    每日总结2
  • 原文地址:https://www.cnblogs.com/liongis/p/14249932.html
Copyright © 2011-2022 走看看