zoukankan      html  css  js  c++  java
  • 爬取国家统计局2020年行政区划分数据

    参考:https://blog.csdn.net/qlx119/article/details/105289974

    在MySQL中创建tab_citys数据表:

    DROP TABLE IF EXISTS `tab_citys`;
    CREATE TABLE `tab_citys` (
      `id` int(11) NOT NULL AUTO_INCREMENT,
      `parent_id` int(11) DEFAULT NULL,
      `city_name_zh` varchar(20) NOT NULL,
      `city_name_en` varchar(20) DEFAULT NULL,
      `city_level` int(11) NOT NULL,
      `city_code` char(12) NOT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB AUTO_INCREMENT=742037 DEFAULT CHARSET=utf8;

    创建xzqh.py的python脚本:

      1 #!/usr/bin/python
      2 # -*- coding: UTF-8 -*-
      3 #   功能:  获取省市县数据
      4 #   版本:v1.1
      5 import importlib
      6 import sys
      7 import pymysql
      8 importlib.reload(sys)
      9 import requests
     10 import lxml.etree as etree
     11 import os
     12 class chinese_city():
     13     # 初始化函数
     14     def __init__(self):
     15         self.baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
     16         self.base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/'
     17         self.conn = pymysql.connect(host="localhost", port=3306, user="root", passwd="root", db="xzqh", charset='utf8')
     18         self.cur = self.conn.cursor()
     19         self.trdic = {
     20             1: '//tr[@class="provincetr"]',
     21             2: '//tr[@class="citytr"]',
     22             3: '//tr[@class="countytr"]',
     23             4: '//tr[@class="towntr"]',
     24             5: '//tr[@class="villagetr"]'
     25         }
     26     def __del__(self):
     27         if self.cur:
     28             self.cur.close()
     29         if self.conn:
     30             self.conn.close()
     31  
     32     def crawl_page(self,url):
     33         ''' 爬行政区划代码公布页 '''
     34         # print(f"crawling...{url}")
     35         headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
     36                    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
     37         i = 0
     38         while i < 3:
     39             try:
     40                 html = requests.get(url, headers=headers, timeout=20)
     41                 html.encoding = 'gbk'  # 这里添加一行
     42                 # print(html.status_code)
     43                 text = html.text
     44                 return text
     45             except requests.exceptions.RequestException:
     46                 i += 1
     47                 print('超时'+url)
     48  
     49     #解析省页,返回list
     50     def parseProvince(self):
     51         html = self.crawl_page(self.baseUrl)
     52         tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
     53         nodes = tree.xpath('//tr[@class="provincetr"]')
     54         id = 1
     55         values = []
     56         for node in nodes:
     57             items = node.xpath('./td')
     58             for item in items:
     59                 value = {}
     60                 nexturl = item.xpath('./a/@href')
     61                 province = item.xpath('./a/text()')
     62                 print(province)
     63                 value['url'] = self.base + "".join(nexturl)
     64                 value['name'] = "".join(province)
     65                 value['code'] = 0
     66                 value['pid'] = 0
     67                 value['id'] = id
     68                 value['level'] = 1
     69                 print(repr(value['name']))
     70                 id = id + 1
     71                 last_id = self.insert_to_db(value)
     72                 value['id'] = last_id
     73                 values.append(value)
     74                 print(value)
     75         return values
     76  
     77     #根据trid 解析子页
     78     def parse(self,trid, pid, url):
     79         if url.strip() == '':
     80             return None
     81         # url_prefix+url
     82         html = self.crawl_page(url)
     83         tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
     84         
     85         if trid==3:
     86             nodes = tree.xpath(self.trdic.get(trid))
     87             if len(nodes)==0:
     88                 nodes = tree.xpath(self.trdic.get(4))
     89                 print('有镇的市:'+url)
     90         else:
     91             nodes = tree.xpath(self.trdic.get(trid))
     92  
     93  
     94         path = os.path.basename(url)
     95         base_url = url.replace(path, '')
     96         id = 1
     97         values = []
     98         # 多个城市
     99         for node in nodes:
    100             value = {}
    101             nexturl = node.xpath('./td[1]/a/@href')
    102             if len(nexturl) == 0:
    103                 nexturl = ''
    104             code = node.xpath('./td[1]/a/text()')
    105             if len(code) == 0:
    106                 code = node.xpath('./td[1]/text()')
    107             name = node.xpath('./td[2]/a/text()')
    108             if len(name) == 0:
    109                 name = node.xpath('./td[2]/text()')
    110             value['code'] = "".join(code)
    111             urltemp = "".join(nexturl)
    112             if len(urltemp) != 0:
    113                 value['url'] = base_url + "".join(nexturl)
    114             else:
    115                 value['url'] = ''
    116             value['name'] = "".join(name)
    117             print(repr(value['name']))
    118             print(value['url'])
    119             value['id'] = id
    120             value['pid'] = pid
    121             value['level'] = trid
    122             id = id + 1
    123             last_id = self.insert_to_db(value)
    124             value['id'] = last_id
    125             values.append(value)
    126             print(value)
    127         return values
    128  
    129     #解析社区页
    130     def parseVillager(self,trid, pid, url):
    131         html = self.crawl_page(url)
    132         tree = etree.HTML(html, parser=etree.HTMLParser(encoding='gbk'))
    133         nodes = tree.xpath(self.trdic.get(trid))
    134         id = 1
    135         values = []
    136         # 多个城市
    137         for node in nodes:
    138             value = {}
    139             nexturl = node.xpath('./td[1]/a/@href')
    140             code = node.xpath('./td[1]/text()')
    141             vcode = node.xpath('./td[2]/text()')
    142             name = node.xpath('./td[3]/text()')
    143             value['code'] = "".join(code)
    144             value['url'] = "".join(nexturl)
    145             value['name'] = "".join(name)
    146             print(repr(value['name']))
    147             value['id'] = id
    148             value['pid'] = pid
    149             value['level'] = trid
    150             values.append(value)
    151             id = id + 1
    152             last_id = self.insert_to_db(value)
    153             value['id'] = last_id
    154             values.append(value)
    155             print(value)
    156  
    157         return values
    158  
    159     #插入数据库
    160     def insert_to_db(self,taobao):
    161         # return 0
    162         param = []
    163         lastid = 0
    164         try:
    165             sql = 'INSERT INTO tab_citys values(%s,%s,%s,%s,%s, %s)'
    166             param = (0, taobao.get("pid"), taobao.get("name"), '', taobao.get("level"), taobao.get("code"))
    167             self.cur.execute(sql, param)
    168             lastid = self.cur.lastrowid
    169             self.conn.commit()
    170         except Exception as e:
    171             print(e)
    172             self.conn.rollback()
    173         return lastid
    174  
    175     #从头执行解析
    176     def parseChineseCity(self):
    177         values = self.parseProvince()
    178         for value in values:
    179             citys = self.parse(2, value['id'], value['url'])
    180             if not citys is None:
    181                 for city in citys:
    182                     countys = self.parse(3, city['id'], city['url'])
    183                     #这个下面是获取 乡镇和居委会数据 如果不需要删除就可以了
    184                     if not countys is None:
    185                         for county in countys:
    186                             towns = self.parse(4, county['id'], county['url'])
    187                             if towns is not None:
    188                                 for town in towns:
    189                                     villagers = self.parseVillager(5, town['id'], town['url'])
    190  
    191 if __name__ == '__main__':
    192     chinese_city = chinese_city()
    193     chinese_city.parseChineseCity()

    如果提示缺少相应的库,可以使用pip进行安装:

    pip install pymysql

    pip install lxml

    运行脚本:

    python ./xzqh.py

    祝您成功!

  • 相关阅读:
    每日日报2020.9.30 1905
    每日日报2020.10.7 1905
    每日日报2020.10.2 1905
    每日日报2020.9.28 1905
    程序员修炼之道:从小工到专家 九月读书心得 1905
    每日日报2020.9.27 1905
    每日日报2020.9.29 1905
    每日日报2020.10.6 1905
    每日日报2020.10.5 1905
    每日总结2
  • 原文地址:https://www.cnblogs.com/liongis/p/14249932.html
Copyright © 2011-2022 走看看