  • A nationwide province/city/county data crawler

    The project needed nationwide province/city/county data. Everything I found online was either out of date or behind a paywall, so I spent a bit of time writing a crawler to pull the basic data myself, which is largely sufficient. The data is scraped from the National Bureau of Statistics and is current as of 2019. The code is as follows:

    import requests
    from requests.adapters import HTTPAdapter
    from bs4 import BeautifulSoup
    import re
    import time
    
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/67.0.3396.99 Safari/537.36'}
    
    
    def get_page(url):
        try:
            s = requests.Session()
            s.mount('http://', HTTPAdapter(max_retries=2))      # retry at most 2 times
            r = s.get(url, headers=headers, timeout=3)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except Exception as e:
            print(e)
            return ''
    
    
    # Parse province-level data; returns (province link, id, name)
    def parse_province(page):
        provinces = []
        id_pattern = r'(.*)\.html'       # extracts the numeric id from the link
        soup = BeautifulSoup(page, 'lxml')
    
        province_trs = soup.find_all(class_='provincetr')
        # some tags are empty, hence all the checks below
        for tr in province_trs:
            if tr:
                province_items = tr.find_all(name='td')
                for item in province_items:
                    if item:
                        a = item.find(name='a')
                        if a:
                            next_url = a.get('href')
                            id_ = re.search(id_pattern, next_url).group(1)
                            id_ += '0' * (12 - len(id_))        # the page only gives the leading digits of the province id; pad to 12
                            name = a.text
                            provinces.append((next_url, id_, name))
        return provinces
    
    
    # Parse city-level data; returns (city link, id, name)
    def parse_city(page):
        cities = []
        soup = BeautifulSoup(page, 'lxml')
    
        city_trs = soup.find_all(class_='citytr')
        for tr in city_trs:
            if tr:
                tds = tr.find_all(name='td')
                next_url = tds[0].find(name='a').get('href')
                id_ = tds[0].text
                name = tds[1].text
                cities.append((next_url, id_, name))
        return cities
    
    
    # Parse county/district data; the next level is not needed, so return (None, id, name) to keep the tuple format uniform
    def parse_district(page):
        districts = []
        soup = BeautifulSoup(page, 'lxml')
    
        district_trs = soup.find_all(class_='countytr')
        for tr in district_trs:
            if tr:
                tds = tr.find_all(name='td')
                id_ = tds[0].text
                name = tds[1].text
                districts.append((None, id_, name))
        return districts
    
    
    # Write a batch of records to the given file
    def write_lines(file, mode, data_list, parent_id, level):
        with open(file, mode, encoding='utf-8') as f:
            for data in data_list:
                f.write(data[1] + '\t' + data[2] + '\t' + parent_id + '\t' + level + '\n')
    
    
    if __name__ == "__main__":
        root = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
        province_list = []
        city_list = []
    
        # Crawl province data
        province_page = get_page(root)
        if province_page:
            province_list = parse_province(province_page)
            if province_list:
                write_lines('areas.txt', 'w', province_list, '0', '1')
                print('='*20)
                print('Province data written')
                print('='*20)
            else:
                print('Province list is empty')
        else:
            print('Failed to fetch the province page')
    
        # Crawl city-level data
        if province_list:
            for province in province_list:
                province_url = root + province[0]
                province_id = province[1]
                province_name = province[2]
    
                city_page = get_page(province_url)
                if city_page:
                    city_list_tmp = parse_city(city_page)
                    if city_list_tmp:
                        city_list += city_list_tmp
                        write_lines('areas.txt', 'a', city_list_tmp, province_id, '2')
                        print(province_name + ': city data written')
                        time.sleep(1)
                    else:
                        print(province_name + ': city list is empty')
                else:
                    print(province_name + ': failed to fetch city page')
        print('='*20)
        print('City data written')
        print('='*20)
    
        # Crawl district-level data
        if city_list:
            for city in city_list:
                city_url = root + city[0]
                city_id = city[1]
                city_name = city[2]
    
                district_page = get_page(city_url)
                if district_page:
                    district_list = parse_district(district_page)
                    if district_list:
                        write_lines('areas.txt', 'a', district_list, city_id, '3')
                        print(city_name + ': district data written')
                        time.sleep(1)
                    else:
                        print(city_name + ': district list is empty')
                else:
                    print(city_name + ': failed to fetch district page')
        print('='*20)
        print('District data written')
        print('='*20)
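
    The script retries each request at most twice with no delay between attempts and sleeps one second between pages. If stats.gov.cn starts throttling or dropping connections anyway, mounting a urllib3 Retry policy with exponential backoff tends to be more robust. A sketch of an alternative fetcher along those lines, reusing the script's headers and imports; it is not part of the original code:

    from urllib3.util.retry import Retry


    def get_page_with_backoff(url):
        # Retry up to 3 times, waiting roughly 0.5s, 1s, 2s between attempts,
        # and also retry on common transient HTTP status codes.
        retry = Retry(total=3, backoff_factor=0.5,
                      status_forcelist=[429, 500, 502, 503, 504])
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=retry))
        try:
            r = s.get(url, headers=headers, timeout=3)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException as e:
            print(e)
            return ''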
    
    

    The final record format is (id, name, parent id, level), where levels 1, 2 and 3 denote province, city and county respectively.
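
    To consume areas.txt downstream, the flat file can be folded back into a nested structure by indexing records on their parent id. A minimal sketch (the build_tree helper is hypothetical, not part of the original script):

    from collections import defaultdict


    def build_tree(path='areas.txt'):
        # Group records by parent id, then recursively nest children under parents.
        children = defaultdict(list)
        with open(path, encoding='utf-8') as f:
            for line in f:
                id_, name, parent_id, level = line.rstrip('\n').split('\t')
                children[parent_id].append({'id': id_, 'name': name, 'level': level})

        def attach(node):
            node['children'] = [attach(c) for c in children.get(node['id'], [])]
            return node

        # Provinces were written with parent id '0'
        return [attach(p) for p in children['0']]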

    There are just over 3,600 records in total.
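
    A quick way to sanity-check the output is to count records per level; a small snippet along those lines:

    from collections import Counter

    with open('areas.txt', encoding='utf-8') as f:
        levels = Counter(line.rstrip('\n').split('\t')[3] for line in f)
    print(levels)   # counts per level: '1' provinces, '2' cities, '3' counties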

  • Original article: https://www.cnblogs.com/Hui4401/p/13495967.html