  • Scraping administrative division data from the Ministry of Civil Affairs website

    '''
        Scraping administrative division codes from the official website of the
        Ministry of Civil Affairs of the PRC:
            Key points:
                1> Opening the second-level page (the data display page) triggers a
                   JavaScript redirect, so the real URL has to be extracted from the
                   second-level page source.
                2> Incremental database updates: store the URL; on the next run, compare
                   URLs first. If they match, skip the update; otherwise fetch and update.
    '''
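
    The script below assumes a MySQL database named govermentdb with a version
    table already in place. A minimal setup sketch (the link column name is
    inferred from the queries in the script; the column type is an assumption):

    import pymysql

    # Hypothetical one-time setup; only the table/column names are implied by the script
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='govermentdb', charset='utf8')
    cursor = db.cursor()
    cursor.execute('create table if not exists version (link varchar(255))')
    db.commit()
    db.close()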
    
    import requests
    from lxml import etree
    import re
    import pymysql
    
    
    class GovementSpider:
        def __init__(self):
            self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
            }
            self.db = pymysql.connect(host='localhost', user='root', password='123456', database='govermentdb', charset='utf8')
            self.cursor = self.db.cursor()
    
        # Extract the second-level page link (the fake link); it must be the most recent one
        def get_false_link(self):
            html = requests.get(url=self.one_url, headers=self.headers).content.decode('utf-8', 'ignore')
            parse_html = etree.HTML(html)
            a_list = parse_html.xpath('//a[@class="artitlelist"]')
            for a in a_list:
                title = a.xpath('./@title')[0].strip()
                if re.findall(r'.*以上行政区划代码', title, re.S):
                    two_false_link = 'http://www.mca.gov.cn' + a.get('href')
                    return two_false_link
    
        # Extract the real second-level page link (the page that returns the data)
        def get_true_link(self):
            # Fetch the intermediate page; the real URL hides in a JS redirect
            false_link = self.get_false_link()
            html = requests.get(url=false_link, headers=self.headers).content.decode('utf-8', 'ignore')
            pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
            real_link = pattern.findall(html)[0]
            print(real_link)
    
            # Incremental crawling:
            # look real_link up in the version table; if it is there, the data is
            # already current, otherwise fetch the latest data
            sel = 'select * from version where link=%s'
            self.cursor.execute(sel, [real_link])
            # A non-empty result means the link is already recorded (nothing to fetch)
            if self.cursor.fetchall():
                print('Data is already up to date')
            else:
                # Fetch the data first
                self.get_data(real_link)
                # Then record real_link in the version table
                ins = 'insert into version values(%s)'
                self.cursor.execute(ins, [real_link])
                self.db.commit()
    
        # The method that actually extracts the data
        def get_data(self, real_link):
            html = requests.get(url=real_link, headers=self.headers).text
            parse_html = etree.HTML(html)
            tr_list = parse_html.xpath('//tr[@height="19"]')
            for tr in tr_list:
                code = tr.xpath('./td[2]/text()')[0]
                name = tr.xpath('./td[3]/text()')[0]
                print(name, code)
    
        # Entry point: run the incremental crawl, then release the connection
        def main(self):
            self.get_true_link()
            self.cursor.close()
            self.db.close()
    
    
    if __name__ == '__main__':
        spider = GovementSpider()
        spider.main()
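
    For context, the intermediate (fake-link) page that get_true_link() parses
    carries the real URL inside an inline JavaScript redirect. A standalone
    sketch of the extraction (the HTML below is illustrative, not the live
    page source):

    import re

    # Roughly what the intermediate page body contains (the URL is made up here)
    html = '<script>window.location.href="http://www.mca.gov.cn/article/sj/xzqh/2019/example.html";</script>'

    pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
    print(pattern.findall(html)[0])
    # -> http://www.mca.gov.cn/article/sj/xzqh/2019/example.html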
    '''
        Using selenium + Chrome for the crawl sidesteps the JS redirect on the
        second-level page link entirely: the browser simply follows the click,
        which makes the crawl simpler
    '''
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By  # Selenium 4 locator API
    import time
    import pymysql
    
    
    class GovementSpider:
        def __init__(self):
            self.browser = webdriver.Chrome()
            self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
            self.db = pymysql.connect(host='localhost', user='root', password='123456', database='govdb', charset='utf8')
            self.cursor = self.db.cursor()
            # Three staging lists, so everything can go through executemany() at once
            self.province_list = []
            self.city_list = []
            self.county_list = []
    
        # Load the home page and pick out the second-level page link (the fake link is enough; the real one is never needed here)
        def get_false_url(self):
            self.browser.get(self.one_url)
            td_list = self.browser.find_elements(By.XPATH, '//td[@class="arlisttd"]/a[contains(@title,"代码")]')
            if td_list:
                # Keep the element object itself, because we need to click() it
                two_url_element = td_list[0]
                # Incremental crawl: take the link and compare it with the version table
                two_url = two_url_element.get_attribute('href')
                sel = 'select * from version where link=%s'
                self.cursor.execute(sel, [two_url])
                result = self.cursor.fetchall()
                if len(result) != 0:
                    print('Data is already up to date; no crawl needed')
                else:
                    # Click through to the data page
                    two_url_element.click()
                    time.sleep(3)
                    # Switch the driver to the newly opened window
                    all_handles = self.browser.window_handles
                    self.browser.switch_to.window(all_handles[-1])
                    # Scrape the data
                    self.get_data()
                    # When finished, record two_url in the version table
                    ins = 'insert into version values(%s)'
                    self.cursor.execute(ins, [two_url])
                    self.db.commit()
    
        # Extract the administrative division codes from the second-level page
        def get_data(self):
            tr_list = self.browser.find_elements(By.XPATH, '//tr[@height="19"]')
            for tr in tr_list:
                code = tr.find_element(By.XPATH, './td[2]').text.strip()
                name = tr.find_element(By.XPATH, './td[3]').text.strip()
                print(name, code)
                # Work out the administrative level and stage the row for the matching table (fields mirror the table schema)
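                # Worked example of the code scheme (real codes, shown for illustration):
                #   130000 ends in 0000            -> province (Hebei)
                #   130100 ends in 00 (not 0000)   -> city, parent province 130000
                #   130102 ends in neither         -> county, parent city 130100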
                if code[-4:] == '0000':
                    self.province_list.append([name, code])
                    if name in ['北京市', '天津市', '上海市', '重庆市']:
                        city = [name, code, code[:2] + '0000']
                        self.city_list.append(city)
                elif code[-2:] == '00':
                    city = [name, code, code[:2] + '0000']
                    self.city_list.append(city)
                else:
                    county = [name, code, code[:4] + '00']
                    self.county_list.append(county)
            # After all rows are collected, write everything in one executemany() pass
            self.insert_mysql()
    
        def insert_mysql(self):
            # When refreshing, clear the old rows first
            del_province = 'delete from province'
            del_city = 'delete from city'
            del_county = 'delete from county'
            self.cursor.execute(del_province)
            self.cursor.execute(del_city)
            self.cursor.execute(del_county)
            # Insert the fresh data
            ins_province = 'insert into province values(%s,%s)'
            ins_city = 'insert into city values(%s,%s,%s)'
            ins_county = 'insert into county values(%s,%s,%s)'
            self.cursor.executemany(ins_province, self.province_list)
            self.cursor.executemany(ins_city, self.city_list)
            self.cursor.executemany(ins_county, self.county_list)
            self.db.commit()
            print('Crawl finished; data stored in the database')
    
        def main(self):
            self.get_false_url()
            # Close the connections
            self.cursor.close()
            self.db.close()
            self.browser.quit()
    
    
    if __name__ == "__main__":
        spider = GovementSpider()
        spider.main()
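
    As with the requests version, the post never shows the table schemas; the
    column counts below are inferred from the executemany() calls above (two
    values per province row, three per city and county row, one per version
    row). A hypothetical setup sketch for the govdb database:

    import pymysql

    # Hypothetical DDL; column names and types are assumptions, only the
    # column counts are implied by the insert statements in the script
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='govdb', charset='utf8')
    cursor = db.cursor()
    cursor.execute('create table if not exists version (link varchar(255))')
    cursor.execute('create table if not exists province (name varchar(64), code char(6))')
    cursor.execute('create table if not exists city (name varchar(64), code char(6), province_code char(6))')
    cursor.execute('create table if not exists county (name varchar(64), code char(6), city_code char(6))')
    db.commit()
    db.close()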
  • Original post: https://www.cnblogs.com/yuxiangyang/p/11245508.html