
# PRC Ministry of Civil Affairs Website Crawler

**Today's goal**: crawl the latest administrative division codes for counties and above from the site's administrative-division data.

```python
import re

import requests
import pymysql
from lxml import etree


class Govement(object):
    def __init__(self):
        self.one_url = 'http://www.mca.gov.cn/article/sj/xzqh/2019/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/75.0.3770.100 Safari/537.36'
        }
        self.db = pymysql.connect(
            host='localhost', user='root', password='123456',
            database='govdb', charset='utf8'
        )
        self.cursor = self.db.cursor()

    # Extract the second-level page link (a "fake" link that only redirects)
    def get_false_link(self):
        # xpath: //a[@class="artitlelist"]
        html = requests.get(
            url=self.one_url, headers=self.headers
        ).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        a_list = parse_html.xpath('//a[@class="artitlelist"]')
        for a in a_list:
            # title = a.xpath('./@title')[0]
            title = a.get('title')
            # Keep only the link whose title mentions county-level-and-above division codes
            if title and re.findall('.*以上行政区划代码', title, re.S):
                two_false_link = 'http://www.mca.gov.cn' + a.get('href')
                return two_false_link

    # Extract the real second-level link (the page that actually returns the data)
    def get_true_link(self):
        false_link = self.get_false_link()
        html = requests.get(url=false_link, headers=self.headers).text
        # The fake page redirects via JavaScript; match the real link out of the response
        pattern = re.compile(r'window\.location\.href="(.*?)"', re.S)
        real_link = pattern.findall(html)[0]
        # Incremental crawling: check the version table for real_link
        # already recorded -> data is up to date; not recorded -> crawl it
        sel = 'select * from version where link=%s'
        self.cursor.execute(sel, [real_link])
        if self.cursor.fetchall():
            # Link already recorded, nothing to crawl
            print('Data is already up to date')
        else:
            # Crawl the data first
            self.get_data(real_link)
            # Then record real_link in the version table
            ins = 'insert into version values(%s)'
            self.cursor.execute(ins, [real_link])
            self.db.commit()

    # The function that actually extracts the data
    def get_data(self, real_link):
        html = requests.get(url=real_link, headers=self.headers).text
        # Base xpath: //tr[@height="19"]
        parse_html = etree.HTML(html)
        tr_list = parse_html.xpath('//tr[@height="19"]')
        for tr in tr_list:
            code = tr.xpath('./td[2]/text()')[0]
            name = tr.xpath('./td[3]/text()')[0]
            print(name, code)

    # Entry point
    def main(self):
        self.get_true_link()


if __name__ == '__main__':
    spider = Govement()
    spider.main()
```
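The incremental check above assumes a `version` table already exists in the `govdb` database with a single `link` column (the `insert into version values(%s)` statement supplies only one value). The article does not show the table setup; a minimal one-off sketch under that assumption, reusing the same connection parameters as the crawler:

```python
import pymysql

# One-off setup for the incremental-crawl bookkeeping table.
# The table name (version), column name (link), and credentials are taken
# from the crawler code above; adjust them to your own environment.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='govdb', charset='utf8')
cursor = db.cursor()
cursor.execute(
    'create table if not exists version ('
    ' link varchar(255) primary key'
    ')'
)
db.commit()
db.close()
```

Storing only the already-seen link keeps the freshness check to a single lookup: when the ministry publishes a new dataset the redirect target changes, the lookup misses, and the crawler fetches and records the new page.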
  • Original article: https://www.cnblogs.com/cxiaolong/p/11261023.html