  • Scraping CSRC penalty announcements

    Target listing page: http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm

    # -*- coding: utf-8 -*-
    import re

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class CfSpider(CrawlSpider):
        name = 'cf'
        allowed_domains = ['csrc.gov.cn']
        start_urls = ['http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm']

        rules = (
            # Detail pages, e.g. /G00306212/202005/t20200522_123456.htm
            Rule(LinkExtractor(allow=r'/G\d+/\d+/t\d+_\d+\.htm'), callback='parse_item'),
            # Rule(LinkExtractor(allow=r'/3300/3313/index_7401_.*?\.htm'), follow=True),
            # The rule above did not work, so start_requests is overridden instead.
        )

        def start_requests(self):
            # Page 0 is index_7401.htm; later pages are index_7401_<n>.htm.
            current_page = 0
            while current_page < 67:
                if current_page == 0:
                    next_url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401.htm'
                else:
                    url = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401_{}.htm'
                    next_url = url.format(current_page)
                # callback=self.parse hands each listing page to CrawlSpider's
                # rule matching, which dispatches detail pages to parse_item.
                yield scrapy.Request(url=next_url, callback=self.parse)
                current_page += 1

        def parse_item(self, response):
            item = dict()
            item["title"] = response.xpath("//span[@id='lTitle']/text()").extract_first()
            # Publication date, e.g. 2020年05月22日
            dates = re.findall(r"<span>(20\d+年\d{2}月\d{2}日)</span>", response.body.decode(), re.S)
            item["pub_title"] = dates[0] if dates else None
            item["index_number"] = response.xpath(
                "//table[@id='headContainer']//tr[1]//td[@colspan='2']//td[1]/text()"
            ).extract_first()
            item["href"] = response.url
            yield item
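    The pagination scheme from start_requests and the date regex can be checked offline, without running Scrapy. A minimal sketch (the helper name `listing_url` and the sample HTML snippet are illustrative, not part of the spider; the page count of 67 comes from the loop above):

    ```python
    import re

    def listing_url(page):
        """Build the listing-page URL the way start_requests does:
        page 0 has no suffix, later pages append _<page>."""
        base = 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401'
        return base + '.htm' if page == 0 else base + '_{}.htm'.format(page)

    # Same date pattern as parse_item uses on the decoded response body.
    DATE_RE = re.compile(r"<span>(20\d+年\d{2}月\d{2}日)</span>")

    # Hypothetical fragment of a detail page, for a quick sanity check.
    sample = "<span>2020年05月22日</span>"

    print(listing_url(0))
    print(listing_url(3))
    print(DATE_RE.findall(sample))
    ```

    Keeping this logic in a plain function makes it easy to unit-test the URL scheme before pointing the spider at the live site.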
  • Original article: https://www.cnblogs.com/nuochengze/p/12944333.html