Techniques used in this project
- Random User-Agent via fake_useragent
- Rotating proxy IPs fetched from the dynamic API of http://www.goubanjia.com/
- A random time.sleep(delay) between requests to lower the risk of tripping anti-crawling defences (the three measures are shown standalone in the sketch below)
- Scraped data is stored in a MySQL database
- In my own tests the spider pulled 2000 pages of data from http://wz.sun0769.com/political/index/politicsNewest, so crawling the full listing is no problem
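The three anti-blocking measures can be tried on their own before reading the downloader middleware further down; a minimal standalone sketch (the key in the goubanjia API URL is a placeholder, and http://httpbin.org/ip is only used here as a harmless test target):

```python
import random
import time

import requests
from fake_useragent import UserAgent

ua = UserAgent()  # pool of real, regularly refreshed User-Agent strings

# goubanjia's dynamic API returns newline-separated "ip:port" entries (placeholder key).
api_url = 'http://api.goubanjia.com/dynamic/get/<your-key>.html?sep=3'
ips = requests.get(api_url).text.strip().split('\n')
proxy_ip = random.choice(ips)

time.sleep(random.randint(0, 3))  # random delay, same range the middleware uses

resp = requests.get('http://httpbin.org/ip',
                    headers={'User-Agent': ua.random},
                    proxies={'http': 'http://' + proxy_ip})
print(resp.text)
```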
Project structure

    sunPro
        sunPro
            spiders
                sun.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
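The spider is started by name with `scrapy crawl sun` from the project root. If you prefer to launch it from Python, a small (hypothetical) run.py at the project root would look like this:

```python
# run.py - optional launcher; equivalent to running `scrapy crawl sun`
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from sunPro.spiders.sun import SunSpider

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(SunSpider)
process.start()  # blocks until the crawl finishes
```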
sun.py

# -*- coding: utf-8 -*-
import scrapy

from sunPro.items import SunproItem


class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest']

    # List pages follow the pattern
    # http://wz.sun0769.com/political/index/politicsNewest?id=1&page=N,
    # so the page URLs are generated directly rather than by following the
    # "next page" arrow (<a href="/political/index/politicsNewest?id=1&page=3"
    # class="arrow-page prov_rota">).

    def parse(self, response):
        # Generate the list-page URLs and hand each one to parse_item.
        # Raise the upper bound to cover more of the site.
        for page in range(1, 1000):
            url = 'http://wz.sun0769.com/political/index/politicsNewest?id=1&page=%d' % page
            print('------> url', url)
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_item(self, response):
        # Parse one list page: each <li> holds the id, status, title and timestamps.
        if 200 <= response.status < 300:
            print('-----> currently parsing:', response.url)
            li_lists = response.xpath('/html/body/div[2]/div[3]/ul[2]//li')
            for li in li_lists:
                item = SunproItem()
                sid = li.xpath('./span[1]/text()').extract_first()
                status = li.xpath('./span[2]/text()').extract_first()
                ask_title = li.xpath('./span[3]/a/text()').extract_first()
                rep_time = li.xpath('./span[4]/text()').extract_first()
                ask_time = li.xpath('./span[5]/text()').extract_first()
                item['sid'] = sid
                item['status'] = status
                item['title'] = ask_title
                item['rep_time'] = rep_time
                item['ask_time'] = ask_time
                print('-----> id:', sid)
                print('-----> status:', status)
                print('-----> title:', ask_title)
                print('-----> asked at:', ask_time)
                print('-----> replied at:', rep_time)
                # Follow the link to the detail page, carrying the item along in meta.
                detail_url = 'http://wz.sun0769.com' + li.xpath('./span[3]/a/@href').extract_first()
                print('-----> detail url:', detail_url)
                yield scrapy.Request(url=detail_url, meta={'item': item},
                                     callback=self.parse_detail)
        else:
            print('-----> unexpected status code:', response.status)

    def parse_detail(self, response):
        # Parse the detail page: the complaint text and the department it is addressed to.
        item = response.meta['item']
        details = response.xpath('//div[@class="details-box"]/pre/text()').extract_first()
        content = details or ''   # extract_first() may return None
        if content == '':
            content = '未知'       # "unknown"
        print('-----> content:', content)
        department = response.xpath(
            '//div[@class="mr-three clear"]/div[@class="fl politics-fl"]/text()').extract_first()
        department = department.split(':')[-1] if department else ''
        if department == '':
            department = '未知'    # "unknown"
        print('-----> department:', department)
        item['content'] = content
        item['department'] = department
        yield item
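The list pages can also be paginated by following the "next page" arrow instead of generating page numbers up front, which is what earlier iterations of this spider experimented with. A cleaned-up sketch of that approach, usable as a drop-in replacement for SunSpider.parse and reusing the paging-box XPath from those experiments (it may need adjusting if the page layout changes):

```python
    # Alternative parse(): scrape the current list page, then follow the
    # "next page" link until there isn't one.
    def parse(self, response):
        yield from self.parse_item(response)
        next_href = response.xpath(
            '//div[@class="mr-three paging-box"]/a[2]/@href').extract_first()
        if next_href:
            yield scrapy.Request('http://wz.sun0769.com' + next_href,
                                 callback=self.parse)
```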
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class SunproItem(scrapy.Item):
    sid = scrapy.Field()         # complaint id
    status = scrapy.Field()      # processing status
    title = scrapy.Field()       # complaint title
    rep_time = scrapy.Field()    # reply time
    ask_time = scrapy.Field()    # submission time
    content = scrapy.Field()     # complaint text
    department = scrapy.Field()  # department the complaint is addressed to
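A SunproItem behaves like a dict restricted to the fields declared above, which is what lets the spider fill it key by key; a quick sketch (all values here are invented placeholders):

```python
from sunPro.items import SunproItem

item = SunproItem()
item['sid'] = '123456'            # placeholder value, not real data
item['title'] = 'example title'
print(dict(item))                 # {'sid': '123456', 'title': 'example title'}
# item['link'] = '...'            # any field not declared in SunproItem raises KeyError
```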
middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
import time

import requests
from fake_useragent import UserAgent
from scrapy import signals
class SunproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# (A stale, hard-coded list of http/https proxy IPs used to sit here; it was
# superseded by the dynamic proxy API used in the downloader middleware below.)
class SunproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # A hard-coded user_agent_list used to live here; fake_useragent is
        # nicer: the UA pool is easy to get and is kept up to date.
        self.ua = UserAgent(verify_ssl=False)

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Downloader middleware: rewrite every outgoing request.
    def process_request(self, request, spider):
        # Set a random User-Agent.
        request.headers['User-Agent'] = self.ua.random

        # Random time.sleep delay to slow the crawl down and avoid an IP ban.
        delay = random.randint(0, 3)
        time.sleep(delay)

        # Rotating proxy IPs: this API endpoint returns fresh proxies on every call.
        # The provider is http://www.goubanjia.com/ ("全网代理IP").
        apiUrl = 'http://api.goubanjia.com/dynamic/get/9ec90437c17a332fb83067f5cb7539e4.html?sep=3'
        # Fetch the proxy list (one "ip:port" per line).
        res = requests.get(apiUrl).text.strip("\n")
        # Split on newlines to get the individual IPs.
        ips = res.split("\n")
        # Pick one at random.
        proxy_ip = random.choice(ips)
        # Point the request at the chosen proxy.
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://%s" % proxy_ip   # HTTP proxy
        elif request.url.startswith("https://"):
            request.meta['proxy'] = "https://%s" % proxy_ip  # HTTPS proxy
        return None
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
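process_exception above is left at the Scrapy default (it does nothing). Free proxies drop connections often, so one optional refinement is to retry a failed request with a freshly fetched proxy. This is only a sketch, not part of the project: the class name and helper are hypothetical, and the API key in the URL is a placeholder.

```python
import random

import requests


def get_random_proxy(api_url):
    # Assumed helper: fetch the newline-separated proxy list and pick one entry.
    ips = requests.get(api_url).text.strip().split('\n')
    return random.choice(ips)


class RetryProxyMiddleware:
    # Sketch: when a download raises an exception, swap in a new proxy and retry.
    API_URL = 'http://api.goubanjia.com/dynamic/get/<your-key>.html?sep=3'  # placeholder key

    def process_exception(self, request, exception, spider):
        spider.logger.warning('Download failed (%s); retrying with a new proxy', exception)
        retry_req = request.replace(dont_filter=True)  # new Request that skips the dupe filter
        retry_req.meta['proxy'] = 'http://' + get_random_proxy(self.API_URL)
        return retry_req
```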
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.exceptions import DropItem


class SunproPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        # Drop duplicate records, keyed on the complaint id.
        if item['sid'] in self.ids_seen:
            raise DropItem('Duplicate item found: %s' % item)
        else:
            self.ids_seen.add(item['sid'])
            print(item)
            return item

# (An earlier, commented-out xlrd/xlwt export to 详情.xls was removed;
#  storage is handled by mysqlPipeline below.)
class mysqlPipeline(object):
    def __init__(self):
        # Connect to the local MySQL database.
        self.db = pymysql.connect(
            host='localhost',    # local database server
            user='root',         # your MySQL user
            passwd='rootpwd',    # your MySQL password
            db='sun',            # database name
            charset='utf8mb4',   # character set
            cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        '''
        Save one scraped record to MySQL.
        :param item: the scraped item
        :param spider: the spider that produced it
        '''
        sid = int(item['sid'])
        status = item['status']
        title = item['title']
        rep_time = item['rep_time']
        ask_time = item['ask_time']
        content = item['content']
        department = item['department']
        try:
            # Parameterized INSERT: lets the driver handle quoting and escaping.
            sql = ("INSERT INTO sun(sid, status, title, content, department, ask_time, rep_time) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            print('-----> executing SQL:', sql)
            self.cursor.execute(sql, (sid, status, title, content, department, ask_time, rep_time))
            # Commit the insert.
            self.db.commit()
        except Exception as e:
            print('-----> failed to store item:', e)
            self.db.rollback()
        return item

    def close_spider(self, spider):
        # Close the cursor and connection once, when the spider finishes
        # (closing them inside process_item would break on the second item).
        self.cursor.close()
        self.db.close()
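The INSERT above assumes the `sun` database and a `sun` table already exist. The project does not ship a schema, so the column types below are my assumptions, derived from the fields in items.py; a one-off setup script using pymysql:

```python
import pymysql

# One-off setup: create the database and table the pipeline writes to.
# Column types are assumptions; adjust them as needed.
db = pymysql.connect(host='localhost', user='root', passwd='rootpwd', charset='utf8mb4')
with db.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS sun CHARACTER SET utf8mb4")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS sun.sun (
            sid        INT PRIMARY KEY,   -- complaint id
            status     VARCHAR(32),       -- processing status
            title      VARCHAR(255),      -- complaint title
            content    TEXT,              -- complaint text
            department VARCHAR(128),      -- department addressed
            ask_time   VARCHAR(32),       -- submission time (stored as text)
            rep_time   VARCHAR(32)        -- reply time (stored as text)
        ) CHARACTER SET utf8mb4
    """)
db.commit()
db.close()
```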
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for sunPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'sunPro'
SPIDER_MODULES = ['sunPro.spiders']
NEWSPIDER_MODULE = 'sunPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'sunPro.middlewares.SunproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'sunPro.pipelines.SunproPipeline': 300,
'sunPro.pipelines.mysqlPipeline': 302,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'