Scrapy template:
spider.py:
import scrapy
from scrapy.http import Request
from crawl_cnblogs.crawl_cnblogs.items import ArticleItem
from selenium import webdriver  # the shared browser is used by the selenium middleware
import scrapy_redis  # only needed for the distributed (scrapy-redis) setup in settings.py

# Note: the method that actually starts the crawl is start_requests
class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['https://www.cnblogs.com/']
    bro = webdriver.Chrome('./chromedriver')  # load the Chrome driver; the middleware and closed() use spider.bro

    # def start_requests(self):
    #     yield Request(url='http://www.baidu.com')
    def parse(self, response):
        # print(response.text)
        div_list = response.css('div.post_item')
        for div in div_list:
            item = ArticleItem()
            title = div.xpath('./div[2]/h3/a/text()').extract_first()
            # print(title)
            item['title'] = title
            author = div.xpath('./div[2]/div/a/text()').extract_first()
            # print(author)
            item['author'] = author
            desc = div.xpath('./div[2]/p/text()').extract()[-1]
            # print(desc)
            item['desc'] = desc
            url = div.xpath('./div[2]/div/span[2]/a/@href').extract_first()
            # print(url)
            item['url'] = url
            # Task 1: depth crawl -- follow each article into its detail page
            # Task 2: breadth crawl -- follow the next listing page
            # Yielding an Item object sends it off to be saved; yielding a Request object schedules another crawl
            # callback: where the downloaded data comes back to for parsing; defaults to parse
            yield Request(url=url, callback=self.parse_detail, meta={'item': item})
        # CSS selectors read attributes with ::attr(attribute_name)
        next_url = 'https://www.cnblogs.com' + response.css('div.pager>a:last-child::attr(href)').extract_first()
        # print(next_url)
        # Either form works:
        # yield Request(url=next_url, callback=self.parse)
        yield Request(url=next_url)
    def parse_detail(self, response):
        item = response.meta.get('item')
        print(item)
        content = response.css('#post_detail').extract_first()
        item['content'] = str(content)
        # print(str(content))
        yield item

    def closed(self, reason):
        # called once when the spider finishes; close the shared browser here
        print('Spider finished, closing the browser')
        self.bro.close()
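The spider imports ArticleItem from items.py, which these notes don't show; a minimal sketch, assuming only the fields the spider and the MySQL pipeline actually use (title, author, desc, url, content):
items.py (sketch):
import scrapy

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    desc = scrapy.Field()
    url = scrapy.Field()
    content = scrapy.Field()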
main.py:
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])
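Running main.py from the project root is the same as running scrapy crawl cnblogs --nolog on the command line; it exists mainly so the crawl can be started and debugged from an IDE.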
middlewares.py:
# selenium integration + a user-agent pool
import random

class Py1DownloaderMiddleware(object):
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6"
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
    def process_request(self, request, spider):
        # Request headers
        # print(request.headers)
        # request.headers['User-Agent'] = random.choice(self.user_agent_list)

        # Set cookies (not every request needs cookies -- add a check if necessary)
        # A cookie pool can be used here
        # print(request.cookies)
        # # import requests  # if you run your own cookie pool, fetch one like this
        # # ret = requests.get('127.0.0.1/get').json()['cookie']
        # # request.cookies = ret
        # request.cookies = {'name': 'lqz', 'age': 18}

        # Use a proxy (e.g. from a proxy pool)
        # print(request.meta)
        # request.meta['proxy'] = 'http://117.27.152.236:1080'
        return None
    def process_response(self, request, response, spider):
        from scrapy.http import HtmlResponse
        import time
        # The default downloader cannot execute JS. Load the page with selenium so the JS runs
        # (scrolling/sliding could also be driven here), then wrap the rendered HTML in a new
        # response object and return that instead of the original response.
        url = request.url
        spider.bro.get(url)
        time.sleep(2)  # give the page time to render before grabbing the source
        page_source = spider.bro.page_source
        new_response = HtmlResponse(url=url, body=page_source, encoding='utf-8', request=request)
        return new_response
    # Exception handling
    # def process_exception(self, request, exception, spider):
    #     from scrapy.http import Request
    #     print('xxxx')
    #     # request.url = 'https://www.baidu.com/'
    #     request = Request(url='https://www.baidu.com/')
    #     return request  # returning a Request reschedules it
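As written, process_response pushes every request through selenium, which is slow. A minimal sketch of an alternative (not part of the original template) that only renders requests explicitly flagged through meta, using a hypothetical 'use_selenium' key:
    def process_response(self, request, response, spider):
        from scrapy.http import HtmlResponse
        import time
        if not request.meta.get('use_selenium'):  # hypothetical flag set by the spider
            return response  # leave ordinary responses untouched
        spider.bro.get(request.url)
        time.sleep(2)  # let the JS finish rendering
        return HtmlResponse(url=request.url, body=spider.bro.page_source,
                            encoding='utf-8', request=request)
The spider would then yield Request(url, meta={'use_selenium': True}) only for the pages that actually need JS.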
pipelines.py:
# write scraped items to MySQL (synchronously)
import pymysql

class MysqlArticlePipeline(object):
def open_spider(self, spider):
self.conn = pymysql.connect(host='127.0.0.1', user='root', password="123",
database='cnblogs', port=3306)
    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        # parameterized query: let the driver handle quoting/escaping of the values
        sql = "insert into article (title, author, url, `desc`, content) values (%s, %s, %s, %s, %s)"
        cursor.execute(sql, (item['title'], item['author'], item['url'], item['desc'], item['content']))
        self.conn.commit()
        return item
def close_spider(self, spider):
self.conn.close()
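The pipeline assumes an article table already exists in the cnblogs database. A minimal sketch of creating it from open_spider (the column types are assumptions, not from the original notes):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password="123",
                                    database='cnblogs', port=3306)
        cursor = self.conn.cursor()
        # create the table the pipeline writes to, if it does not exist yet
        cursor.execute("""
            create table if not exists article (
                id int primary key auto_increment,
                title varchar(255),
                author varchar(255),
                url varchar(512),
                `desc` text,
                content longtext
            )
        """)
        self.conn.commit()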
settings.py:
BOT_NAME = 'py1'
SPIDER_MODULES = ['py1.spiders']
NEWSPIDER_MODULE = 'py1.spiders'
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 3  # delay (seconds) between requests
COOKIES_ENABLED = False  # disable when cookies are not needed
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
    'Referer': '',  # hotlink-protection check, e.g. for images
'cookie': '',
}
DOWNLOADER_MIDDLEWARES = {
'py1.middlewares.Py1DownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'py1.pipelines.MysqlArticlePipeline': 300,
}
# Redis configuration:
# MySQL configuration:
# scrapy-redis distributed crawler settings
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Note: this redefines ITEM_PIPELINES and replaces the dict above, so list every pipeline that should stay active
ITEM_PIPELINES = {
    'py1.pipelines.MysqlArticlePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 301,
}
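scrapy-redis also needs to know where Redis is running; a minimal sketch of the connection settings, assuming a local Redis on the default port:
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
SCHEDULER_PERSIST = True  # keep the Redis request queue and dupefilter between runs instead of clearing them
For a fully distributed crawl, CnblogsSpider would typically subclass scrapy_redis.spiders.RedisSpider and read its start URLs from a redis_key instead of start_urls.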