  • Scrapy random middleware configuration (random User-Agent and proxy IP pool)

    Adding a random User-Agent to Scrapy:

    1. pip install scrapy-fake-useragent
       (the UserAgent class used below is provided by the fake-useragent package;
        install it with pip install fake-useragent if it is not already present)
    
    2. In settings.py add:
    DOWNLOADER_MIDDLEWARES = {
        'lagoujob.middlewares.RandomUserAgent': 1,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }
    
    3. In middlewares.py add:
    from fake_useragent import UserAgent

    class RandomUserAgent(object):
        def process_request(self, request, spider):
            # pick a fresh random User-Agent for every outgoing request
            ua = UserAgent()
            request.headers.setdefault("User-Agent", ua.random)
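
    To confirm that the random User-Agent is actually applied, here is a minimal sketch of a
    throwaway spider (not from the original post; it assumes the middleware above is enabled
    and uses httpbin.org/user-agent purely as an echo service):

    import scrapy


    class UACheckSpider(scrapy.Spider):
        name = 'ua_check'
        start_urls = ['https://httpbin.org/user-agent']

        def parse(self, response):
            # httpbin echoes back the User-Agent header it received
            self.logger.info('UA seen by the server: %s', response.text)
            # the header Scrapy attached can also be read from the request itself
            self.logger.info('UA on the request: %s',
                             response.request.headers.get('User-Agent'))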
    

    Adding an IP proxy pool to Scrapy:

       For reference, some free proxy IP sites (mainland China):
                        http://www.xicidaili.com/wt
                        https://www.kuaidaili.com/free/
                        http://www.youdaili.net/Daili/guonei/
                        http://ip.zdaye.com/FreeIPlist.html
    
    

    Configuration:

    Add a proxy middleware class in middlewares.py:
    import random
    # IPPOOL is defined in settings.py (see below); "proxy1" is the project name
    from proxy1.settings import IPPOOL


    class ProxychiMiddleware(object):
        # called for every request before it is sent downstream
        def process_request(self, request, spider):
            # for a private (authenticated) proxy, use the form
            # request.meta['proxy'] = 'http://user:password@114.212.12.4:3128'
            # (see the authenticated-proxy sketch further below)
            # pick a random proxy from the pool
            this_ip = random.choice(IPPOOL)['ipaddr']
            request.meta['proxy'] = 'http://' + this_ip
            return None
    
    Enable the proxy class from middlewares.py in settings.py:
    DOWNLOADER_MIDDLEWARES = {
        # the dotted path must match your project name and the class name in middlewares.py
        'proxy1.middlewares.ProxychiMiddleware': 543,
    }
     
    # define the proxy pool (in settings.py)
    IPPOOL = [
            {"ipaddr":"123.55.1.75:30325"},
            {"ipaddr":"220.184.213.12:6666"},
            {"ipaddr":"171.38.85.82:8123"},
            {"ipaddr":"111.121.193.214:3128"},
            {"ipaddr":"58.48.193.180:3128"},
            {"ipaddr":"171.37.29.26:9797"},
            {"ipaddr":"119.188.162.165:8081"} ]
    
    

    Overriding start_requests:

    import scrapy
    import random

    # a small proxy pool (full proxy URLs, scheme included)
    proxy_pool = ['http://111.155.116.215:8123']


    class ProxydemoSpider(scrapy.Spider):
        name = 'proxydemo'
        allowed_domains = ['www.baidu.com']
        start_urls = ['http://www.baidu.com/']

        def start_requests(self):
            for url in self.start_urls:
                proxy_addr = random.choice(proxy_pool)  # pick a random proxy
                # attach the proxy through the meta parameter
                yield scrapy.Request(url=url, callback=self.parse, meta={'proxy': proxy_addr})

        def parse(self, response):
            print('proxy simida')
    
                proxy_addr = "http://ip:port"
                加密:
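
    A minimal sketch (not part of the original post; the proxy address and user:password are
    placeholders) showing two common ways to attach credentials: embedding them in the proxy
    URL, which recent Scrapy versions' HttpProxyMiddleware splits off into a
    Proxy-Authorization header, or setting that header yourself with base64-encoded
    "user:password".

    import base64

    import scrapy


    class PrivateProxySpider(scrapy.Spider):
        name = 'private_proxy_demo'
        start_urls = ['http://www.baidu.com/']

        def start_requests(self):
            for url in self.start_urls:
                # option 1: credentials embedded in the proxy URL
                meta = {'proxy': 'http://user:password@114.212.12.4:3128'}

                # option 2: set the Proxy-Authorization header yourself
                auth = base64.b64encode(b'user:password').decode('ascii')
                headers = {'Proxy-Authorization': 'Basic ' + auth}

                yield scrapy.Request(url, callback=self.parse, meta=meta, headers=headers)

        def parse(self, response):
            self.logger.info('fetched %s through the private proxy', response.url)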
    

    Scrapy multi-level crawling (list page, then nested detail pages):

    # -*- coding: utf-8 -*-
    import scrapy
    from Tencent.items import TencentItem
     
     
    class TencentSpider(scrapy.Spider):
        # spider name
        name = 'tencent'
        # domains the spider is allowed to crawl
        allowed_domains = ['www.xxx.com']
        # base URL used to build absolute links
        base_url = 'https://www.xxx.com/'
        # entry URL for the crawl
        start_urls = ['https://www.xxx.com/position.php']
        # page counter, starts at 1
        count = 1
        # number of pages to crawl (1 = crawl only the first page)
        page_end = 1

        def parse(self, response):
            # list rows: both odd and even table rows
            nodeList = response.xpath("//table[@class='tablelist']/tr[@class='odd'] | //table[@class='tablelist']/tr[@class='even']")
            for node in nodeList:
                item = TencentItem()
     
                item['title'] = node.xpath("./td[1]/a/text()").extract()[0]
                if len(node.xpath("./td[2]/text()")):
                    item['position'] = node.xpath("./td[2]/text()").extract()[0]
                else:
                    item['position'] = ''
                item['num'] = node.xpath("./td[3]/text()").extract()[0]
                item['address'] = node.xpath("./td[4]/text()").extract()[0]
                item['time'] = node.xpath("./td[5]/text()").extract()[0]
                item['url'] = self.base_url + node.xpath("./td[1]/a/@href").extract()[0]
                # follow the detail-page URL, passing the partial item along in meta
                yield scrapy.Request(item['url'], meta={'item': item}, callback=self.detail_parse)

                # since a deeper page is crawled, do not yield the item here
                # yield item

            # pagination: follow the "next page" link
            nextPage = response.xpath("//a[@id='next']/@href").extract()[0]
            # page-count limit and last-page check
            if self.count < self.page_end and nextPage != 'javascript:;':
                if nextPage is not None:
                    # increment the page counter
                    self.count = self.count + 1
                    # request the next page
                    yield scrapy.Request(self.base_url + nextPage, callback=self.parse)
            else:
                # end of the crawl
                return None
            
        def detail_parse(self, response):
            # receive the item built on the previous level
            item = response.meta['item']
            # first-level detail-page extraction
            item['zhize'] = response.xpath("//*[@id='position_detail']/div/table/tr[3]/td/ul[1]").xpath('string(.)').extract()[0]
            item['yaoqiu'] = response.xpath("//*[@id='position_detail']/div/table/tr[4]/td/ul[1]").xpath('string(.)').extract()[0]
            # request the second-level detail page ("&123" is just a dummy URL suffix)
            yield scrapy.Request(item['url'] + "&123", meta={'item': item}, callback=self.detail_parse2)
            # since a deeper page is crawled, do not return the item here
            # return item

        def detail_parse2(self, response):
            # receive the item built on the previous levels
            item = response.meta['item']
            # second-level detail-page extraction
            item['test'] = "111111111111111111"
            # finally hand the completed item back to the engine
            return item
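
    For completeness, a sketch of the TencentItem the spider fills in; the original items.py
    is not shown in the post, so the field list below is inferred from the spider code above.

    # Tencent/items.py
    import scrapy


    class TencentItem(scrapy.Item):
        title = scrapy.Field()     # job title
        position = scrapy.Field()  # job category
        num = scrapy.Field()       # number of openings
        address = scrapy.Field()   # work location
        time = scrapy.Field()      # publish date
        url = scrapy.Field()       # detail-page URL
        zhize = scrapy.Field()     # responsibilities (first-level detail page)
        yaoqiu = scrapy.Field()    # requirements (first-level detail page)
        test = scrapy.Field()      # placeholder filled on the second-level page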
    