  • Scrapy anti-scraping measures (part 2)

    Implementing the measures:

    Disabling cookies:

    In Scrapy, cookies are enabled by default (the stock settings.py ships the switch commented out: #COOKIES_ENABLED = False).
    	Set COOKIES_ENABLED = False to disable them; for requests that still need a cookie, put it directly into the request headers instead.
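    A minimal sketch of the corresponding settings.py line (the comment is my gloss, not from the original post):

    # settings.py
    COOKIES_ENABLED = False   # Scrapy stops managing cookies; the spider below carries its cookie in the headers it sends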
        
    import json
    import random
    import time
     
    import scrapy
     
    from ..items import LagouItem   # the project's items module (relative path assumed)
     
    class LagouspiderSpider(scrapy.Spider):
        name = "lagouspider"
        allowed_domains = ["www.lagou.com"]
     
        url = 'https://www.lagou.com/jobs/positionAjax.json?'#city=%E6%B7%B1%E5%9C%B3&needAddtionalResult=false'
        page = 1
        allpage = 0
     
        cookie = 'JSESSIONID=ABAAABAAAFCAAEG34858C57541C1F9DF75AED18C3065736; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524281748;  04797acf-4515-11e8-90b5- LGSID=20180421130026-e7e614d7-4520-PRE_SITE=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3Fcity%3D%25E6%25B7%25B1%25E5%259C%25B3%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F4302345.html; LGRID=20180421130208-24b73966-4521-11e8-90f2-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1524286956'
        headers = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                   'Referer': 'https://www.lagou.com/jobs/list_python?city=%E6%B7%B1%E5%9C%B3&cl=false&fromSearch=true&labelWords=&suginput=',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
                   'cookie': cookie }
     
        def start_requests(self):
           yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
                'first': 'true','pn': str(self.page),'kd': 'python','city': '深圳'}, callback=self.parse)
    
        --> This is a POST request: FormRequest URL-encodes formdata and sends it as the request body with Content-Type application/x-www-form-urlencoded.
    

    Setting a download delay:

    In Scrapy, the download delay is disabled by default (the setting ships commented out: #DOWNLOAD_DELAY = 3).
    
    Uncomment it, or insert time.sleep(random.randint(5, 10)) between requests in the spider, as the parse() method below does.
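    For the settings.py route, a minimal sketch (the values are examples, not taken from the original post):
    
    # settings.py
    DOWNLOAD_DELAY = 3                # wait roughly 3 seconds between requests
    RANDOMIZE_DOWNLOAD_DELAY = True   # Scrapy varies the actual wait between 0.5x and 1.5x of DOWNLOAD_DELAY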
    
    def parse(self, response):
            #print(response.text)
            item = LagouItem()
            data = json.loads(response.text)
     
            totalCount = data['content']['positionResult']['totalCount']   # total number of postings
            resultSize = data['content']['positionResult']['resultSize']   # postings per page
     
            result = data['content']['positionResult']['result']   # list of the 15 postings on this page
            for each in result:
                for field in item.fields:
                    if field in each.keys():
                        item[field] = each.get(field)
                yield item
     
            time.sleep(random.randint(5, 10))   # crude blocking delay between pages (note: this stalls the whole Twisted reactor)
     
            if int(resultSize):
                self.allpage = int(totalCount) // int(resultSize) + 1
                if self.page < self.allpage:
                    self.page += 1
                    yield scrapy.FormRequest(self.url, headers=self.headers, formdata={
                'first': 'false','pn': str(self.page),'kd': 'python','city': '深圳'}, callback=self.parse)
    
    

    Setting the User-Agent and proxy IPs:

    settings.py:
    DOWNLOADER_MIDDLEWARES = {
        'doubanMongo.middlewares.RandomUserAgent': 300,
        'doubanMongo.middlewares.RandomProxy':400
    }
     
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Linux; U; Android 4.0.3; zh-cn; M032 Build/IML74K) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13'
    ]
     
    PROXIES=[{'ip_port':'117.48.214.249:16817','user_passwd':'632345244:4tf9pcpw'},   # 'user:password' for an authenticated proxy
            #{'ip_port':'117.48.214.249:16817','user_passwd':''},
            #{'ip_port':'117.48.214.249:16817','user_passwd':''},
            #{'ip_port':'117.48.214.249:16817','user_passwd':''}
           ]
    

    In middlewares.py:

    from scrapy.utils.project import get_project_settings   # scrapy.conf is deprecated; load the project settings this way
    import base64
    import random
     
    settings = get_project_settings()
     
     
    class RandomProxy(object):
     
        def process_request(self, request, spider):
            proxy = random.choice(settings["PROXIES"])
            if not proxy['user_passwd']:
                # proxy without authentication
                request.meta['proxy'] = 'http://' + proxy['ip_port']
            else:
                # base64-encode the 'user:password' pair for HTTP Basic proxy auth
                b_pw = bytes(proxy['user_passwd'], encoding="utf-8")             # str -> bytes
                base64_userpasswd = base64.b64encode(b_pw)                       # b64encode expects bytes (encodestring was removed in Python 3.9)
                # put it into the Proxy-Authorization header the proxy server expects
                s_base64_userpasswd = str(base64_userpasswd, encoding="utf-8")   # bytes -> str
                request.headers['Proxy-Authorization'] = 'Basic ' + s_base64_userpasswd
                request.meta['proxy'] = "http://" + proxy['ip_port']
     
     
    class RandomUserAgent(object):
     
        def process_request(self, request, spider):
            useragent = random.choice(settings["USER_AGENTS"])
            #print(useragent)
            request.headers.setdefault('User-Agent',useragent)
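    A design note (not from the original post): instead of loading the project settings at import time, a middleware can receive them through from_crawler. A minimal sketch of that variant for the user-agent middleware:
    
    class RandomUserAgentFromCrawler(object):
     
        def __init__(self, user_agents):
            self.user_agents = user_agents
     
        @classmethod
        def from_crawler(cls, crawler):
            # read USER_AGENTS from the crawler's settings when Scrapy builds the middleware
            return cls(user_agents=crawler.settings.getlist('USER_AGENTS'))
     
        def process_request(self, request, spider):
            request.headers.setdefault('User-Agent', random.choice(self.user_agents))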
    
    

    Selenium + Scrapy:

    PhantomJS:
        	https://phantomjs.org/download.html
    Open the extracted phantomjs-2.1.1-windows folder, go into its bin folder, and copy that path.
    
    Add it to the PATH environment variable, then verify the install with: phantomjs -v
    
    To run it directly, execute: phantomjs.exe
    
    pip install selenium pymongo
    
    scrapy startproject scrapyseleniumtest
    scrapy genspider taobao www.taobao.com
    
    
    from scrapy import Item, Field
    class ProductItem(Item):
       collection = 'products'   # MongoDB collection name, used by MongoPipeline below
       image = Field()
       price = Field()
       deal = Field()
       title = Field()
       shop = Field()
       location = Field()
    
    
    from scrapy import Request, Spider
    from urllib.parse import quote
    from scrapyseleniumtest.items import ProductItem
     
    class TaobaoSpider(Spider):
       name = 'taobao'
       allowed_domains = ['www.taobao.com']
       base_url = 'https://s.taobao.com/search?q='
     
       def start_requests(self):
           for keyword in self.settings.get('KEYWORDS'):
               for page in range(1, self.settings.get('MAX_PAGE') + 1):
                   url = self.base_url + quote(keyword)
                   # every page shares the same search URL: the page number travels in meta,
                   # and dont_filter=True keeps Scrapy's dedup filter from dropping the repeats
                   yield Request(url=url, callback=self.parse, meta={'page': page}, dont_filter=True)
    

    settings.py:

    KEYWORDS = ['iPad']
    MAX_PAGE = 100
    
    DOWNLOADER_MIDDLEWARES = {
       'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
    }
    
    ITEM_PIPELINES = {
       'scrapyseleniumtest.pipelines.MongoPipeline': 300,
    }
    MONGO_URI = 'localhost'
    MONGO_DB = 'taobao'
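    The Selenium middleware below also reads two settings that are not shown above; plausible values (assumptions, not from the original post):
    
    SELENIUM_TIMEOUT = 20   # seconds that page loads and WebDriverWait may take
    PHANTOMJS_SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']   # example PhantomJS flags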
    

    Integrating Selenium:

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from scrapy.http import HtmlResponse
    from logging import getLogger
     
    class SeleniumMiddleware():
       def __init__(self, timeout=None, service_args=[]):
           self.logger = getLogger(__name__)
           self.timeout = timeout
           self.browser = webdriver.PhantomJS(service_args=service_args)
           self.browser.set_window_size(1400, 700)
           self.browser.set_page_load_timeout(self.timeout)
           self.wait = WebDriverWait(self.browser, self.timeout)
     
       def __del__(self):
           self.browser.close()
     
       def process_request(self, request, spider):
           """
           用PhantomJS抓取页面
           :param request: Request对象
           :param spider: Spider对象
           :return: HtmlResponse
           """
           self.logger.debug('PhantomJS is Starting')
           page = request.meta.get('page', 1)
           try:
               self.browser.get(request.url)
               if page > 1:
                   input = self.wait.until(
                       EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                   submit = self.wait.until(
                       EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                   input.clear()
                   input.send_keys(page)
                   submit.click()
               self.wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
               self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
               return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
           except TimeoutException:
               return HtmlResponse(url=request.url, status=500, request=request)
     
       @classmethod
       def from_crawler(cls, crawler):
           return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                      service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
        
        --> PhantomJS loads the page in place of Scrapy's downloader; because process_request returns a Response, Scrapy skips its own download and hands the rendered HtmlResponse straight to the spider for parsing.
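    PhantomJS itself is no longer maintained, so a headless Chrome driver is a common drop-in replacement; a minimal sketch of the constructor change (an assumption, not part of the original post; it needs chromedriver on PATH and a reasonably recent Selenium):
    
    from selenium import webdriver
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')           # run Chrome without a visible window
    browser = webdriver.Chrome(options=options)  # replaces webdriver.PhantomJS(service_args=...)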
    

    Parsing the page:

    def parse(self, response):
       products = response.xpath(
           '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')
       for product in products:
           item = ProductItem()
           item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
           item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
           item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
           item['image'] = ''.join(product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
           item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
           item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
           yield item
    

    Storing the results:

    import pymongo
     
    class MongoPipeline(object):
       def __init__(self, mongo_uri, mongo_db):
           self.mongo_uri = mongo_uri
           self.mongo_db = mongo_db
     
       @classmethod
       def from_crawler(cls, crawler):
           return cls(mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DB'))
     
       def open_spider(self, spider):
           self.client = pymongo.MongoClient(self.mongo_uri)
           self.db = self.client[self.mongo_db]
     
       def process_item(self, item, spider):
           self.db[item.collection].insert_one(dict(item))   # insert_one(): Collection.insert() was removed in PyMongo 4
           return item
     
       def close_spider(self, spider):
           self.client.close()
    

    main.py

    from scrapy import cmdline
    
    cmdline.execute('scrapy crawl taobao --nolog '.split())
    