  • Scrapy template

    Scrapy template:

    spider.py:

    import scrapy
    from scrapy.http import Request
    from selenium import webdriver

    from crawl_cnblogs.crawl_cnblogs.items import ArticleItem

    # import scrapy_redis  # only needed when switching to scrapy_redis.spiders.RedisSpider


    # Note: the method that actually starts the crawl is start_requests(), which by
    # default builds Requests from start_urls.
    class CnblogsSpider(scrapy.Spider):
        name = 'cnblogs'
        allowed_domains = ['www.cnblogs.com']
        start_urls = ['https://www.cnblogs.com/']
        # The downloader middleware and closed() below rely on self.bro,
        # so the driver is created here (chromedriver in the project root).
        bro = webdriver.Chrome('./chromedriver')

        # def start_requests(self):
        #     yield Request(url='http://www.baidu.com')

        def parse(self, response):
            # print(response.text)
            div_list = response.css('div.post_item')
            for div in div_list:
                item = ArticleItem()
                title = div.xpath('./div[2]/h3/a/text()').extract_first()
                item['title'] = title
                author = div.xpath('./div[2]/div/a/text()').extract_first()
                item['author'] = author
                desc = div.xpath('./div[2]/p/text()').extract()[-1]
                item['desc'] = desc
                url = div.xpath('./div[2]/div/span[2]/a/@href').extract_first()
                item['url'] = url

                # Two jobs here: crawl deeper into each article (detail page) and,
                # below, crawl wider across the list pages.
                # Yielding an item sends it to the pipelines to be saved; yielding a
                # Request schedules another crawl. callback says where the response
                # is parsed when it comes back; the default is parse().
                yield Request(url=url, callback=self.parse_detail, meta={'item': item})

            # CSS selector for an attribute: ::attr(attribute_name)
            next_url = 'https://www.cnblogs.com' + response.css('div.pager>a:last-child::attr(href)').extract_first()
            # Either form works; callback defaults to parse()
            # yield Request(url=next_url, callback=self.parse)
            yield Request(url=next_url)

        def parse_detail(self, response):
            item = response.meta.get('item')
            print(item)

            content = response.css('#post_detail').extract_first()
            item['content'] = str(content)
            yield item

        def closed(self, spider):
            print('Spider finished; closing the browser')
            self.bro.close()
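
    The spider fills an ArticleItem that the original post does not show. A minimal
    items.py sketch, assuming the fields are exactly the keys the spider assigns
    (title, author, desc, url, content):

    items.py:

    import scrapy


    class ArticleItem(scrapy.Item):
        # one Field per key the spider writes
        title = scrapy.Field()
        author = scrapy.Field()
        desc = scrapy.Field()
        url = scrapy.Field()
        content = scrapy.Field()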
    

    main.py:

    from scrapy.cmdline import execute

    # 'xxx' is a placeholder -- replace it with the spider's name, e.g. 'cnblogs'
    execute(['scrapy', 'crawl', 'xxx', '--nolog'])
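
    Running main.py is equivalent to executing "scrapy crawl xxx --nolog" from the
    project root; it simply lets the spider be started and debugged from an IDE.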
    

    Middleware (middlewares.py):

    # Add selenium rendering + a User-Agent pool
    import random


    class Scrapy02DownloaderMiddleware(object):
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6"
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        def process_request(self, request, spider):
            # Request headers: pick a random User-Agent from the pool
            # print(request.headers)
            # request.headers['User-Agent'] = random.choice(self.user_agent_list)

            # Set cookies (not every request needs cookies -- add a check if necessary).
            # A cookie pool can also be used.
            # print(request.cookies)
            # # import requests  # if you run your own cookie pool service, fetch one like this
            # # ret = requests.get('127.0.0.1/get').json()['cookie']
            # # request.cookies = ret
            # request.cookies = {'name': 'lqz', 'age': 18}

            # Use a proxy (e.g. from a proxy pool)
            # print(request.meta)
            # request.meta['proxy'] = 'http://117.27.152.236:1080'
            return None
        def process_response(self, request, response, spider):
            from scrapy.http import HtmlResponse
            import time
            # The plain download cannot execute JS (e.g. scrolling), so load the page
            # with selenium, let the JS run, then wrap the rendered page in a new
            # HtmlResponse and return that instead of the original response.
            url = request.url
            spider.bro.get(url)
            time.sleep(2)
            page_source = spider.bro.page_source
            new_response = HtmlResponse(url=url, body=page_source, encoding='utf-8', request=request)
            return new_response
    
        # Exception handling
        # def process_exception(self, request, exception, spider):
        #     from scrapy.http import Request
        #     print('xxxx')
        #     # request.url = 'https://www.baidu.com/'
        #     request = Request(url='https://www.baidu.com/')
        #     return request
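
    Because process_response drives the page through spider.bro, the spider must carry
    a selenium driver. A minimal sketch of a headless setup (the ChromeOptions flags
    and the ./chromedriver path are assumptions, not part of the original post):

    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')      # render pages without opening a browser window
    options.add_argument('--disable-gpu')
    bro = webdriver.Chrome('./chromedriver', options=options)  # assumes chromedriver in the project root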
    
    

    pipelines.py:

    # Write items to MySQL synchronously
    import pymysql


    class MysqlArticlePipeline(object):
        def open_spider(self, spider):
            self.conn = pymysql.connect(host='127.0.0.1', user='root', password="123",
                                        database='cnblogs', port=3306)

        def process_item(self, item, spider):
            cursor = self.conn.cursor()
            # Parameterized query, so quotes in the scraped text cannot break the SQL
            sql = "insert into article (title,author,url,`desc`,content) values (%s,%s,%s,%s,%s)"
            cursor.execute(sql, (item['title'], item['author'], item['url'], item['desc'], item['content']))
            self.conn.commit()
            return item

        def close_spider(self, spider):
            self.conn.close()
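
    The pipeline assumes an article table already exists in the cnblogs database. A
    one-off sketch of a matching table (column types and lengths are assumptions):

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', user='root', password='123',
                           database='cnblogs', port=3306)
    with conn.cursor() as cursor:
        # column names mirror the insert in the pipeline
        cursor.execute("""
            create table if not exists article (
                id int primary key auto_increment,
                title varchar(255),
                author varchar(128),
                url varchar(512),
                `desc` varchar(1024),
                content longtext
            )
        """)
    conn.commit()
    conn.close()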
    

    settings.py:

    BOT_NAME = 'py1'
    
    SPIDER_MODULES = ['py1.spiders']
    NEWSPIDER_MODULE = 'py1.spiders'
    
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    CONCURRENT_REQUESTS = 32
    DOWNLOAD_DELAY = 3     # seconds to wait between consecutive requests
    
    COOKIES_ENABLED = False     # turn off when cookies are not needed
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'Referer': '',      # anti-hotlink check (e.g. for images)
      'cookie': '',
    }
    
    DOWNLOADER_MIDDLEWARES = {
       # the dotted path must point at the class actually defined in middlewares.py
       # (e.g. 'py1.middlewares.Scrapy02DownloaderMiddleware')
       'py1.middlewares.Py1DownloaderMiddleware': 543,
    }
    ITEM_PIPELINES = {
       'py1.pipelines.Py1Pipeline': 300,
    }
    
    # Redis configuration:

    # MySQL configuration:

    # Distributed crawler (scrapy-redis) configuration
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Ensure all spiders share same duplicates filter through redis.
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    
    # Note: this second ITEM_PIPELINES dict replaces the one defined above; merge the
    # entries into a single dict if the MySQL/Py1 pipeline should stay enabled.
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 300
    }
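
    The "# Redis configuration:" placeholder above is empty in the original post. A
    hedged sketch of the connection settings scrapy-redis reads (the host, port and
    persistence flag below are assumed values for a local Redis):

    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
    # or a single URL instead of host/port:
    # REDIS_URL = 'redis://127.0.0.1:6379/0'
    SCHEDULER_PERSIST = True      # keep the request queue and dupefilter between runs

    To feed start URLs from Redis instead of start_urls, the spider can inherit from
    scrapy_redis.spiders.RedisSpider and define a redis_key.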
    