  • Scrapy crawler

    a. Configuration file

    # settings.py
    DEPTH_LIMIT = 1         # how many levels deep the "recursive" crawl is allowed to go
    ROBOTSTXT_OBEY = False  # robots.txt declares what may be crawled; False means do not obey it
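
    These project-wide settings can also be overridden for a single spider through the custom_settings class attribute; a minimal sketch (the spider name and URL are placeholders, not part of the original project):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = "example"                        # hypothetical spider
        start_urls = ["http://example.com/"]

        # per-spider overrides; these take precedence over settings.py
        custom_settings = {
            "DEPTH_LIMIT": 2,
            "ROBOTSTXT_OBEY": False,
        }

        def parse(self, response):
            pass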
    

      

     

    b. Selectors

    .//                     # any descendant of the current node
    ./                      # direct children
    ./div                   # div tags among the children
    ./div[@id='i1']         # div children whose id is 'i1'
    obj.extract()           # convert every selector in the list to a string => list of strings
    obj.extract_first()     # convert and return only the first element of the list
    //div/text()            # the text of a tag
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    from scrapy.selector import Selector, HtmlXPathSelector
    from scrapy.http import HtmlResponse
    html = """<!DOCTYPE html>
    <html>
        <head lang="en">
            <meta charset="UTF-8">
            <title></title>
        </head>
        <body>
            <ul>
                <li class="item-"><a id='i1' href="link.html">first item</a></li>
                <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
                <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
            </ul>
            <div><a href="llink2.html">second item</a></div>
        </body>
    </html>
    """
    response = HtmlResponse(url='http://example.com', body=html,encoding='utf-8')
    # hxs = HtmlXPathSelector(response)
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[2]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[@id]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[@id="i1"]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract()
    # print(hxs)
    # hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()
    # print(hxs)
    # hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
    # print(hxs)
    # hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
    # print(hxs)
     
    # ul_list = Selector(response=response).xpath('//body/ul/li')
    # for item in ul_list:
    #     v = item.xpath('./a/span')
    #     # 或
    #     # v = item.xpath('a/span')
    #     # 或
    #     # v = item.xpath('*/a/span')
    #     print(v)
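
    A usage note on the sample above: extract() always returns a list (possibly empty), while extract_first() returns the first match or None and accepts a default; newer Scrapy/parsel versions also offer get()/getall() as equivalents. A minimal sketch, continuing from the response built above:

    # a list of strings, one per matching node
    hrefs = Selector(response=response).xpath('//a/@href').extract()

    # the first match, or the supplied default instead of None
    first = Selector(response=response).xpath('//a[@id="i1"]/@href').extract_first(default='')

    # get()/getall() are the newer spellings of extract_first()/extract()
    # first = Selector(response=response).xpath('//a[@id="i1"]/@href').get()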

    c. Structured processing (items and pipelines)

    settings.py
    
    ITEM_PIPELINES = {
       'day96.pipelines.Day96Pipeline': 300,
    }
    
    DB = "....."
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    from scrapy.exceptions import DropItem
    
    class Day96Pipeline(object):

        def __init__(self, conn_str):
            self.conn_str = conn_str
    
    
        @classmethod
        def from_crawler(cls, crawler):
            """
            Called at startup to create the pipeline object
            :param crawler:
            :return:
            """
            conn_str = crawler.settings.get('DB')
            return cls(conn_str)
    
        def open_spider(self, spider):
            """
            Called when the spider starts
            :param spider:
            :return:
            """
            self.conn = open(self.conn_str, 'a')
    
        def close_spider(self, spider):
            """
            Called when the spider closes
            :param spider:
            :return:
            """
            self.conn.close()
    
        def process_item(self, item, spider):
            """
            Called every time an item needs to be persisted
            :param item:
            :param spider:
            :return:
            """
            # if spider.name == 'chouti':
            tpl = "%s\n%s\n\n" % (item['title'], item['href'])
            self.conn.write(tpl)

            # pass the item on to the next pipeline
            return item

            # or discard the item so later pipelines never see it:
            # raise DropItem()
    pipelines.py
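
    The pipeline above writes item['title'] and item['href'], so the project's items.py must define matching fields; a minimal sketch (the class name Day96Item is an assumption, not taken from the original project):

    # items.py
    import scrapy

    class Day96Item(scrapy.Item):
        # fields consumed by Day96Pipeline.process_item
        title = scrapy.Field()
        href = scrapy.Field()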

    d. Common commands

    scrapy startproject sp1
    cd sp1
    scrapy genspider baidu baidu.com      # create a spider
    scrapy crawl baidu
    scrapy crawl baidu --nolog
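
    A crawl can also be started from a plain Python script instead of the command line; a minimal sketch using CrawlerProcess (assumes it is run from inside the project directory so the project settings can be found):

    # run_spider.py
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('baidu')   # spider name, as created by `scrapy genspider baidu baidu.com`
    process.start()          # blocks until the crawl finishes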
    

    e. Directory structure

    sp1
        - scrapy.cfg         # top-level configuration file
        - sp1
            - spiders        # spiders directory
            - items.py       # item (structured data) definitions
            - pipelines.py   # persistence
            - middlewares.py # middleware
            - settings.py    # project settings
    

      

      

      

    Examples

    # -*- coding: utf-8 -*-
    import scrapy
    import sys
    import io
    from scrapy.selector import Selector,HtmlXPathSelector
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['http://dig.chouti.com/']
    
        def parse(self, response):
            hxs = Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]')
            for obj in hxs:
                a = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first()
                print(a.strip())
    Fetch Chouti news titles
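
    One caveat about the selector in this example: @class="show-content color-chag" only matches when the class attribute is exactly that string, so it silently returns None if the site adds or reorders classes, and a.strip() then raises. A hedged variant of the loop body using contains():

    # inside the for-obj loop above: match any <a> whose class attribute contains "show-content"
    a = obj.xpath('.//a[contains(@class, "show-content")]/text()').extract_first()
    if a:
        print(a.strip())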
    # -*- coding: utf-8 -*-
    import scrapy
    import sys
    import io
    from scrapy.selector import Selector,HtmlXPathSelector
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['http://dig.chouti.com/']
    
        visited_urls = set()
    
        def parse(self, response):
    
            # get the URLs of all page-number links on the current page
            hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
            for url in hxs:
                md5_url = self.md5(url)
                if md5_url in self.visited_urls:
                    print('already seen', url)
                else:
                    self.visited_urls.add(md5_url)
                    print(url)
    
        def md5(self,url):
            import hashlib
            obj = hashlib.md5()
            obj.update(bytes(url,encoding='utf-8'))
            return obj.hexdigest()
    Fetch all page-number links on the current Chouti page
    # -*- coding: utf-8 -*-
    import scrapy
    import sys
    import io
    from scrapy.http import Request
    from scrapy.selector import Selector,HtmlXPathSelector
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['http://dig.chouti.com/']
    
        visited_urls = set()
    
        def parse(self, response):
    
            # get the URLs of all page-number links on the current page
    
            hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
    
            for url in hxs:
                md5_url = self.md5(url)
                if md5_url in self.visited_urls:
                    pass
                else:
                    print(url)
                    self.visited_urls.add(md5_url)
                    url = "http://dig.chouti.com%s" %url
                    # hand the newly found URL to the scheduler
                    yield Request(url=url,callback=self.parse)
    
    
    
    
        def md5(self,url):
            import hashlib
            obj = hashlib.md5()
            obj.update(bytes(url,encoding='utf-8'))
            return obj.hexdigest()
    Fetch all Chouti pages
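
    parse() can yield items and new Requests from the same loop, so page links keep getting scheduled while the titles on each page are collected. A minimal sketch combining the two spiders above (the spider name is hypothetical; the XPath expressions are the ones used earlier and assume Chouti's markup is unchanged):

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    from scrapy.selector import Selector

    class ChoutiAllSpider(scrapy.Spider):
        name = 'chouti_all'
        allowed_domains = ['chouti.com']
        start_urls = ['http://dig.chouti.com/']

        def parse(self, response):
            # collect the news titles on the current page
            for obj in Selector(response=response).xpath('//div[@id="content-list"]/div[@class="item"]'):
                title = obj.xpath('.//a[@class="show-content color-chag"]/text()').extract_first()
                href = obj.xpath('.//a/@href').extract_first()
                if title and href:
                    yield {'title': title.strip(), 'href': href}   # plain dicts also flow through the item pipelines

            # schedule the pagination links; the dupefilter drops repeats
            for url in Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract():
                yield Request(url="http://dig.chouti.com%s" % url, callback=self.parse)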

    a. Avoiding duplicate URLs

    settings.py
    
    DUPEFILTER_CLASS = "day96.duplication.RepeatFilter"
    
    class RepeatFilter(object):
        def __init__(self):
            self.visited_set = set()
        @classmethod
        def from_settings(cls, settings):
            print('...')
            return cls()
    
        def request_seen(self, request):
            if request.url in self.visited_set:
                return True
            self.visited_set.add(request.url)
            return False
    
        def open(self):  # can return deferred
            print('open')
            pass
    
        def close(self, reason):  # can return a deferred
            print('close')
            pass
        def log(self, request, spider):  # log that a request has been filtered
            # print('log....')
            pass
    duplication.py
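
    For comparison, Scrapy's stock RFPDupeFilter keys on a fingerprint of the whole request (method, canonicalized URL, body) rather than on the raw URL string. A hedged sketch of the same filter built on that fingerprint (request_fingerprint is the helper available in the Scrapy versions these notes target):

    # fingerprint_duplication.py -- hypothetical variant of duplication.py
    from scrapy.utils.request import request_fingerprint

    class FingerprintRepeatFilter(object):
        """Like RepeatFilter above, but keyed on the request fingerprint instead of the raw URL."""

        def __init__(self):
            self.fingerprints = set()

        @classmethod
        def from_settings(cls, settings):
            return cls()

        def request_seen(self, request):
            fp = request_fingerprint(request)
            if fp in self.fingerprints:
                return True
            self.fingerprints.add(fp)
            return False

        def open(self):
            pass

        def close(self, reason):
            pass

        def log(self, request, spider):
            pass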
    # -*- coding: utf-8 -*-
    import scrapy
    import sys
    import io
    from scrapy.http import Request
    from scrapy.selector import Selector,HtmlXPathSelector
    
    
    class ChoutiSpider(scrapy.Spider):
        name = 'chouti'
        allowed_domains = ['chouti.com']
        start_urls = ['http://dig.chouti.com/']
    
        # for reference, the default filter is scrapy.dupefilter.RFPDupeFilter
        # (scrapy.dupefilters in newer Scrapy versions); the import is not needed here
        # from scrapy.dupefilter import RFPDupeFilter
    
        def parse(self, response):
            print(response.url)
    
            # get the URLs of all page-number links on the current page
    
            hxs = Selector(response=response).xpath('//div[@id="dig_lcpage"]//a/@href').extract()
    
            for url in hxs:
    
                url = "http://dig.chouti.com%s" %url
                # hand the newly found URL to the scheduler
                yield Request(url=url,callback=self.parse)
    
    
    
    
        def md5(self,url):
            import hashlib
            obj = hashlib.md5()
            obj.update(bytes(url,encoding='utf-8'))
            return obj.hexdigest()
    chouti.py
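
    When the spider above re-yields a page link it has already visited, request_seen() in RepeatFilter returns True and the request is silently dropped; the filter's log() stub is the natural place to make that visible. A minimal sketch of that method (with the stock RFPDupeFilter, setting DUPEFILTER_DEBUG = True in settings.py logs the same information):

        def log(self, request, spider):
            # called once for each request that request_seen() rejected
            spider.logger.debug("Duplicate request filtered: %s", request.url)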

      


    Chuanzhi Boke (itcast.cn)

    # crawl the names of the itcast.cn teachers

    #scrapy startproject mySpider

    #cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/items.py
    
        import scrapy
        class ItcastItem(scrapy.Item):   # named ItcastItem so the spider's import below resolves
            name = scrapy.Field()
            title = scrapy.Field()
            info = scrapy.Field()
    
    #cat /Users/huaixiaozi/PycharmProjects/mySpider/mySpider/spiders/itcastspider.py
        import scrapy
        from mySpider.items import ItcastItem
    
        # define the spider class
        class ItcastSpider(scrapy.Spider):
            # spider name
            name = "itcast"
            # domains the spider is allowed to crawl
            allowed_domains = ["itcast.cn"]
            # start URL
            start_urls = ["http://www.itcast.cn/channel/teacher.shtml#"]
    
            def parse(self, response):
    
                # use Scrapy's built-in XPath to select the root node of every teacher entry
                teacher_list = response.xpath('//div[@class="li_txt"]')
    
                teacherItem = []
                # iterate over the collection of root nodes
                for each in teacher_list:
                    item = ItcastItem()
                    name = each.xpath('./h3/text()').extract()
                    title = each.xpath('./h4/text()').extract()
                    info = each.xpath('./p/text()').extract()
                    print("--------------",type(name))
                    item['name'] = name[0]
                    item['title'] = title[0]
                    item['info'] = info[0]
    
                    teacherItem.append(item)
                return teacherItem
    
    
    
    
    
    # save the output to a JSON file
    scrapy crawl itcast -o itcast.json
    # save the output to a CSV file
    scrapy crawl itcast -o itcast.csv
    Crawl the itcast.cn teacher names
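
    When the built-in -o feed export is not flexible enough, the same output can be produced by a small pipeline registered in ITEM_PIPELINES; a minimal sketch (the class name JsonWriterPipeline is an assumption, not part of the original project):

    # pipelines.py
    import json

    class JsonWriterPipeline(object):
        """Write each item as one JSON line, roughly what `scrapy crawl itcast -o itcast.json` produces."""

        def open_spider(self, spider):
            self.file = open('itcast_items.jl', 'w', encoding='utf-8')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
            return item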

