  • scrapy

    # -*- coding: utf-8 -*-
    __author__ = 'Administrator'
    import scrapy

    class QuoteSpider(scrapy.Spider):
        name = 'poxiao'
        start_urls = ['https://www.poxiao.com/type/movie/']

        def parse(self, response):  # parse() is Scrapy's default callback
            quotes = response.xpath('//li/h3')  # one <h3> node per listed movie
            for quote in quotes:
                yield {
                    'name': quote.xpath('./a/text()').extract_first(),
                    # absolute URL of the movie's detail page
                    'author': 'https://www.poxiao.com' + quote.xpath('./a/@href').extract_first()
                }
            # follow the "next page" link once per page, not once per item
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow(next_page, self.parse)

    Use Scrapy to crawl the link addresses on a page
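
    The XPath expressions can be checked without running the spider by feeding sample markup to Scrapy's Selector. A minimal sketch; the HTML snippet below is invented for illustration:

    from scrapy import Selector

    html = '<li><h3><a href="/movie/abc.html">Example Movie</a></h3></li>'
    sel = Selector(text=html)
    for quote in sel.xpath('//li/h3'):
        print(quote.xpath('./a/text()').extract_first())  # -> Example Movie
        print(quote.xpath('./a/@href').extract_first())   # -> /movie/abc.html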

    scrapy runspider ***.py                    run the spider script directly (no project needed)

    scrapy runspider ***.py -o aa.json         save the scraped items as a JSON file

    scrapy runspider ***.py -o aa.csv -t csv   save as a CSV file (which Excel can open)
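
    Recent Scrapy versions infer the output format from the -o file extension, so -t csv is optional there; exports can also be configured in the project's settings.py instead. A sketch, assuming Scrapy 2.1 or newer:

    # settings.py -- same exports as the -o flags above
    FEEDS = {
        'aa.json': {'format': 'json', 'encoding': 'utf8'},
        'aa.csv': {'format': 'csv'},
    }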

    # -*- coding: utf-8 -*-
    import scrapy


    class MovieSpider(scrapy.Spider):
        name = 'movie'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/index_2.html',
                      'https://www.poxiao.com/type/movie/index_3.html']

        def parse(self, response):
            # '.../index_2.html' -> 'index_2'
            filename = response.url.split('/')[-1].split('.')[-2]
            with open(filename, 'wb') as f:
                f.write(response.body)

    Crawl and save the raw HTML source of each page
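
    A quick check of the filename logic in parse(), using one of the start URLs above:

    url = 'https://www.poxiao.com/type/movie/index_2.html'
    filename = url.split('/')[-1].split('.')[-2]
    print(filename)  # -> index_2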

    # -*- coding: utf-8 -*-
    import scrapy
    from meiju.items import MeijuItem


    class Mj100Spider(scrapy.Spider):
        name = 'mj100'
        allowed_domains = ['meijutt.com']
        start_urls = ['https://www.meijutt.com/new100.html']

        def parse(self, response):
            movies = response.xpath('//h5/a')
            for each_movie in movies:
                item = MeijuItem()
                item['name'] = each_movie.xpath('./text()').extract_first()
                yield item


    # pipelines.py
    class MeijuPipeline(object):
        def process_item(self, item, spider):
            with open('my_meiju.txt', 'a') as fp:
                fp.write(item['name'] + '\n')
            return item


    # items.py
    class MeijuItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        name = scrapy.Field()

    Meiju top-100 example. Note: you also need to enable the pipeline in settings.py by uncommenting the ITEM_PIPELINES entry (the one with priority 300), as shown below.
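
    A sketch of that settings.py entry, assuming the project is named meiju:

    # settings.py -- enable the pipeline; 300 is its priority (lower runs earlier)
    ITEM_PIPELINES = {
        'meiju.pipelines.MeijuPipeline': 300,
    }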

    # -*- coding: utf-8 -*-
    import scrapy
    from poxiao.items import PoxiaoItem


    class NameSpider(scrapy.Spider):
        name = 'name'
        allowed_domains = ['poxiao.com']
        start_urls = ['https://www.poxiao.com/type/movie/']

        def parse(self, response):
            movie = response.xpath('//div[@class="gkpic"]//img')
            for i in movie:
                item = PoxiaoItem()
                item['src'] = i.xpath('./@src').extract_first()
                item['name'] = i.xpath('./@alt').extract_first()
                yield item
            # follow pagination once per page, not once per image
            next_page = response.xpath('//div[@class="list-pager"]/a[last()-1]/@href').extract_first()
            if next_page:
                yield response.follow("https://www.poxiao.com" + next_page, self.parse)

    The first small spider

    # pipelines.py
    import os
    import requests


    class PoxiaoPipeline(object):
        def process_item(self, item, spider):
            # save each poster under d:\untitled1\poxiao, named after the movie
            filename = os.path.join(r"d:\untitled1\poxiao", item['name'] + '.jpg')
            with open(filename, 'wb') as f:
                f.write(requests.get(item['src']).content)
            return item
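
    A design note: process_item fetches each image synchronously with requests, which blocks the crawl while downloading. Scrapy's built-in ImagesPipeline downloads through the crawler instead; a minimal sketch of enabling it (it expects image_urls/images fields on the item and needs Pillow installed):

    # settings.py -- alternative to the requests-based pipeline above
    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
    IMAGES_STORE = r'd:\untitled1\poxiao'  # assumed target folder, as above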
  • Original article: https://www.cnblogs.com/xupanfeng/p/11765545.html