zoukankan      html  css  js  c++  java
  • python爬虫之scrapy

    环境:centos6 + python3
    安装:pip3 install scrapy
    报错:src/twisted/test/raiser.c:4:20: error: Python.h: No such file or directory
        src/twisted/test/raiser.c:6:6: error: #error Python headers needed to compile C extensions, please install development version of Python.
        error: command 'gcc' failed with exit status 1
    解决:需要安装Python的头文件和静态库包(python-devel)
    yum search python3 | grep devel #搜索python3下的devel
    yum install -y python34-devel.x86_64 #安装python34-devel.x86_64

    pip3 install scrapy #成功


    1、创建项目
    cd /home/chaoge/mypython/crawler/

    scrapy startproject myscrapy

    vi items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class MyscrapyItem(scrapy.Item):
        """Container for the fields of one scraped job posting."""
        # define the fields for your item here like:
        # Job title
        positionName = scrapy.Field()
        # Job link (URL)
        positionLink = scrapy.Field()
        # Job category
        positionType = scrapy.Field()
        # Number of openings
        peopleNum = scrapy.Field()
        # Work location
        workLocation = scrapy.Field()
        # Publication date
        publishTime = scrapy.Field()

    vi pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import json
    
    class MyscrapyPipeline(object):
        """Write every scraped item to ``tencent.json``, one JSON object per line."""

        def __init__(self):
            # Binary mode: process_item() encodes each line to UTF-8 explicitly.
            self.filename = open("tencent.json", "wb")

        def process_item(self, item, spider):
            """Append ``item`` to the output file as a UTF-8 JSON line.

            Returns the item unchanged so later pipelines can keep processing it.
            """
            # ensure_ascii=False keeps non-ASCII text readable instead of
            # escaping it; the "\n" terminator makes the file line-delimited.
            text = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.filename.write(text.encode("utf-8"))
            return item

        def close_spider(self, spider):
            """Close the output file when the spider finishes."""
            self.filename.close()

    vi settings.py 


    # Default headers for outgoing requests (browser-like User-Agent and Accept).
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/63.0.3239.108 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Language': 'en',  (left disabled)
    }
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'myscrapy.middlewares.MyscrapySpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'myscrapy.middlewares.MyscrapyDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Item pipelines enabled for this project, mapped to their order value.
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {'myscrapy.pipelines.MyscrapyPipeline': 300}
    2、创建基础类
    cd myscrapy/myscrapy/spiders

    scrapy genspider tencent "tencent.com"

    vi tencent.py 

    # -*- coding: utf-8 -*-
    import scrapy
    from myscrapy.items import MyscrapyItem
    
    
    class TencentSpider(scrapy.Spider):
        """Crawl the Tencent job listing pages and yield one item per table row.

        Pages are addressed by the ``start`` offset appended to ``url``. The
        spider begins at ``offset`` and advances by ``page_size`` until
        ``max_offset`` is reached.
        """

        name = 'tencent'
        allowed_domains = ['tencent.com']
        url = "http://hr.tencent.com/position.php?&start="
        offset = 0
        # Pagination step and upper bound for the offset.
        page_size = 10
        max_offset = 50
        start_urls = [url + str(offset)]

        # Item field name -> XPath relative to one result row.
        field_xpaths = (
            ('positionName', "./td[1]/a/text()"),
            ('positionLink', "./td[1]/a/@href"),
            ('positionType', "./td[2]/text()"),
            ('peopleNum', "./td[3]/text()"),
            ('workLocation', "./td[4]/text()"),
            ('publishTime', "./td[5]/text()"),
        )

        def parse(self, response):
            """Yield one ``MyscrapyItem`` per result row, then the next page request.

            Rows are the ``<tr>`` elements whose class is ``even`` or ``odd``.
            A cell without a matching node yields an empty string for its field.
            """
            for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
                item = MyscrapyItem()
                for field, xpath in self.field_xpaths:
                    # extract_first() tolerates an empty cell; extract()[0]
                    # raised IndexError and aborted the rest of the page,
                    # including the request for the next page.
                    item[field] = each.xpath(xpath).extract_first(default='')

                # Hand the item over to the item pipelines.
                yield item

            if self.offset < self.max_offset:
                self.offset += self.page_size
                # Queue the next page; its response is parsed by this method too.
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
            else:
                print("end.")
    3、执行

    scrapy crawl tencent


    模拟登录

    # -*- coding: utf-8 -*-
    import scrapy
    
    
    class RenrenspiderSpider(scrapy.Spider):
        """Submit a login form to renren.com and save the resulting page."""

        name = 'renrenspider'
        allowed_domains = ['renren.com']

        def start_requests(self):
            """Yield a single form POST carrying the login credentials."""
            credentials = {"email": "XXXX@163.com", "password": "XXXXXX"}
            login_request = scrapy.FormRequest(
                url='http://www.renren.com/PLogin.do',
                formdata=credentials,
                callback=self.parse_page,
            )
            yield login_request

        def parse_page(self, response):
            """Write the raw response body to ``info.html``."""
            with open("info.html", "wb") as out:
                out.write(response.body)

    下载图片:

    运行时报错: File "/usr/lib64/python3.4/site-packages/scrapy/pipelines/images.py", line 15, in <module>
        from PIL import Image
    ImportError: No module named 'PIL'
    解决办法:pip3 install pillow
    vi items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class DouyuItem(scrapy.Item):
        """Container for one nickname together with its image URL and saved path."""
        # define the fields for your item here like:
        # Nickname; also used to name the saved image file
        nickname = scrapy.Field()
        # URL of the image to download
        imagelink = scrapy.Field()
        # Location of the saved image, filled in by the pipeline
        imagepath = scrapy.Field()
    vi pipelines.py 
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import scrapy
    from scrapy.utils.project import get_project_settings
    from scrapy.pipelines.images import ImagesPipeline
    import json
    import os
    
    class DouyuPipeline(ImagesPipeline):
        """Download each item's image and rename the file after the item's nickname."""

        # Image storage directory, read from the project settings.
        IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

        def get_media_requests(self, item, info):
            """Yield the download request for the item's image URL."""
            yield scrapy.Request(item['imagelink'])

        def item_completed(self, results, item, info):
            """Rename the downloaded image to ``<nickname>.jpg`` and record its path.

            ``results`` is iterated as ``(ok, data)`` pairs; ``data['path']`` of
            a successful entry is the stored file path relative to IMAGES_STORE.

            Raises:
                DropItem: if no download succeeded, so there is no file to rename.
            """
            # Local import keeps this block self-contained within the file.
            from scrapy.exceptions import DropItem

            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                # Previously image_path[0] raised IndexError on a failed download.
                raise DropItem("Image download failed for %s" % item['imagelink'])

            source = os.path.join(self.IMAGES_STORE, image_paths[0])
            target = os.path.join(self.IMAGES_STORE, item['nickname'] + ".jpg")
            os.rename(source, target)
            # Store the real file name, including the ".jpg" suffix, so the
            # recorded path points at the file that was just renamed.
            item['imagepath'] = target
            return item

    vi spiders/douyuavatar.py

    # -*- coding: utf-8 -*-
    import scrapy
    from douyu.items import DouyuItem
    import json
    
    class DouyuavatarSpider(scrapy.Spider):
        """Page through the getVerticalRoom JSON API and emit one item per entry."""

        name = 'douyuavatar'
        allowed_domains = ['capi.douyucdn.cn']
        url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
        offset = 0
        start_urls = [url + str(offset)]

        def parse(self, response):
            """Yield a ``DouyuItem`` per entry under ``data``, then request the next page."""
            # Decode the JSON body into Python objects.
            payload = json.loads(response.text)
            rooms = payload['data']
            for room in rooms:
                yield DouyuItem(
                    nickname=room['nickname'],
                    imagelink=room['vertical_src'],
                )

            # Stop paginating once the offset has reached the limit.
            if self.offset >= 50:
                return
            self.offset += 20
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)


  • 相关阅读:
    openstack-ntp时间同步服务
    如何将icon图标库引入自己的项目中
    微信小程序实现滑动tab切换和点击tab切换并显示相应的数据(附源代码)
    微信小程序分享至朋友圈的方法
    微信小程序--分享功能
    mpvue-新建页面、页面跳转、自适应单位
    微信小程序mpvue-动态改变navigationBarTitleText值
    mpvue中使用flyjs全局拦截
    H5 布局 -- 让容器充满屏幕高度或自适应剩余高度
    使用mpvue开发小程序如何定义全局变量
  • 原文地址:https://www.cnblogs.com/fonyer/p/8871446.html
Copyright © 2011-2022 走看看