zoukankan      html  css  js  c++  java
  • 爬虫scrapy模块

    首先下载scrapy模块

    这里有惊喜

    https://www.cnblogs.com/bobo-zhang/p/10068997.html

    创建一个scrapy文件

    首先在终端找到一个文件夹

    输入

    scrapy startproject jy (jy 为项目名)

    修改setting文件配置

    # Crawl responsibly by identifying yourself (and your website) on the user-agent.
    # A desktop Chrome User-Agent string is sent with every request.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'

    # Obey robots.txt rules: disabled here, so robots.txt is not consulted.
    ROBOTSTXT_OBEY = False

    cd 到 spiders文件夹,在终端创建一个爬虫文件

    scrapy genspider myjy(文件名) www.xxx.com

    在文件里执行我们的第一个代码吧

    #实现解析+持久化存储

    
    
    # -*- coding: utf-8 -*-
    import scrapy


    class FirstSpider(scrapy.Spider):
        """Scrape author/content pairs from the qiushibaike text listing page.

        `parse` returns a list of dicts so the result can be persisted with the
        terminal-command approach (feed export of the parse return value).
        """

        # Name of the spider.
        name = 'first'
        # Allowed domains (left disabled).
        #allowed_domains = ['www.xxx.com']
        # List of start URLs.
        start_urls = ['https://www.qiushibaike.com/text/']

        # Basic parsing only (prints the data instead of persisting it):
        # def parse(self, response):
        #     div_list = response.xpath('//div[@id="content-left"]/div')
        #     for div in div_list:
        #         #author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
        #         # extract_first() is fine when the xpath result list is known to hold
        #         # a single element; otherwise extract() must be used.
        #         author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
        #         content = div.xpath('./a[1]/div/span//text()').extract()
        #         content = ''.join(content)
        #         print(author,content)

        # Parsing + persistent storage, two approaches:
        # 1. Terminal-command based: only the return value of parse can be
        #    stored to a local text file.
        # 2. Pipeline based.

        # 1. Terminal-command-based persistent storage
        def parse(self, response):
            """Return one {'author', 'content'} dict per post found in the response.

            Args:
                response: the downloaded page; only its `xpath` method is used.

            Returns:
                A list of dicts with keys 'author' and 'content'.
            """
            div_list = response.xpath('//div[@id="content-left"]/div')
            all_data = []
            for div in div_list:
                #author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
                # extract_first() is fine when the xpath result list is known to hold
                # a single element; otherwise extract() must be used.
                author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
                # The post body is split over several text nodes; join them.
                content = div.xpath('./a[1]/div/span//text()').extract()
                content = ''.join(content)

                dic = {
                    'author': author,
                    'content': content,
                }

                all_data.append(dic)

            return all_data

     

    最后运行文件

    scrapy crawl first -o qiubai.csv (crawl 后面跟的是爬虫类的 name 属性,这里是 first;-o 用来指定保存 parse 返回值的文件)

    #解析+管道持久化存储

    首先在爬虫文件的parse方法里写入代码

    # -*- coding: utf-8 -*-
    import scrapy
    
    from bossPro.items import BossproItem
    class BossSpider(scrapy.Spider):
        """Scrape job name, salary and company from zhipin.com job listing pages.

        The start URL is parsed first; `parse` then requests follow-up pages built
        from `url`, incrementing `page` each time until it reaches `last_page`.
        """

        name = 'boss'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

        # Template for follow-up listing pages; %d receives the page number.
        url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
        # Number of the listing page requested most recently.
        page = 1
        # Highest page number parse() will request (pages 2..last_page are followed).
        last_page = 4

        # Parsing + pipeline-based persistent storage
        def parse(self, response):
            """Yield one BossproItem per listing, then a request for the next page.

            Args:
                response: the downloaded listing page; only its `xpath` method is used.

            Yields:
                BossproItem objects (handed to the item pipelines) and, while
                `page` is below `last_page`, one scrapy.Request for the next page
                with this method as its callback.
            """
            for li in response.xpath('//div[@class="job-list"]/ul/li'):
                job_name = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first()
                salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
                company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()

                # Wrap the parsed values in an item and submit it to the pipelines.
                yield BossproItem(job_name=job_name, salary=salary, company=company)

            if self.page < self.last_page:
                self.page += 1
                new_url = self.url % self.page
                self.logger.debug('Requesting page %d: %s', self.page, new_url)
                # Send the follow-up request manually; it is parsed by this method too.
                yield scrapy.Request(url=new_url, callback=self.parse)

    配置items.py文件,用来作为数据结构

    import scrapy
    
    
    class BossproItem(scrapy.Item):
        """Data structure for one job posting; the three fields declared below
        are the keys that can be set on the item."""

        # Title of the job posting.
        job_name = scrapy.Field()
        # Salary text shown for the posting.
        salary = scrapy.Field()
        # Name of the hiring company.
        company = scrapy.Field()

    在pipelines.py里写入文件

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import json

    import pymysql
    from redis import Redis
    class BossproPipeline(object):
        """Write every received item to ./boss.txt, one item per line."""

        # Handle of the output file; set in open_spider, released in close_spider.
        fp = None

        def open_spider(self, spider):
            """Open ./boss.txt for writing (UTF-8), truncating any previous content."""
            print('开始爬虫......')
            self.fp = open('./boss.txt', 'w', encoding='utf-8')

        def close_spider(self, spider):
            """Close the output file if it was opened."""
            print('结束爬虫......')
            if self.fp is not None:
                self.fp.close()

        def process_item(self, item, spider):
            """Append a "job_name:salary:company" line for `item` to the file.

            This method is called once for every item the spider submits.

            Args:
                item: the submitted item; keys 'job_name', 'salary' and
                    'company' are read.
                spider: unused.

            Returns:
                The same item, so the next pipeline class receives it.
            """
            values = (item['job_name'], item['salary'], item['company'])
            # A value may be None (e.g. extract_first() found no match); write it
            # as an empty field instead of failing on str + None.
            self.fp.write(':'.join(value or '' for value in values) + '\n')
            return item
    
    class mysqlPileLine(object):
        """Insert every received item as a row of the MySQL table `boss`."""

        # Database connection; created in open_spider.
        conn = None
        # Cursor reused for all inserts; created on the first processed item.
        cursor = None

        def open_spider(self, spider):
            """Connect to the local MySQL server (database `scrapy`)."""
            self.conn = pymysql.Connect(host='127.0.0.1',port=3306,user='root',password='',db='scrapy',charset="utf8")
            print(self.conn)

        def process_item(self, item, spider):
            """Insert `item` into `boss`; roll back and print the error on failure.

            Args:
                item: the submitted item; keys 'job_name', 'salary' and
                    'company' are read.
                spider: unused.

            Returns:
                The same item, so the next pipeline class receives it.
            """
            if self.cursor is None:
                self.cursor = self.conn.cursor()
            try:
                # Values are passed as query parameters so the driver escapes
                # them; scraped text containing quotes no longer breaks (or
                # injects into) the statement.
                self.cursor.execute(
                    'insert into boss values (%s,%s,%s)',
                    (item['job_name'], item['salary'], item['company']),
                )
                self.conn.commit()
            except Exception as e:
                # Best effort: a failed insert must not stop the crawl.
                print(e)
                self.conn.rollback()
            return item

        def close_spider(self, spider):
            """Close the cursor (if one was created) and then the connection."""
            if self.cursor is not None:
                self.cursor.close()
                self.cursor = None
            if self.conn is not None:
                self.conn.close()
                self.conn = None
    
    class redisPileLine(object):
        """Push every received item onto the Redis list `boss` as a JSON string."""

        # Redis client; created in open_spider.
        conn = None

        def open_spider(self, spider):
            """Create the client for the local Redis server."""
            self.conn = Redis(host='127.0.0.1', port=6379)
            print(self.conn)

        def process_item(self, item, spider):
            """Serialize `item` and LPUSH it onto the list `boss`.

            Args:
                item: the submitted item; keys 'job_name', 'salary' and
                    'company' are read.
                spider: unused.

            Returns:
                The same item, so the next pipeline class receives it.
            """
            dic = {
                'name': item['job_name'],
                'salary': item['salary'],
                'company': item['company'],
            }
            # redis-py (3.0+) rejects dict values, so store the record as JSON
            # text; ensure_ascii=False keeps non-ASCII text readable.
            self.conn.lpush('boss', json.dumps(dic, ensure_ascii=False))
            return item

    别忘了在setting里面配置

    # Each key is the import path "<project package>.pipelines.<class name>" and
    # must match a class defined in pipelines.py; pipelines with lower numbers
    # run first. Uncomment an entry to enable that pipeline.
    ITEM_PIPELINES = {
       # 'bossPro.pipelines.BossproPipeline': 300,
       'bossPro.pipelines.redisPileLine': 301,
       # 'bossPro.pipelines.mysqlPileLine': 302,
    }
  • 相关阅读:
    mysql GROUP_CONCAT 查询某个字段(查询结果默认逗号拼接)
    mysql中find_in_set的使用
    Libev源码分析07:Linux下的eventfd简介
    Libev源码分析06:异步信号同步化--sigwait、sigwaitinfo、sigtimedwait和signalfd
    Nova中的Hook机制
    Python深入:stevedore简介
    Libev源码分析05:Libev中的绝对时间定时器
    Python深入:setuptools简介
    Libev源码分析04:Libev中的相对时间定时器
    Libev源码分析02:Libev中的IO监视器
  • 原文地址:https://www.cnblogs.com/zhangqing979797/p/10458393.html
Copyright © 2011-2022 走看看