zoukankan html css js c++ java

爬虫框架——scrapy(一)

一：scrapy框架简介和基础使用

1:使用准备

a)概念：scrapy是用python开发的、用来爬取网站数据的框架，具有高速、方便、强大等特点。所谓框架，就是集成了相应功能且具有很强通用性的项目模板。

b)安装：linux,mac:pip install scrapy

2:使用流程

--工程创建：

　　　　　　打开终端输入：scrapy startproject pro_name

工程目录文件作用及说明：

project_name/

　　　　　　　　　scrapy.cfg: #项目主配置信息

project_name/

__init__.py

items.py #编写数据存储模版类，用于结构化数据

pipelines.py #管道文件，用于数据存储

settings.py #配置文件，如递归层数，robots协议，中间件配置等等

spiders/ #爬虫主目录如创建爬虫文件，编写爬虫解析规则

--爬虫文件创建

进入工程目录输入 scrapy genspider file_name www.xxxxx.com(起始url，爬虫文件内部可修改)

# -*- coding: utf-8 -*-
import scrapy


class App01Spider(scrapy.Spider):
    """Minimal spider skeleton showing the hooks Scrapy calls during a crawl."""

    # Spider name; `scrapy crawl app01` uses it to select which spider to run.
    name = 'app01'
    # Optional allow-list of domains (may be commented out). It has to cover
    # the host used in start_urls (fanyi.baidu.com); the previous value
    # 'www.baidu.com' did not, so follow-up requests would count as off-site.
    allowed_domains = ['baidu.com']
    # Initial URL(s) of the crawl.
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        """Yield the initial requests; called once when the spider starts.

        Scrapy iterates over the value returned here, so the method must
        produce Request objects instead of returning None.
        """
        for url in self.start_urls:
            # dont_filter=True mirrors what the base Spider does for start URLs.
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Handle one downloaded response; runs once per response received."""
        print(response.text)

    def closed(self, reason=None):
        """Run once after the spider has finished; release resources here.

        Args:
            reason: Close reason string supplied by Scrapy (e.g. 'finished').
                Optional so that existing no-argument calls keep working.
        """

　　 --settings文件配置：

伪装请求头

# User-Agent header sent with every request so the crawler identifies itself
# as a desktop Chrome browser.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/72.0.3626.119 Safari/537.36'
)

　　 robots协议

# Obey robots.txt rules. robots.txt is a site's advisory statement of what may
# be crawled; it is not technically enforced. False here means it is not honored.
ROBOTSTXT_OBEY = False

　　 --编写爬虫文件

--解析数据建议使用xpath，scrapy已经集成，可以直接使用

class QiushiSpider(scrapy.Spider):
    """Scrape author/content pairs from the qiushibaike text listing.

    The module must have ``QiushiproItem`` in scope
    (``from qiushiPro.items import QiushiproItem``).
    """

    name = 'qiushi'
    # Left disabled by the original author (their note: images are not HTML
    # pages, so the restriction has to be commented out).
    # NOTE(review): the value also contains a URL path; allowed_domains
    # expects bare domain names - confirm before re-enabling.
    #allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']
    page_num = 1

    def parse(self, response):
        """Yield one QiushiproItem per post found on the page.

        Posts whose author node is missing are skipped individually, so one
        irregular post no longer aborts the rest of the page, and unexpected
        errors are no longer swallowed by a bare ``except``.
        """
        # xpath() returns a list of selectors, one per post container.
        posts = response.xpath('//div[@id="content-left"]/div')
        self.logger.debug('found %d post containers', len(posts))

        for post in posts:
            # extract_first() returns None when nothing matches, unlike
            # extract()[0] which raises IndexError.
            author = post.xpath('./div/a[2]/h2/text()').extract_first()
            content = post.xpath('.//div[@class="content"]/span/text()').extract_first()
            if author is None:
                self.logger.debug('skipping post without an author node')
                continue

            item = QiushiproItem()
            item['authon'] = author
            item['content'] = content
            yield item

　　 --运行程序

scrapy crawl file_name # 输出日志

scrapy crawl file_name --nolog #不输出日志

3:持久化存储

--基于命令进行持久化存储

scrapy crawl qiushi -o qiushi.csv --nolog

--基于管道进行存储

a:编写items文件

import scrapy


class QiushiproItem(scrapy.Item):
    """Structured container for one scraped post.

    Each ``scrapy.Field()`` declares a key that may be set on the item.
    """

    # Post author; the key keeps the 'authon' spelling that the spider and
    # pipeline snippets use.
    authon = scrapy.Field()
    # Post body text.
    content = scrapy.Field()

　 b:编写爬虫文件

--导入items

--yield提交item对象给管道

# -*- coding: utf-8 -*-
import scrapy
from qiushiPro.items import QiushiproItem

class QiushiSpider(scrapy.Spider):
    """Scrape author/content pairs from the qiushibaike text listing and
    hand them to the item pipelines as QiushiproItem objects."""

    name = 'qiushi'
    # Left disabled by the original author (their note: images are not HTML
    # pages, so the restriction has to be commented out).
    # NOTE(review): the value also contains a URL path; allowed_domains
    # expects bare domain names - confirm before re-enabling.
    #allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']
    page_num = 1

    def parse(self, response):
        """Yield one QiushiproItem per post found on the page.

        Posts whose author node is missing are skipped individually, so one
        irregular post no longer aborts the rest of the page, and unexpected
        errors are no longer swallowed by a bare ``except``.
        """
        # xpath() returns a list of selectors, one per post container.
        posts = response.xpath('//div[@id="content-left"]/div')
        self.logger.debug('found %d post containers', len(posts))

        for post in posts:
            # extract_first() returns None when nothing matches, unlike
            # extract()[0] which raises IndexError.
            author = post.xpath('./div/a[2]/h2/text()').extract_first()
            content = post.xpath('.//div[@class="content"]/span/text()').extract_first()
            if author is None:
                self.logger.debug('skipping post without an author node')
                continue

            item = QiushiproItem()
            item['authon'] = author
            item['content'] = content
            # Each yielded item is passed on to the enabled pipelines.
            yield item

　　c:pipelines文件中编写持久化保存逻辑

    --process_item(self,item,spider)   yield提交一次，该方法执行一次
    --open_spider(self,spider)   开始爬虫时程序的开始，只执行一次，常常用作准备工作
    --close_spider(self,spider)  结束程序时执行的方法，只执行一次，常常用来关闭资源。

d:配置settings文件开启管道

# Enable the project's pipeline; the integer is its order value.
ITEM_PIPELINES = {'qiushiPro.pipelines.QiushiproPipeline': 300}

　--基于redis持久化存储

     1:  官网安装并解压
     2：进入目录  make编译
     3: 进入src 文件进行启动
        --开启客户端  ./redis-cli
        --开启服务器  ./redis-server ../redis.conf   结合配置文件进行配置
        --三个文件
            redis.conf  配置文件
            redis-server  编译后启动服务器
            redis-cli  启动客户端
     4:重复以上步骤(pipelines进行部分修改)

class redisPipeline(object):
    """Item pipeline that pushes every scraped item onto the Redis list 'data'.

    The enclosing module must import ``json`` and ``redis``.
    """

    # Redis client; None until open_spider() creates it.
    coon = None

    def open_spider(self, spider):
        """Create the Redis client once, when the spider starts."""
        self.coon = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Serialize the item's author and content to JSON and LPUSH it.

        Storage stays best-effort: a Redis or serialization failure is logged
        through the spider's logger instead of being silently discarded, and
        the item is returned either way so later pipelines still receive it.
        """
        # 'record' rather than 'dict', which would shadow the builtin.
        record = {
            'authon': item['authon'],
            'content': item['content'],
        }
        try:
            self.coon.lpush('data', json.dumps(record))
        except (redis.RedisError, TypeError, ValueError) as exc:
            spider.logger.error('could not store item in redis: %s', exc)
        return item

    def close_spider(self, spider):
        """Release the Redis connections once, when the spider finishes."""
        if self.coon is not None:
            self.coon.connection_pool.disconnect()
            self.coon = None

--基于mysql持久化存储

--与redis存储的流程相同，只需在pipelines.py文件中把数据库的连接和写入逻辑换成mysql即可。

--多种数据库同时存储

--编写多个管道文件

--settings开启管道文件

# Pipeline order: Scrapy runs pipelines from the LOWEST value to the highest
# (values are customarily chosen in the 0-1000 range), so redisPipeline (200)
# handles each item before QiushiproPipeline (300).
ITEM_PIPELINES = {
    'qiushiPro.pipelines.QiushiproPipeline': 300,
    'qiushiPro.pipelines.redisPipeline': 200,
}

4:多个url数据爬取解决方法一

1:设置多个url,手动发送多个url请求

--分析url统一模板，并通过yield scrapy.Request(url,callback=self.parse)方法，递归调用parse解析方法

       # Pagination fragment meant for the end of parse(): once the current
       # page has been handled, schedule the next listing page.
       # NOTE(review): the check runs before the increment, so the last URL
       # requested is page 14 - confirm this matches the real page count.
       if self.page_num<=13:
           self.page_num+=1
           url= 'https://www.qiushibaike.com/text/page/%d'%self.page_num
           # callback=self.parse sends the next page's response back into this same parse method
           yield  scrapy.Request(url,callback=self.parse)

　--使用CrawlSpider(下文介绍)

查看全文

相关阅读:
解耦和耦合
 python os.remove()方法
 python中split()、os.path.split()函数用法
 P7116-[NOIP2020]微信步数【数学】
2021牛客OI赛前集训营-方格计数【计数,dp】
2021牛客OI赛前集训营-树数树【树上启发式合并,堆】
Ybtoj-排列计数【矩阵乘法,分块幂】
P7888-「MCOI-06」Distinct Subsequences【dp】
号爸十一集训 Logs
数据结构专项题解

原文地址：https://www.cnblogs.com/yingjp/p/10551383.html