zoukankan html css js c++ java

Scrapy 框架

一、Scrapy 安装

#Windows安装
pip3 install wheel 
pip3 install lxml
pip3 install pyopenssl
pip3 install pywin32 #下载并安装pywin32：https://sourceforge.net/projects/pywin32/files/pywin32/
pip3 install scrapy


#Linux安装
pip3 install scrapy

二、Scrapy命令介绍

    Global commands:
        startproject #创建项目
        genspider    #创建爬虫程序
        settings     #如果是在项目目录下，则得到的是该项目的配置
        runspider    #运行一个独立的python文件，不必创建项目
        shell        #scrapy shell url地址  进入交互式环境调试，如验证选择器规则是否正确
        fetch        #独立于项目，单纯地爬取一个页面，可以拿到请求头
        view         #下载完毕后直接弹出浏览器，以此可以分辨出哪些数据是ajax请求
        version      #scrapy version 查看scrapy的版本，scrapy version -v查看scrapy依赖库的版本
    Project-only commands:
        crawl        #运行爬虫，必须创建项目才行，确保配置文件中ROBOTSTXT_OBEY = False
        check        #检测项目中有无语法错误
        list         #列出项目中所包含的爬虫名
        edit         #编辑器
        parse        #scrapy parse url地址 --callback 回调函数  #以此可以验证我们的回调函数是否正确
        bench        #scrapy bench压力测试

三、创建项目

scrapy startproject mytest #创建项目mytest
cd mytest  #进入项目目录
scrapy genspider   douban.com www.douban.com   #创建一个豆瓣的爬虫程序
C:\Users\80426\mytest\mytest> scrapy runspider spiders/douban_com.py #执行爬虫程序
scrapy settings --get BOT_NAME #获取配置文件信息
scrapy fetch --nolog http://www.xiaohua.com   #获取响应内容（类似curl命令）
scrapy fetch --nolog --headers http://www.xiaohua.com #获取响应头（不输出响应体）
scrapy list  #所有的爬虫名
scrapy check  #检测爬虫有没有错误
scrapy shell https://www.baidu.com  #直接对目标站点发请求
scrapy bench #压力测试（速度较慢）

查看全文

相关阅读:
linux 常用命令
 ubuntu 安装在硬盘与配置
 linux管道符、重定向与环境变量
 linux用户身份与文件权限
 centos开启ftp服务
 js实现常见排序算法
 算法分析
 Vim
CSS的3种使用方法
 cookie 在登录时的存储，获取，清除

原文地址：https://www.cnblogs.com/zhangb8042/p/10046394.html