Scrapy Notes

1 Installation:
1. pip3 install wheel
2. pip3 install lxml
3. pip3 install pyopenssl
4. pip3 install -i https://mirrors.aliyun.com/pypi/simple/ pypiwin32
5. Download the Twisted wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted, then install it:
   pip3 install <path-to-file>\Twisted-19.2.0-cp36-cp36m-win_amd64.whl
6. pip3 install scrapy
7. Run scrapy to verify the installation succeeded; it should print something like:
   Scrapy 1.6.0 - no active project ...
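Step 7 verifies the install from the shell; the same check can be done from Python, since scrapy exposes its version string:

    import scrapy
    print(scrapy.__version__)   # e.g. '1.6.0'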
2 Scrapy commands
Global (usable from any directory):
  bench         Run quick benchmark test
  fetch         Fetch a URL using the Scrapy downloader
  # creates the spider file in the current directory
  genspider     Generate new spider using pre-defined templates
  # runs a spider file directly from the current directory:
  #   runspider <absolute path to the spider file>
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  # creates a scrapy project
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy
Local (usable inside a scrapy project):
  bench         Run quick benchmark test
  # checks spider contracts / syntax
  check         Check spider contracts
  # runs a spider by the name defined on the spider class
  crawl         Run a spider
  # !!! this is the one used most !!!
  scrapy crawl <name>
  edit          Edit spider
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  # lists all spiders in the project
  list          List available spiders
  parse         Parse URL (using its spider) and print the results
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy
3 Creating a Scrapy project
Django (for comparison):
  # create an app
  python3 manage.py startapp app01
  python3 manage.py startapp bbs
Scrapy:
  # create the project
  scrapy startproject spider_project
  cd spider_project
  # create a spider
  scrapy genspider chouti chouti.com
  # run the spider
  scrapy crawl chouti
  # run without log output
  scrapy crawl --nolog chouti
settings.py:
  # do not obey the robots.txt (robots exclusion) protocol
  ROBOTSTXT_OBEY = False
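A few other settings are commonly adjusted alongside it; the values below are illustrative, not from the original notes:

    # settings.py -- example values
    ROBOTSTXT_OBEY = False   # do not obey robots.txt
    LOG_LEVEL = 'ERROR'      # quieter logs without needing --nolog
    DOWNLOAD_DELAY = 1       # wait 1 second between requests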
main.py (run the spider from a script/IDE instead of the terminal):
from scrapy.cmdline import execute
# list form: execute(['scrapy', 'crawl', 'baidu'])
# split(' ') turns the command string into the argv list execute() expects
execute("scrapy crawl --nolog chouti".split(' '))
spiders/chouti.py:
import scrapy

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        pass  # parse the news list page

    def parse_user_index(self, response):
        pass  # parse a user's home page
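A sketch of how parse might fill the items defined below and chain into parse_user_index. The CSS selectors ('div.news-item', 'a.title', 'a.user') are placeholders, since chouti's real markup is not part of these notes; both methods would sit inside ChoutiSpider:

    from spider_project.items import SpiderNewListItem

    def parse(self, response):
        # hypothetical selectors for the news list
        for div in response.css('div.news-item'):
            item = SpiderNewListItem()
            item['new_url'] = div.css('a.title::attr(href)').get()
            item['new_text'] = div.css('a.title::text').get()
            item['user_link'] = div.css('a.user::attr(href)').get()
            yield item
            # follow the author's home page with the second callback
            yield response.follow(item['user_link'],
                                  callback=self.parse_user_index)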
items.py:
import scrapy

# item class for the news list
class SpiderNewListItem(scrapy.Item):
    # define the fields for your item here like:
    # news link
    new_url = scrapy.Field()
    # news text
    new_text = scrapy.Field()
    # number of upvotes
    nice_num = scrapy.Field()
    # news ID
    new_id = scrapy.Field()
    # number of comments
    commit_num = scrapy.Field()
    # news detail
    new_content = scrapy.Field()
    # home page of the user who posted the news
    user_link = scrapy.Field()

# item class for a user's news list
class SpiderUserListItem(scrapy.Item):
    # define the fields for your item here like:
    # news link
    new_url = scrapy.Field()
    # news text
    new_text = scrapy.Field()
    # number of upvotes
    nice_num = scrapy.Field()
    # news ID
    new_id = scrapy.Field()
    # number of comments
    commit_num = scrapy.Field()
    # news detail
    new_content = scrapy.Field()
    # user name
    user_name = scrapy.Field()
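scrapy.Item instances behave like dicts restricted to the declared fields; a quick usage example:

    from spider_project.items import SpiderNewListItem

    item = SpiderNewListItem(new_text='hello', nice_num=10)
    item['new_id'] = '123'        # dict-style assignment
    print(item['new_text'])       # dict-style access
    # item['other'] = 1           # KeyError: field not declared on the Item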
pipelines.py:
class SpiderNewListPipeline(object):
    def __init__(self, ip, port, mongo_db):
        # MongoDB connection info, injected from the settings shown below
        self.ip = ip
        self.port = port
        self.mongo_db = mongo_db
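A sketch of how this pipeline could be completed, assuming MongoDB storage via pymongo and the IP/PORT/DB settings shown below; the 'new_list' collection name is made up for illustration:

    import pymongo

    class SpiderNewListPipeline(object):
        def __init__(self, ip, port, mongo_db):
            self.ip = ip
            self.port = port
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy calls this hook and hands over the project settings
            return cls(
                ip=crawler.settings.get('IP'),
                port=crawler.settings.get('PORT'),
                mongo_db=crawler.settings.get('DB'),
            )

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.ip, self.port)
            self.db = self.client[self.mongo_db]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            self.db['new_list'].insert_one(dict(item))
            return item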
settings.py:
# pipelines run only if they are registered here
ITEM_PIPELINES = {
    'spider_project.pipelines.SpiderNewListPipeline': 300,
    'spider_project.pipelines.SpiderUserListPipeline': 301,
}
# MongoDB connection settings
IP = 'localhost'
PORT = 27017
DB = 'chouti'
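With two pipelines registered, each process_item typically checks the item type and passes anything else through to the next pipeline; a sketch (the storage step is elided):

    from spider_project.items import SpiderUserListItem

    class SpiderUserListPipeline(object):
        def process_item(self, item, spider):
            if not isinstance(item, SpiderUserListItem):
                return item  # not ours; hand it to the next pipeline
            # ... store the user-list item (e.g. in MongoDB) ...
            return item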