Crawling 《盗墓笔记》 works on the same principle as crawling 《宦海沉浮》, but the two spiders follow links in different ways: the 《盗墓笔记》 spider follows the "next page" link until there is none left, while the 《宦海沉浮》 spider lists every URL up front in start_urls.
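For reference, here is a minimal sketch of the start_urls approach; the spider name, domain, URL pattern, and selectors below are assumptions for illustration, not the actual 《宦海沉浮》 code:

# -*- coding: utf-8 -*-
import scrapy

from Novel.items import NovelItem


class HuanHaiSpider(scrapy.Spider):
    # Hypothetical spider: enumerate every chapter URL up front
    # instead of following "next page" links.
    name = 'HuanHai'
    allowed_domains = ['www.example.com']          # placeholder domain
    start_urls = ['http://www.example.com/chapter/%d.html' % i
                  for i in range(1, 101)]          # assumed URL pattern

    def parse(self, response):
        myitem = NovelItem()
        myitem['url'] = response.url
        myitem['title'] = response.xpath('//h1/text()').extract_first('')
        myitem['content'] = ','.join(
            response.xpath('//div[@class="content"]/p/text()').extract())
        yield myitem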
Step 1: Create the project
KeysdeMacBook:Desktop keys$ scrapy startproject Novel
New Scrapy project 'Novel', using template directory '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/scrapy/templates/project', created in:
    /Users/keys/Desktop/Novel

You can start your first spider with:
    cd Novel
    scrapy genspider example example.com
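startproject generates the standard Scrapy layout; per the default template of Scrapy versions of this era it looks like this:

Novel/
├── scrapy.cfg
└── Novel/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py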
Step 2: Create the spider
KeysdeMacBook:Desktop keys$ cd Novel/
KeysdeMacBook:Novel keys$ scrapy genspider BiJi www.mossiella.com/html/255.html
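genspider writes a skeleton to Novel/spiders/BiJi.py; with a Scrapy version of this era it looks roughly like the following (exact template output varies by version), and allowed_domains is then trimmed to the bare domain in step 4:

# -*- coding: utf-8 -*-
import scrapy


class BijiSpider(scrapy.Spider):
    name = 'BiJi'
    allowed_domains = ['www.mossiella.com/html/255.html']
    start_urls = ['http://www.mossiella.com/html/255.html/']

    def parse(self, response):
        pass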
Step 3: Configure items.py
# -*- coding: utf-8 -*-
import scrapy


class NovelItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
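A NovelItem behaves like a dict, but only the declared fields may be set. A quick interpreter illustration (the field values are placeholders):

>>> from Novel.items import NovelItem
>>> item = NovelItem()
>>> item['title'] = 'some title'   # declared field: OK
>>> item['author'] = 'someone'     # undeclared field: raises KeyError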
Step 4: Write the spider
# -*- coding: utf-8 -*-
import scrapy

from Novel.items import NovelItem


class BijiSpider(scrapy.Spider):
    name = 'BiJi'
    allowed_domains = ['www.mossiella.com']
    start_urls = ['http://www.mossiella.com/html/255.html/']

    def parse(self, response):
        url = response.url
        title = response.xpath('//h1/text()').extract_first('')
        content = ','.join(
            response.xpath('//div[@class="zwcent"]/p/text()').extract())

        myitem = NovelItem()
        myitem['url'] = url
        myitem['title'] = title
        myitem['content'] = content
        yield myitem

        # Follow the "next page" link until there is none left
        next_url = response.xpath('//a[@rel="next"]')
        if next_url:
            url = next_url.css('::attr(href)').extract_first('')
            yield scrapy.Request(url=url, callback=self.parse)
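A handy way to sanity-check the XPath expressions before running the full crawl is Scrapy's interactive shell (output elided here):

KeysdeMacBook:Novel keys$ scrapy shell 'http://www.mossiella.com/html/255.html/'
>>> response.xpath('//h1/text()').extract_first('')
>>> response.xpath('//a[@rel="next"]/@href').extract_first('')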
Step 5: Configure pipelines.py
import pymysql


class MysqlPipeline(object):
    # Write to MySQL synchronously (blocking writes)
    def __init__(self):
        self.conn = pymysql.connect(
            '127.0.0.1', 'root', 'rootkeys', 'Article',
            charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into BiJi(url, title, content)
            VALUES (%s, %s, %s)
        """
        # Pass the values through the VALUES placeholders
        self.cursor.execute(
            insert_sql, (item["url"], item["title"], item["content"]))
        self.conn.commit()
        return item  # hand the item on to any later pipelines
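The pipeline assumes a BiJi table already exists in the Article database. A one-off creation script might look like this; the column types and sizes are assumptions, adjust them to the data:

import pymysql

# Same connection parameters as the pipeline above
conn = pymysql.connect('127.0.0.1', 'root', 'rootkeys', 'Article',
                       charset="utf8", use_unicode=True)
try:
    with conn.cursor() as cursor:
        # Surrogate id key; LONGTEXT because chapters can be large
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS BiJi (
                id INT AUTO_INCREMENT PRIMARY KEY,
                url VARCHAR(255) NOT NULL,
                title VARCHAR(255) NOT NULL,
                content LONGTEXT
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()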
Step 6: Configure settings.py
BOT_NAME = 'Novel'

SPIDER_MODULES = ['Novel.spiders']
NEWSPIDER_MODULE = 'Novel.spiders'

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'

ROBOTSTXT_OBEY = False

# Register the pipeline from step 5, otherwise items are never written to MySQL
ITEM_PIPELINES = {
    'Novel.pipelines.MysqlPipeline': 300,
}
Step 7: Run the spider
import os
import sys

from scrapy.cmdline import execute

# Put the project root on sys.path so Scrapy can locate the settings
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

run_spider = 'BiJi'

if __name__ == '__main__':
    print('Running Spider of ' + run_spider)
    execute(['scrapy', 'crawl', run_spider])
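Assuming this script is saved as main.py in the project root next to scrapy.cfg (the file name is a convention, not a requirement), python main.py does the same thing as running scrapy crawl BiJi from that directory.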