10.1. Jianshu full-site crawler
Create the project
scrapy startproject jianshu
cd jianshu
scrapy genspider -t crawl jianshu_spider "jianshu.com"
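These two commands generate the standard Scrapy project skeleton; the files edited in the rest of this section live in the following layout (start.py is added by hand later, next to scrapy.cfg):

jianshu/
    scrapy.cfg
    jianshu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            jianshu_spider.py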
jianshu_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu.items import JianshuItem


class JianshuSpiderSpider(CrawlSpider):
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']
    rules = (
        # article URLs look like /p/<12-character hex id>
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//span[@class='name']/a/text()").get()
        pub_time = response.xpath("//span[@class='publish-time']/text()").get().replace("*", "")
        # extract the article id from the URL
        url = response.url
        url1 = url.split("?")[0]
        article_id = url1.split("/")[-1]
        # article content with its HTML tags kept, not just the plain text
        content = response.xpath("//div[@class='show-content']").get()
        word_count = response.xpath("//span[@class='wordage']/text()").get()
        comment_count = response.xpath("//span[@class='comments-count']/text()").get()
        read_count = response.xpath("//span[@class='views-count']/text()").get()
        like_count = response.xpath("//span[@class='likes-count']/text()").get()
        subjects = ",".join(response.xpath("//div[@class='include-collection']/a/div/text()").getall())
        item = JianshuItem(
            title=title,
            avatar=avatar,
            pub_time=pub_time,
            author=author,
            origin_url=response.url,
            content=content,
            article_id=article_id,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            like_count=like_count,
            read_count=read_count
        )
        yield item
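Before wiring these XPath expressions into parse_detail, it can help to try them interactively in scrapy shell. The article URL below is a placeholder, and fields that Jianshu renders with JavaScript (for example the subjects list) may only show up once the Selenium middleware from middlewares.py is in place:

scrapy shell "https://www.jianshu.com/p/0123456789ab"
>>> response.xpath("//h1[@class='title']/text()").get()
>>> response.xpath("//span[@class='name']/a/text()").get()
>>> response.xpath("//span[@class='wordage']/text()").get()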
items.py
import scrapy


class JianshuItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
    read_count = scrapy.Field()
    like_count = scrapy.Field()
    word_count = scrapy.Field()
    subjects = scrapy.Field()
    comment_count = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# import pymysql
#
#
# class JianshuPipeline(object):
#     def __init__(self):
#         dbparams = {
#             'host': '127.0.0.1',
#             'port': 3306,
#             'user': 'root',
#             'password': '123456',
#             'database': 'jianshu',
#             'charset': 'utf8'
#         }
#         self.conn = pymysql.connect(**dbparams)
#         self.cursor = self.conn.cursor()
#         self._sql = None
#
#     def process_item(self, item, spider):
#         self.cursor.execute(self.sql, (item['title'], item['content'],
#             item['author'], item['avatar'], item['pub_time'], item['article_id'],
#             item['origin_url'], item['like_count'], item['word_count'],
#             item['subjects'], item['comment_count'], item['read_count']))
#         self.conn.commit()
#         return item
#
#     @property
#     def sql(self):
#         if not self._sql:
#             self._sql = """
#                 insert into article(id,title,content,author,avatar,pub_time,
#                 article_id,origin_url,like_count,word_count,subjects,comment_count,read_count)
#                 values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
#             """
#             return self._sql
#         return self._sql


# save to MySQL asynchronously with Twisted
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors


class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        self.dbpool = adbapi.ConnectionPool("pymysql", **dbparams)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id,title,content,author,avatar,pub_time,
                article_id,origin_url,like_count,word_count,subjects,comment_count,read_count)
                values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
            return self._sql
        return self._sql

    def process_item(self, item, spider):
        # runInteraction executes insert_item in a thread pool, so the crawl
        # is not blocked while MySQL runs the insert
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'],
            item['author'], item['avatar'], item['pub_time'], item['article_id'],
            item['origin_url'], item['like_count'], item['word_count'],
            item['subjects'], item['comment_count'], item['read_count']))

    def handle_error(self, error, item, spider):
        # print(error)
        pass
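The INSERT statement assumes a jianshu database containing an article table whose columns line up with the item fields. The original does not show the schema; a one-off helper along these lines can create it (column types and lengths are assumptions):

# helper script (not part of the original project) that creates the table
# JianshuTwistedPipeline inserts into; column types/lengths are guesses
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', database='jianshu', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS article (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(200),
            content LONGTEXT,
            author VARCHAR(64),
            avatar VARCHAR(255),
            pub_time VARCHAR(64),
            article_id VARCHAR(32),
            origin_url VARCHAR(255),
            like_count VARCHAR(32),
            word_count VARCHAR(32),
            subjects TEXT,
            comment_count VARCHAR(32),
            read_count VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()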
middlewares.py
# -*- coding: utf-8 -*-
import time

from selenium import webdriver
from scrapy.http.response.html import HtmlResponse


class SeleniumDownloadMiddleware(object):
    # use a real Chrome browser to render pages that Jianshu builds with JavaScript
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)
        try:
            while True:
                # keep clicking "show more" until the button disappears, which
                # raises NoSuchElementException and drops into the except branch
                showmore = self.driver.find_element_by_class_name('show-more')
                showmore.click()
                time.sleep(0.5)
                if not showmore:
                    break
        except:
            pass
        source = self.driver.page_source
        # returning an HtmlResponse short-circuits the default downloader,
        # so the spider parses the Selenium-rendered page
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
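As written, the middleware opens a visible Chrome window and never shuts it down. A possible refinement (a sketch, not part of the original) is to run Chrome headless and quit the driver via Scrapy's spider_closed signal; process_request stays the same as above:

# optional variant of SeleniumDownloadMiddleware: headless Chrome plus cleanup
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        options = Options()
        options.add_argument('--headless')  # no visible browser window
        self.driver = webdriver.Chrome(options=options)

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # make sure the Chrome process is shut down together with the spider
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()

    # process_request is unchanged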
settings.py
ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 1

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

DOWNLOADER_MIDDLEWARES = {
    'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}

ITEM_PIPELINES = {
    # 'jianshu.pipelines.JianshuPipeline': 300,
    'jianshu.pipelines.JianshuTwistedPipeline': 1,
}
start.py
from scrapy import cmdline

cmdline.execute("scrapy crawl jianshu_spider".split())