zoukankan      html  css  js  c++  java
  • 爬虫第六篇:scrapy框架爬取某书网整站爬虫爬取

    新建项目

    # 新建项目
    $ scrapy startproject jianshu
    # 进入到文件夹 $ cd jianshu
    # 新建spider文件 $ scrapy genspider
    -t crawl jianshu_spider jianshu.com

    items.py文件

    import scrapy
    
    
    class ArticleItem(scrapy.Item):
        """Container for one scraped Jianshu article (filled in parse_detail)."""
        title = scrapy.Field()       # article headline text
        content = scrapy.Field()     # article body HTML fragment
        article_id = scrapy.Field()  # last path segment of the article URL
        origin_url = scrapy.Field()  # full URL the article was scraped from
        author = scrapy.Field()      # author display name
        avatar = scrapy.Field()      # URL of the author's avatar image
        pub_time = scrapy.Field()    # publish-time string as shown on the page

    jianshu_spider.py文件

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu.items import ArticleItem
    
    
    class JianshuSpiderSpider(CrawlSpider):
        """Crawl spider that follows Jianshu article links site-wide and
        extracts one ArticleItem per article detail page."""
        name = 'jianshu_spider'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']

        # Article pages look like /p/<12 lowercase hex-ish chars>; follow them
        # everywhere and hand each one to parse_detail.
        rules = (
            Rule(
                LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
                callback='parse_detail',
                follow=True,
            ),
        )

        def parse_detail(self, response):
            """Pull the article fields out of a detail page and yield the item."""
            # Field name -> xpath on the detail page.
            selectors = {
                'title': "//h1[@class='title']/text()",
                'content': "//div[@class='show-content-free']",
                'avatar': "//a[@class='avatar']/img/@src",
                'author': "//div[@class='info']/span/a/text()",
                'pub_time': "//span[@class='publish-time']/text()",
            }
            fields = {name: response.xpath(xp).get() for name, xp in selectors.items()}
            # Drop any query string, then the last path segment is the id.
            page_url = response.url
            fields['article_id'] = page_url.split("?")[0].split("/")[-1]
            fields['origin_url'] = page_url
            yield ArticleItem(**fields)

    同步的MySQL插入数据

    import pymysql
    
    
    class JianshuPipeline(object):
        """Synchronous MySQL pipeline: inserts one row per item and commits
        immediately, blocking the crawl for the duration of each write."""

        def __init__(self):
            # NOTE(review): credentials are hard-coded; consider moving them
            # to Scrapy settings.
            dbparams = {
                'host': '127.0.0.1',
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'port': 3306,
                'charset': 'utf8'
            }
            self.conn = pymysql.connect(**dbparams)
            self.cursor = self.conn.cursor()
            self._sql = None  # lazily-built INSERT statement (see `sql` property)

        def process_item(self, item, spider):
            """Insert the item into the `article` table and commit.

            Returns the item unchanged so later pipeline stages still see it.
            """
            self.cursor.execute(self.sql, (item['title'], item['content'], item['author'],
                                           item['avatar'], item['pub_time'],
                                           item['origin_url'], item['article_id']))
            self.conn.commit()
            return item

        def close_spider(self, spider):
            """Release the cursor and connection when the spider finishes.

            The original version never closed them, leaking the MySQL
            connection for the lifetime of the process.
            """
            self.cursor.close()
            self.conn.close()

        @property
        def sql(self):
            """Build the INSERT statement once and cache it on the instance."""
            if not self._sql:
                self._sql = """
                insert into article(title,content, author, avatar, pub_time, origin_url, article_id) values (%s, %s, %s, %s, %s, %s,%s)
                """
            return self._sql

    异步的MySQL插入数据

    from twisted.enterprise import adbapi
    from pymysql import cursors
    class JianshuTwistedPipeline(object):
        """Asynchronous MySQL pipeline: schedules inserts on a twisted adbapi
        connection pool so database writes do not block the crawl."""

        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'user': 'root',
                'password': '123456',
                'database': 'jianshu',
                'port': 3306,
                'charset': 'utf8',
                'cursorclass': cursors.DictCursor
            }
            self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
            self._sql = None  # lazily-built INSERT statement (see `sql` property)

        @property
        def sql(self):
            """Build the INSERT statement once and cache it on the instance."""
            if not self._sql:
                self._sql = """
                    insert into article(title,content, author, avatar, pub_time, origin_url, article_id) values (%s, %s, %s, %s, %s, %s,%s)
                    """
            return self._sql

        def process_item(self, item, spider):
            """Queue the insert on the pool and pass the item along.

            BUG FIX: the original returned None, so any pipeline stage after
            this one received nothing; Scrapy's pipeline contract requires
            returning the item (or a Deferred that fires with it).
            """
            defer = self.dbpool.runInteraction(self.insert_item, item)
            defer.addErrback(self.handle_error, item, spider)
            return item

        def insert_item(self, cursor, item):
            # Runs in a pool thread; adbapi commits the interaction on success.
            cursor.execute(self.sql, (item['title'], item['content'], item['author'],
                                      item['avatar'], item['pub_time'],
                                      item['origin_url'], item['article_id']))

        def handle_error(self, error, item, spider):
            """Log a failed insert instead of letting it vanish silently."""
            print('=' * 10 + 'error' + '=' * 10)
            print(error)
            print('=' * 10 + 'error' + '=' * 10)

     

  • 相关阅读:
    (15)疯狂的程序员----《绝影》
    (14)嵌入式软件开发工程师技能要求总结
    (13)碎片化阅读只会让你变得越来越愚蠢
    (12)QT中搭建opencv开发环境
    (11)git服务器的安装和配置
    (10)python学习笔记一
    (3.3)狄泰软件学院C++课程学习剖析四
    (9)Linux下gdb调试学习
    (8)Linux(客户端)和Windows(服务端)下socket通信实例
    springMVC伪静态
  • 原文地址:https://www.cnblogs.com/leijing0607/p/8075324.html
Copyright © 2011-2022 走看看