zoukankan      html  css  js  c++  java
  • python之scrapy篇(一)

    一、首先创建工程(cmd中进行)

    scrapy startproject xxx

    二、编写Item文件

    添加需要的字段

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class DoubanItem(scrapy.Item):
        """Fields collected for one movie on the Douban Top-250 list."""
        # Movie title
        title = scrapy.Field()
        # Movie information line (director / cast / year text)
        info = scrapy.Field()
        # Rating score
        score = scrapy.Field()
        # Number of people who rated the movie
        number = scrapy.Field()
        # Short synopsis / one-line quote
        content = scrapy.Field()


    三、进入spider文件(cmd中进行)

    scrapy genspider demo movie.douban.com

    创建完成后进入

    编写代码

    # -*- coding: utf-8 -*-
    import scrapy
    from douban.items import DoubanItem
    
    
    class DoubanmovieSpider(scrapy.Spider):
        """Crawl the Douban Top-250 pages and yield one DoubanItem per movie."""
        name = 'doubanmovie'
        allowed_domains = ['movie.douban.com']
        offset = 0
        url = "https://movie.douban.com/top250?start="
        start_urls = (
            url + str(offset),
        )

        def parse(self, response):
            """Parse one listing page; paginate until offset reaches 225.

            Yields DoubanItem instances followed by a Request for the next page.
            """
            # Each movie card lives inside a <div class="info"> node.
            movies = response.xpath("//div[@class='info']")

            for movie in movies:
                # Create a fresh item per movie: reusing a single instance
                # across yields can leak field values between movies.
                item = DoubanItem()

                name = movie.xpath('div[@class="hd"]/a/span/text()').extract()
                message = movie.xpath('div[@class="bd"]/p/text()').extract()
                star = movie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
                number = movie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()
                quote = movie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

                item['title'] = ''.join(name)
                # Some movies have no one-line quote; fall back to ''.
                item['info'] = quote[0] if quote else ''
                # Guard against a missing rating node instead of IndexError.
                item['score'] = star[0] if star else ''
                # Collapse spaces and newlines.  The original code had a raw
                # line break inside the string literal (a SyntaxError); '\n'
                # is the intended separator being removed.
                item['content'] = ';'.join(message).replace(' ', '').replace('\n', '')
                # The second span text is the rating-count string (e.g.
                # "123456人评价" — TODO confirm against the live page); keep
                # digits only.  The original `split('')` always raises
                # ValueError: empty separator is not allowed in Python.
                raw_count = number[1] if len(number) > 1 else ''
                item['number'] = ''.join(ch for ch in raw_count if ch.isdigit())

                yield item

            # Pages start at offsets 0, 25, ..., 225 (10 pages of 25 movies).
            if self.offset < 225:
                self.offset += 25
                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

    四、配置setting文件

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for douban project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'douban'

    SPIDER_MODULES = ['douban.spiders']
    NEWSPIDER_MODULE = 'douban.spiders'


    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # NOTE(review): USER_AGENT is rebound to a *list* further down this file;
    # only the last assignment wins, so this string value is never used.
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"

    # Obey robots.txt rules
    # Left commented out on purpose, so robots.txt is not enforced for this crawl.
    # ROBOTSTXT_OBEY = True

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    # Disabled so requests do not carry a trackable session.
    COOKIES_ENABLED = False

    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
      # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      # 'Accept-Language': 'en',
    }

    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'douban.middlewares.DoubanSpiderMiddleware': 543,
    #}

    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    # RandomUserAgent is a custom middleware (defined in douban/middlewares.py,
    # not shown here) — presumably it picks a random entry from the
    # USER_AGENT list below; verify against the middleware source.
    DOWNLOADER_MIDDLEWARES = {
       # 'douban.middlewares.DoubanDownloaderMiddleware': 100,
       'douban.middlewares.RandomUserAgent': 100,
    }
    
    # Pool of user-agent strings covering most common browsers.
    # NOTE(review): this rebinds USER_AGENT (a string above) to a list;
    # consider a distinct name such as USER_AGENT_LIST, and confirm which
    # setting key the RandomUserAgent middleware actually reads.
    USER_AGENT = [
       # Opera
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
       "Opera/8.0 (Windows NT 5.1; U; en)",
       "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
       # Firefox
       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
       "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
       # Safari
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
       # Chrome
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
       "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
       # 360 Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
       "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
       # Taobao Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
       # Liebao (Cheetah) Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
       # QQ Browser
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
       # Sogou Browser
       "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
       # Maxthon Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
       # UC Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # Lower number runs first: items pass through DoubanWritePipeline (250)
    # before DoubanSqlPipeline (300).
    ITEM_PIPELINES = {
       # 'douban.pipelines.DoubanPipeline': 300,
       'douban.pipelines.DoubanSqlPipeline': 300,
       'douban.pipelines.DoubanWritePipeline': 250,
    }
    # MySQL host name ("IP" is a placeholder — fill in the real address)
    MYSQL_HOST = "IP"
    # MySQL port
    MYSQL_PORT = 3306
    # Database user
    MYSQL_USER = "root"
    # Database password ("Password" is a placeholder)
    MYSQL_PASSWORD = "Password"
    # Database name
    MYSQL_DBNAME = "mydouban"
    # Table that stores the scraped rows
    MYSQL_TABLENAME = "doubanmovies"
    # Connection character set
    MYSQL_CHARSET = "utf8"


    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False

    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    配置管道文件

    # Pipeline registration (lower number = runs earlier in the chain).
    ITEM_PIPELINES = {
    # 'douban.pipelines.DoubanPipeline': 300,
    'douban.pipelines.DoubanSqlPipeline': 300,
    'douban.pipelines.DoubanWritePipeline': 250,
    }

    User_Agent配置(包含了大多数浏览器)

    # Custom downloader middleware (douban/middlewares.py, not shown here)
    # that presumably rotates the user-agent per request — verify there.
    DOWNLOADER_MIDDLEWARES = {
       # 'douban.middlewares.DoubanDownloaderMiddleware': 100,
       'douban.middlewares.RandomUserAgent': 100,
    }
    # NOTE(review): this list shadows any earlier string USER_AGENT setting.
    USER_AGENT = [
       # Opera
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
       "Opera/8.0 (Windows NT 5.1; U; en)",
       "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
       # Firefox
       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
       "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
       # Safari
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
       # Chrome
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
       "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
       # 360 Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
       "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
       # Taobao Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
       # Liebao (Cheetah) Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
       # QQ Browser
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
       # Sogou Browser
       "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
       # Maxthon Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
       # UC Browser
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]

    # MySQL host name ("IP" is a placeholder — fill in the real address)
    MYSQL_HOST = "IP"
    # MySQL port
    MYSQL_PORT = 3306
    # Database user
    MYSQL_USER = "root"
    # Database password ("password" is a placeholder)
    MYSQL_PASSWORD = "password"
    # Database name
    MYSQL_DBNAME = "mydouban"
    # Table that stores the scraped rows
    MYSQL_TABLENAME = "doubanmovies"
    # Connection character set
    MYSQL_CHARSET = "utf8"

    五、管道文件pipelines

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    import pymongo
    import pymysql
    from scrapy import settings
    import json
    import logging
    from pymysql import cursors
    from twisted.enterprise import adbapi
    import time
    import copy
    
    # Pipeline that appends each item to a local JSON-lines file.
    class DoubanWritePipeline(object):
        def __init__(self):
            # Binary mode: we encode explicitly in process_item.
            self.filename = open("douban.json", "wb")

        def process_item(self, item, spider):
            """Serialize one item as a JSON line and pass it on unchanged."""
            # ensure_ascii=False keeps Chinese text human-readable; the
            # original had a raw line break inside the string literal (a
            # SyntaxError) — '\n' is the intended record separator.
            text = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.filename.write(text.encode("utf-8"))
            return item

        def close_spider(self, spider):
            # Fixed typo: the original `colse_spider` was never called by
            # Scrapy, so the output file was never closed/flushed reliably.
            self.filename.close()
    
    # Pipeline that inserts each scraped movie into MySQL (create the
    # database/table beforehand).
    class DoubanSqlPipeline(object):
        def __init__(self):
            # NOTE(review): 'IP' and 'password' are placeholders and the
            # credentials are hard-coded — prefer reading the MYSQL_*
            # values from the Scrapy settings instead.
            self.conn = pymysql.connect(host='IP', user='root',
                                   passwd='password', db='mydouban', charset='utf8')
            self.cur = self.conn.cursor()

        def process_item(self, item, spider):
            """Insert one item into the doubanmovies table."""
            # Parameterized query: the driver quotes/escapes every value,
            # which removes the SQL-injection risk of the original
            # format()-built statement and the dependency on
            # pymysql.escape_string() (removed in modern PyMySQL).
            sql = ("INSERT INTO doubanmovies "
                   "(title,info,score,number,content,createtime) "
                   "VALUES (%s,%s,%s,%s,%s,%s)")
            params = (
                item["title"],
                item["info"],
                item["score"],
                item["number"],
                item["content"],
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            )
            # Reconnect transparently if the connection timed out.
            self.conn.ping(reconnect=True)
            self.cur.execute(sql, params)
            self.conn.commit()
            # Return the item so any later pipeline still receives it
            # (the original returned None here).
            return item

        def close_spider(self, spider):
            self.cur.close()
            self.conn.close()

    运行

    scrapy crawl xxx

    OVER!

    下面展示一下效果...

  • 相关阅读:
    Django Rest Swagger生成api文档
    django 完整日志配置
    django解决跨域请求的问题
    Django REST framework 自定义字段
    Django model 定义属性
    mysql server has gone away的原因
    也谈时间管理和GTD
    MySQL之thread cache
    MySQL之aborted connections和aborted clients
    TokuDB的特点验证
  • 原文地址:https://www.cnblogs.com/jake-jin/p/11359798.html
Copyright © 2011-2022 走看看