  • Crawling the 4567kan movie site with the Scrapy framework

    1. Create a spider file named Movie: scrapy genspider Movie www.4567kan.com
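
    For reference, a sketch of the full setup from scratch, assuming the project is named dianying (the package the spider file imports from below):

      scrapy startproject dianying
      cd dianying
      scrapy genspider Movie www.4567kan.com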

    2. Write the spider file:

      

    # -*- coding: utf-8 -*-
    import scrapy
    from dianying.items import DianyingItem


    class MovieSpider(scrapy.Spider):
        name = 'Movie'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.4567kan.com/index.php/vod/show/class/爱情/id/7.html']
        # URL template for the paginated list pages
        url = 'https://www.4567kan.com/index.php/vod/show/class/爱情/id/7/page/%d.html'
        pageNumber = 2  # next page number to request

        def parse(self, response):
            li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
            for li in li_list:
                name = li.xpath('./div/a/@title').extract_first()
                detail_url = 'https://www.4567kan.com' + li.xpath('./div/a/@href').extract_first()
                item = DianyingItem()
                item['name'] = name
                # Crawl each detail page; meta passes the item along to the callback
                yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
            if self.pageNumber <= 5:  # crawl data from the first five pages
                new_url = self.url % self.pageNumber
                self.pageNumber += 1
                yield scrapy.Request(new_url, callback=self.parse)

        def parse_detail(self, response):
            item = response.meta['item']
            desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
            item['desc'] = desc
            yield item  # hand the item off to the pipeline
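
    As a side note, Scrapy 1.7+ also supports cb_kwargs for passing data to a callback, which keeps scraped data out of response.meta (which Scrapy also uses internally). A minimal sketch of the same hand-off under that API, not part of the original post:

    import scrapy
    from dianying.items import DianyingItem


    class MovieCbKwargsSpider(scrapy.Spider):
        name = 'MovieCbKwargs'  # hypothetical spider name for this sketch
        start_urls = ['https://www.4567kan.com/index.php/vod/show/class/爱情/id/7.html']

        def parse(self, response):
            for li in response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li'):
                item = DianyingItem()
                item['name'] = li.xpath('./div/a/@title').extract_first()
                detail_url = 'https://www.4567kan.com' + li.xpath('./div/a/@href').extract_first()
                # Entries in cb_kwargs arrive as keyword arguments of the callback
                yield scrapy.Request(detail_url, callback=self.parse_detail,
                                     cb_kwargs={'item': item})

        def parse_detail(self, response, item):
            item['desc'] = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
            yield item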

    3. Write items.py:
    import scrapy


    class DianyingItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()
        desc = scrapy.Field()
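
    A scrapy.Item behaves like a dict restricted to its declared fields; a quick illustration (the values here are made up):

    from dianying.items import DianyingItem

    item = DianyingItem()
    item['name'] = 'Some Movie'
    print(item['name'])   # Some Movie
    print(dict(item))     # {'name': 'Some Movie'}
    # item['rating'] = 5  # would raise KeyError: only declared fields are allowed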


    4. Write the pipeline (pipelines.py):
    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql


    class DianyingPipeline:
        conn = None
        c = None

        def open_spider(self, spider):
            self.conn = pymysql.connect(user='root', password='123456', host='localhost',
                                        port=3306, database='xuezhijun', charset='utf8')
            self.c = self.conn.cursor()

        def process_item(self, item, spider):
            name = item['name']
            desc = item['desc']
            try:
                self.c.execute('insert into DY values (%s, %s)', (name, desc))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item

        def close_spider(self, spider):
            self.c.close()
            self.conn.close()
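
    The INSERT in process_item assumes a DY table with two text columns already exists in the xuezhijun database. A minimal sketch of creating it with pymysql; the column names are assumptions, since the original statement does not name them (desc is a reserved word in MySQL, hence the backticks):

    import pymysql

    conn = pymysql.connect(user='root', password='123456', host='localhost',
                           port=3306, database='xuezhijun', charset='utf8')
    with conn.cursor() as c:
        # Two columns, matching the (name, desc) order used by the pipeline's INSERT
        c.execute('CREATE TABLE IF NOT EXISTS DY (name VARCHAR(255), `desc` TEXT) DEFAULT CHARSET=utf8')
    conn.commit()
    conn.close()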

    5. Adjust the settings file (settings.py):
    ITEM_PIPELINES = {
        'dianying.pipelines.DianyingPipeline': 300,
    }
    ROBOTSTXT_OBEY = False
    LOG_LEVEL = 'ERROR'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
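
    With all of the above in place, the spider can be run from the project directory using the name attribute defined in the spider class:

      scrapy crawl Movie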

