zoukankan      html  css  js  c++  java
  • python-scrapy深度爬取

    爬取电影网站

    movie.py

    import scrapy
    from MyProjectDianying.items import MyprojectdianyingItem

    class MovieSpider(scrapy.Spider):
        """Two-level crawl of the 1905.com movie listing.

        ``parse`` reads a listing page, creates one item per movie card and
        requests the movie's detail page; ``parse_href`` adds the detail text
        to that item and yields it. Listing pages ``page``..``last_page`` are
        requested in turn via the ``url`` template.
        """

        name = 'movie'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.1905.com/vod/list/n_1_t_1/o3.html?fr=vodhome_js_lx']

        # Template for the listing pages after the first; %d is the page number.
        url = 'https://www.1905.com/vod/list/n_1_t_1/o3p%d.html'
        # Number of the next listing page to request.
        page = 2
        # Last listing page to request (previously hard-coded as ``page < 4``).
        last_page = 3

        def parse(self, response):
            """Yield one detail-page request per movie card, then the next listing page.

            The partially filled item travels to ``parse_href`` in
            ``Request.meta['item']``.
            """
            for div in response.xpath('//*[@id="content"]/section[4]/div'):
                # .get() returns None/default instead of raising IndexError, so
                # one malformed card no longer aborts the rest of the page.
                href = div.xpath('./a/@href').get()
                if not href:
                    continue
                title = div.xpath('./a/@title').get(default='')

                # Resolve relative / protocol-relative links against the page
                # URL; scrapy.Request rejects URLs without a scheme.
                href = response.urljoin(href)

                item = MyprojectdianyingItem()
                item["href"] = href
                item["title"] = title
                print(title)
                yield scrapy.Request(href, callback=self.parse_href, meta={'item': item})

            if self.page <= self.last_page:
                next_url = self.url % self.page
                self.page += 1
                yield scrapy.Request(next_url, callback=self.parse)

        def parse_href(self, response):
            """Add the detail text to the item received via ``meta`` and yield it."""
            item = response.meta['item']
            # An empty string stands in for a missing introduction node so the
            # item is still emitted instead of being lost to an IndexError.
            item["detail"] = response.xpath(
                '//*[@id="playerBoxIntroCon"]/text()'
            ).get(default='')
            yield item

    items.py

    import scrapy

    class MyprojectdianyingItem(scrapy.Item):
        """Container for one scraped movie."""

        # Link taken from the movie card on the listing page.
        href = scrapy.Field()
        # Title attribute of the same link.
        title = scrapy.Field()
        # Text extracted from the movie's detail page.
        detail = scrapy.Field()

    settings.py

    # User-Agent header sent with every request (adjacent literals concatenate
    # into a single string).
    USER_AGENT = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    )

    # robots.txt rules are not honoured by this project.
    ROBOTSTXT_OBEY = False

    # Only messages at ERROR level and above are logged.
    LOG_LEVEL = 'ERROR'

    # Enabled item pipelines, mapped to their order value.
    ITEM_PIPELINES = {
        'MyProjectDianying.pipelines.MyprojectdianyingPipeline': 300,
    }

    pipelines.py

    class MyprojectdianyingPipeline:
        """Write every scraped movie to ``dianying.txt`` in the working directory.

        The file is opened in ``open_spider`` and closed in ``close_spider``;
        ``process_item`` appends one record per item.
        """

        # Output file handle; None until open_spider runs and after close_spider.
        fp = None

        def open_spider(self, spider):
            """Open the output file for writing, truncating any previous content."""
            self.fp = open('dianying.txt', mode='w', encoding='utf-8')

        def process_item(self, item, spider):
            """Append ``title + href + detail + ' '`` to the file and return the item.

            A field that is missing or None is written as an empty string so a
            single incomplete item cannot raise and stop the export.
            """
            # NOTE(review): fields are concatenated without delimiters and
            # records are separated by a single space; this looks like a lost
            # newline, but the format is kept as-is -- confirm before changing.
            parts = (item.get(key) or '' for key in ("title", "href", "detail"))
            self.fp.write(''.join(parts) + ' ')
            return item

        def close_spider(self, spider):
            """Close the output file if it was opened."""
            if self.fp is not None:
                self.fp.close()
                self.fp = None
  • 相关阅读:
    堆排序
    上线打包不常见错误整理
    ios开发者相关的几个apple邮箱
    App被拒选择回复还是重新提审,如何选择最高效的应对方式?
    iOS证书(.p12)和描述文件(.mobileprovision)申请
    OC与Swift混编
    tableViewCell重用
    tabBar选择不同item设置标题不同颜色
    iOS 关于TouchID指纹解锁的实现
    cocoaPods报错You need at least git version 1.8.5 to use CocoaPods
  • 原文地址:https://www.cnblogs.com/shiyi525/p/14274049.html
Copyright © 2011-2022 走看看