zoukankan      html  css  js  c++  java
  • 爬取虎牙标题、作者、热度

    # -*- coding: utf-8 -*-
    import scrapy
    from huyaAll1.items import Huyaall1Item


    class HuyaSpider(scrapy.Spider):
        """Spider that collects title, author and popularity from a Huya live list.

        The first page is the HTML page listed in ``start_urls``; follow-up
        pages are requested from the ``cache.php`` endpoint described by ``url``.
        """

        name = 'huya'
        # allowed_domains = ['www.xxx.com']
        start_urls = ['https://www.huya.com/g/xingxiu']

        # Generic URL template for the follow-up pages; ``%d`` is replaced
        # with the page number.
        url = "https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&gameId=1663&tagAll=0&page=%d"

        def parse(self, response):
            """Parse the start page, then schedule requests for pages 2-4.

            Yields one ``Huyaall1Item`` per ``<li>`` found under the element
            with id ``js-live-list``, followed by one ``scrapy.Request`` per
            follow-up page, each handled by ``parse_othor``.
            """
            for li in response.xpath('//*[@id="js-live-list"]/li'):
                # Build one item per list entry.
                item = Huyaall1Item()
                item['title'] = li.xpath('./a[2]/text()').extract_first()
                item['author'] = li.xpath('./span/span[1]/i/text()').extract_first()
                item['hot'] = li.xpath('./span/span[2]/i[2]/text()').extract_first()
                yield item

            # Manually send GET requests for the remaining pages. The page
            # number must come from the loop variable: a constant here would
            # request the same URL on every iteration.
            for page in range(2, 5):
                yield scrapy.Request(url=self.url % page, callback=self.parse_othor)

        def parse_othor(self, response):
            """Callback for the follow-up page requests; prints the raw body.

            Takes the same ``response`` parameter as ``parse``, as any request
            callback must.
            """
            print(response.text)
  • 相关阅读:
    sh_09_字典的定义
    sh_08_格式化字符串
    sh_07_元组遍历
    sh_06_元组基本使用
    sh_05_列表遍历
    sh_04_列表排序
    sh_03_列表的数据统计
    图片懒加载
    UA池和ip代理池
    爬虫篇 --- 分布式爬虫
  • 原文地址:https://www.cnblogs.com/zhang-da/p/12432110.html
Copyright © 2011-2022 走看看