zoukankan html css js c++ java

scrapy爬虫系列之二--翻页爬取及日志的基本用法

功能点：如何翻页爬取信息，如何发送请求，日志的简单使用

爬取网站：腾讯社会招聘网

完整代码：https://files.cnblogs.com/files/bookwed/tencent.zip

主要代码：

job.py

# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem
import logging  # 日志模块

# Module-level logger named after this module (via __name__), so log records
# show which file emitted them.
logger = logging.getLogger(__name__)

class JobSpider(scrapy.Spider):
    """Spider that scrapes job postings and follows the "next page" link.

    Each result row (``<tr class="even">`` / ``<tr class="odd">``) is turned
    into one ``TencentItem``.  After a page has been processed, the spider
    requests the page behind the ``<a id="next">`` link until that link is
    marked as disabled (``class="noactive"``) or is absent.
    """
    name = 'job'
    allowed_domains = ["tencent.com"]
    offset = 0
    baseUrl = "https://hr.tencent.com/position.php?start={}"
    start_urls = [baseUrl.format(offset)]

    def parse(self, response):
        """Yield one item per job row, then a request for the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``TencentItem`` objects for every job row, followed by at most one
            ``scrapy.Request`` for the next listing page.
        """
        # xpath() returns a list of selector objects.
        # Equivalent expression: //tr[@class="even" or @class="odd"]
        job_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for job in job_list:
            item = TencentItem()
            # extract_first() returns the first matched string or None, so a
            # row with an empty cell no longer raises IndexError and aborts
            # the rest of the page (which extract()[0] would do).
            item["name"] = job.xpath("./td[1]/a/text()").extract_first()
            item["url"] = job.xpath("./td[1]/a/@href").extract_first()
            item["type"] = job.xpath("./td[2]/text()").extract_first()
            item["people_number"] = job.xpath("./td[3]/text()").extract_first()
            item["place"] = job.xpath("./td[4]/text()").extract_first()
            item["publish_time"] = job.xpath("./td[5]/text()").extract_first()
            # Logging option 1: logging.warning(item)
            # Logging option 2 (recommended, the record shows which module
            # produced it): use the module-level logger.
            logger.warning(item)
            # yield turns this method into a generator: items are handed over
            # one at a time instead of being accumulated in memory.
            yield item

        # Pagination option 1: build the URL from a running offset.
        # if self.offset < 3090:
        #     self.offset += 10
        #     url = self.baseUrl.format(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        # Pagination option 2: take the next link from the response and keep
        # following it until no further page is available.
        next_disabled = response.xpath("//a[@class='noactive' and @id='next']")
        next_href = response.xpath("//a[@id='next']/@href").extract_first()
        # Guard on next_href as well: a page without any "next" anchor would
        # otherwise pass the "not disabled" check and crash on the lookup.
        if not next_disabled and next_href:
            # urljoin resolves the relative href against the response URL
            # instead of relying on a hard-coded host prefix.
            # callback names the method that parses the returned response;
            # reuse parse when the page has the same layout, otherwise point
            # it at a dedicated method.
            yield scrapy.Request(
                response.urljoin(next_href),
                callback=self.parse,
                # meta={"item": item}    # pass data between parse callbacks
                # dont_filter=True       # allow duplicate requests
            )

    def parse1(self, response):
        """Example callback that prints an item passed through ``meta``.

        Prints ``None`` when the request carried no ``"item"`` entry instead of
        raising ``KeyError``.
        """
        item = response.meta.get("item")
        print(item)
        print("*"*30)

pipelines.py

import json


class TencentPipeline(object):
    """Item pipeline that appends every scraped item to ``tencent_job.json``.

    Each item is written as one JSON object followed by a comma and a
    newline.  The resulting file is therefore a comma-separated list of
    objects, one per line, not a single valid JSON document.
    """

    def __init__(self):
        """Open the output file (truncating any previous content)."""
        # Optional hook: parameter initialisation and similar setup.
        self.f = open('tencent_job.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialise ``item`` and append it to the output file.

        Args:
            item: The scraped item; must be convertible with ``dict(item)``.
            spider: The spider that produced the item (``spider.name`` gives
                its name).  Not used here.

        Returns:
            The unchanged ``item``, so that later pipelines can process it.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file
        # instead of writing \uXXXX escapes.
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def open_spider(self, spider):
        """Optional hook called when the spider is opened; nothing to do."""
        pass

    def close_spider(self, spider):
        """Optional hook called when the spider is closed; closes the file."""
        self.f.close()

查看全文

相关阅读:
树链剖分（模板）洛谷3384
ST表（模板）洛谷3865
IOI 2005 River （洛谷 3354）
IOI 2005 River （洛谷 3354）
poj1094 Sorting It All Out
poj1094 Sorting It All Out
spfa（模板）
HAOI 2006 受欢迎的牛（洛谷2341）
HAOI 2006 受欢迎的牛（洛谷2341）
洛谷1850(NOIp2016) 换教室——期望dp

原文地址：https://www.cnblogs.com/bookwed/p/10617942.html