  • Scraping Python job listings with Scrapy

    • Use the Scrapy framework to crawl Python job postings from 前程无忧 (51job).
    1. Create a cmd file, star.cmd, containing:
    scrapy startproject Jobs
    cd Jobs
    scrapy genspider Job51Spider www.51job.com
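
    Running star.cmd should leave the standard Scrapy project layout, shown here for orientation (the file names are what startproject/genspider generate; middlewares.py is created too even though this post doesn't touch it):
    Jobs/
        scrapy.cfg
        Jobs/
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                Job51Spider.py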
    
    2. Open the Jobs project in your editor.
    • Open /spiders/Job51Spider.py and write:
    # -*- coding: utf-8 -*-
    import json
    import re
    import time
    
    from scrapy import Spider, Request
    import requests
    
    from Jobs.items import Job51Item
    
    
    class Job51spiderSpider(Spider):
        name = 'Job51Spider'
        allowed_domains = ['www.51job.com']
        start_urls = ['http://www.51job.com/']
    
        # Configure the search city and the search keyword
        kw = 'python'
        sou_url = 'https://search.51job.com/list/{city_code},000000,0000,00,9,99,{kw},2,1.html'
        # JS file that maps city names to 51job city codes
        city_codings_url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20180319'
    
        def start_requests(self):
            # Fetch the city-code mapping and loop over the cities to search
            cities = self.get_url_citycods()
            forcity = list(cities)[:2]  # take only the first two cities here
            for city in forcity:
                yield Request(
                    self.sou_url.format(city_code=cities[city], kw=self.kw),
                    callback=self.parse_jobs,
                    meta={'city': city}
                )
    
        def parse_jobs(self, response):
            city = response.meta['city']
            els = response.css('.dw_table .el')[1:]
            # import ipdb; ipdb.set_trace()
            for el in els:
                item = Job51Item()
                item['soucity'] = city
                item['pname'] = el.css('span a::text').extract_first().strip()
                item['purl'] = el.css('span a::attr(href)').extract_first().strip()
                item['cname'] = el.css('span.t2 a::text').extract_first().strip()
                item['curl'] = el.css('span.t2 a::attr(href)').extract_first().strip()
                item['address'] = el.css('span.t3::text').extract_first().strip()
                item['pay'] = el.css('span.t4::text').extract_first()
                item['retime'] = el.css('span.t5::text').extract_first().strip()
                yield item
    
            next_page = response.css('.bk a::text')[-1].extract().strip()
            # import ipdb;ipdb.set_trace()
            if next_page == '下一页':  # '下一页' means "Next page"
                next_url = response.css('.bk a::attr(href)')[-1].extract().strip()
                yield Request(url=next_url, callback=self.parse_jobs, dont_filter=True, meta={'city': city})
    
        # Fetch the mapping of city names to city codes
        def get_url_citycods(self):
            area_text = requests.get(self.city_codings_url).text
            ss = re.search('({.*})', area_text, re.S)
            st = ss.group(1)
            st_dict = json.loads(st)
            # Swap keys and values so the dict maps city name -> city code
            in_dict = {}
            # for k in st_dict:
            #     in_dict[st_dict[k]] = k
            # with open('data.json', 'wt', encoding='utf-8') as fs:
            #     json.dump(in_dict, fs, indent=4, ensure_ascii=False)
            # Keep only the major cities, whose area codes contain '0000'
            in_dict.clear()
            for k in st_dict:
                if '0000' in k:
                    in_dict[st_dict[k]] = k
            with open('city_big.json', 'wt', encoding='utf-8') as fs:
                json.dump(in_dict, fs, indent=4, ensure_ascii=False)
            return in_dict
    
        def parse(self, response):
            # Default callback generated by genspider; not used here, parse_jobs handles responses
            pass
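
    The regex-and-invert step in get_url_citycods is easier to see on a small, self-contained sample. The snippet below is only a sketch: the area_array_c.js content shown is a made-up excerpt in the shape the spider assumes (a JS variable assigned a JSON object mapping code -> city name); the real file may differ in detail.
    import json
    import re

    # Hypothetical excerpt in the format the spider expects from area_array_c.js
    area_text = 'var area = {"010000": "北京", "020000": "上海", "030200": "广州"};'

    # Grab the {...} literal and parse it as JSON (code -> name)
    st_dict = json.loads(re.search('({.*})', area_text, re.S).group(1))

    # Invert to name -> code, keeping only codes that contain '0000' (major cities)
    in_dict = {name: code for code, name in st_dict.items() if '0000' in code}
    print(in_dict)  # {'北京': '010000', '上海': '020000'}

    # The chosen code is then substituted into the search URL used by the spider:
    sou_url = 'https://search.51job.com/list/{city_code},000000,0000,00,9,99,{kw},2,1.html'
    print(sou_url.format(city_code=in_dict['北京'], kw='python'))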
    
    • In items.py:
    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    from scrapy import Item, Field
    
    
    class Job51Item(Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        # city the search was run for
        soucity = Field()
        # job title
        pname = Field()
        # job posting URL
        purl = Field()
        # company name
        cname = Field()
        # company page URL
        curl = Field()
        # work location
        address = Field()
        # salary
        pay = Field()
        # posting date
        retime = Field()
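
    A Scrapy Item behaves like a dict, which is what makes the dict(item) call in the pipeline below work. A quick standalone check (run from the project root so that Jobs.items is importable; the values are made up, not real scraped data):
    from Jobs.items import Job51Item

    item = Job51Item()
    item['soucity'] = '北京'
    item['pname'] = 'Python开发工程师'
    item['pay'] = '1.5-2万/月'

    print(dict(item))         # only the fields that were set show up
    print(item.get('cname'))  # None -- unset fields are simply absent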
        
    
    • In pipelines.py, store the items in MongoDB:
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    from pymongo import MongoClient
    
    class Job51Pipeline(object):
        
        job51s = 'job51'  # MongoDB collection name
    
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DB')
            )
    
        def open_spider(self, spider):
            self.client = MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    
        def close_spider(self, spider):
            self.client.close()
    
        def process_item(self, item, spider):
            self.db[self.job51s].insert_one(dict(item))
            return item 
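
    After a crawl, the stored data can be sanity-checked directly with pymongo. The connection values below mirror the MONGO_URI / MONGO_DB settings from the next step and the 'job51' collection name used by the pipeline:
    from pymongo import MongoClient

    client = MongoClient('localhost')         # same value as MONGO_URI
    coll = client['jobsconnection']['job51']  # MONGO_DB / collection name

    print(coll.count_documents({}))           # how many postings were stored
    print(coll.find_one({}, {'_id': 0}))      # peek at one stored document
    client.close()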
    
    • Finally, configure settings.py:
    FEED_EXPORT_ENCODING = 'utf-8'
    MONGO_URI = 'localhost'
    MONGO_DB = 'jobsconnection'
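
    The pipeline still has to be switched on. Assuming the default module path that startproject generated for this project, that means also adding to settings.py:
    ITEM_PIPELINES = {
        'Jobs.pipelines.Job51Pipeline': 300,
    }

    With that in place, the spider can be run from the project directory:
    scrapy crawl Job51Spider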
    

    For reference and learning purposes only.

  • Original post: https://www.cnblogs.com/yymor/p/10243495.html