  • Scraping Python job postings with Scrapy

    • Use the Scrapy framework to crawl Python job postings from 51job (前程无忧)
    1. Create a cmd file, star.cmd, containing:
    scrapy startproject Jobs
    cd Jobs
    scrapy genspider Job51Spider www.51job.com
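
    Running star.cmd (or typing those commands in a shell) should leave you with the standard Scrapy project layout, roughly as follows (assuming the default project templates):

    Jobs/
        scrapy.cfg
        Jobs/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                Job51Spider.py    # created by the genspider command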
    
    2. Open the Jobs project in your editor
    • Open /spiders/Job51Spider.py and write the following:
    # -*- coding: utf-8 -*-
    import json
    import re
    import time
    
    from scrapy import Spider, Request
    import requests
    
    from Jobs.items import Job51Item
    
    
    class Job51spiderSpider(Spider):
        name = 'Job51Spider'
        allowed_domains = ['www.51job.com']
        start_urls = ['http://www.51job.com/']
    
        # Search city and keyword to use
        kw = 'python'
        sou_url = 'https://search.51job.com/list/{city_code},000000,0000,00,9,99,{kw},2,1.html'
        # JS file that maps city names to 51job city codes
        city_codings_url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20180319'
    
        def start_requests(self):
            # fetch the city-code mapping and loop over the cities
            cities = self.get_url_citycods()
            forcity = list(cities)[:2]  # only take the first two cities here
            for city in forcity:
                yield Request(
                    self.sou_url.format(city_code=cities[city], kw=self.kw),
                    callback=self.parse_jobs,
                    meta={'city': city}
                )
    
        def parse_jobs(self, response):
            city = response.meta['city']
            # each .el row in the results table is one job posting; skip the header row
            els = response.css('.dw_table .el')[1:]
            # import ipdb; ipdb.set_trace()
            for el in els:
                item = Job51Item()
                item['soucity'] = city
                item['pname'] = el.css('span a::text').extract_first().strip()
                item['purl'] = el.css('span a::attr(href)').extract_first().strip()
                item['cname'] = el.css('span.t2 a::text').extract_first().strip()
                item['curl'] = el.css('span.t2 a::attr(href)').extract_first().strip()
                item['address'] = el.css('span.t3::text').extract_first().strip()
                item['pay'] = el.css('span.t4::text').extract_first()
                item['retime'] = el.css('span.t5::text').extract_first().strip()
                yield item
    
            # the last link in the pager reads '下一页' ("next page") while more pages remain
            next_page = response.css('.bk a::text')[-1].extract().strip()
            # import ipdb;ipdb.set_trace()
            if next_page == '下一页':
                next_url = response.css('.bk a::attr(href)')[-1].extract().strip()
                yield Request(url=next_url, callback=self.parse_jobs, dont_filter=True, meta={'city': city})
    
        # Fetch the mapping from city name to 51job city code
        def get_url_citycods(self):
            area_text = requests.get(self.city_codings_url).text
            # the JS file assigns a JSON-like object; grab everything between the outer braces
            ss = re.search('({.*})', area_text, re.S)
            st = ss.group(1)
            st_dict = json.loads(st)
            # swap keys and values so the dict maps city name -> code
            in_dict = {}
            # for k in st_dict:
            #     in_dict[st_dict[k]] = k
            # with open('data.json', 'wt', encoding='utf-8') as fs:
            #     json.dump(in_dict, fs, indent=4, ensure_ascii=False)
            # keep only the major cities (codes containing '0000')
            in_dict.clear()
            for k in st_dict:
                if '0000' in k:
                    in_dict[st_dict[k]] = k
            with open('city_big.json', 'wt', encoding='utf-8') as fs:
                json.dump(in_dict, fs, indent=4, ensure_ascii=False)
            return in_dict
    
        # not used here: start_requests sends its requests with parse_jobs as the callback
        def parse(self, response):
            pass
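
    The city codes come from area_array_c.js, which at the time of writing is essentially a JS assignment wrapping a JSON object. Its exact contents can change, so the snippet below is only a minimal offline sketch of the extraction logic in get_url_citycods, run against a made-up sample string:

    import json
    import re

    # hypothetical sample mimicking the shape of area_array_c.js;
    # the real file served by 51job may differ
    sample_js = 'var area_array_c = {"010000": "北京", "020000": "上海", "030200": "广州"};'

    match = re.search('({.*})', sample_js, re.S)
    city_map = json.loads(match.group(1))

    # keep only the province/municipality-level codes (those containing '0000')
    # and swap keys and values so the result maps name -> code
    big_cities = {name: code for code, name in city_map.items() if '0000' in code}
    print(big_cities)  # {'北京': '010000', '上海': '020000'}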
    
    • In items.py:
    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    from scrapy import Item, Field
    
    
    class Job51Item(Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        # city that was searched
        soucity = Field()
        # job title
        pname = Field()
        # job posting URL
        purl = Field()
        # company name
        cname = Field()
        # company URL
        curl = Field()
        # work location
        address = Field()
        # salary
        pay = Field()
        # posting date
        retime = Field()
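
    A Job51Item behaves like a dict, which is what lets the pipeline below call dict(item) before inserting it into MongoDB. A quick illustration with made-up values (run from the project root so Jobs.items is importable):

    from Jobs.items import Job51Item

    item = Job51Item()
    item['pname'] = 'Python开发工程师'  # hypothetical job title
    item['pay'] = '1-1.5万/月'          # hypothetical salary string
    print(dict(item))  # {'pname': 'Python开发工程师', 'pay': '1-1.5万/月'}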
        
    
    • In pipelines.py, store the items in MongoDB:
    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    from pymongo import MongoClient
    
    class Job51Pipeline(object):
        
        # MongoDB collection name
        job51s = 'job51'
    
        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DB')
            )
    
        def open_spider(self, spider):
            self.client = MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    
        def close_spider(self, spider):
            self.client.close()
    
        def process_item(self, item, spider):
            self.db[self.job51s].insert_one(dict(item))
            return item 
    
    • Finally, add the following to settings.py:
    FEED_EXPORT_ENCODING = 'utf-8'
    MONGO_URI = 'localhost'
    MONGO_DB = 'jobsconnection'
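
    For Job51Pipeline to actually run, ITEM_PIPELINES also has to reference it in settings.py; a minimal sketch (the priority value 300 is arbitrary):

    ITEM_PIPELINES = {
        'Jobs.pipelines.Job51Pipeline': 300,
    }

    The crawl can then be started from the project root with scrapy crawl Job51Spider; adding -o jobs.json additionally exports the items to a JSON file, which FEED_EXPORT_ENCODING keeps readable as UTF-8. Depending on the site's robots.txt, ROBOTSTXT_OBEY may also need to be set to False in settings.py for the requests to go through.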
    

    For study and reference only.

  • Original article: https://www.cnblogs.com/yymor/p/10243495.html