  • Scrapy basics: crawling Lagou job postings with CrawlSpider and saving them to MySQL (distributed):

    items.py

    import scrapy


    class LagouItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        # id
        # obj_id = scrapy.Field()
        # position name
        positon_name = scrapy.Field()
        # work location
        work_place = scrapy.Field()
        # publish date
        publish_time = scrapy.Field()
        # salary
        salary = scrapy.Field()
        # work experience
        work_experience = scrapy.Field()
        # education
        education = scrapy.Field()
        # full_time
        full_time = scrapy.Field()
        # tags
        tags = scrapy.Field()
        # company name
        company_name = scrapy.Field()
        # # industry
        # industry = scrapy.Field()
        # job perks
        job_temptation = scrapy.Field()
        # job description
        job_desc = scrapy.Field()
        # company logo URL
        logo_image = scrapy.Field()
        # business field
        field = scrapy.Field()
        # growth stage
        stage = scrapy.Field()
        # company size
        company_size = scrapy.Field()
        # company homepage
        home = scrapy.Field()
        # job publisher
        job_publisher = scrapy.Field()
        # investors
        financeOrg = scrapy.Field()
        # crawl time
        crawl_time = scrapy.Field()

    lagou.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from LaGou.items import LagouItem
    from LaGou.utils.MD5 import get_md5
    from datetime import datetime
    
    
    class LagouSpider(CrawlSpider):
        name = 'lagou'
        allowed_domains = ['lagou.com']
        start_urls = ['https://www.lagou.com/zhaopin/']
        content_links = LinkExtractor(allow=(r"https://www.lagou.com/jobs/\d+\.html"))
        page_links = LinkExtractor(allow=(r"https://www.lagou.com/zhaopin/\d+"))
        rules = (
            Rule(content_links, callback="parse_item", follow=False),
            Rule(page_links,follow=True)
        )
    
        def parse_item(self, response):
            item = LagouItem()
            # use the URL of the company's Lagou page as the ID
            # item["obj_id"] = get_md5(response.url)
            # company name
            item["company_name"] = response.xpath('//dl[@class="job_company"]//a/img/@alt').extract()[0]
            # position
            item["positon_name"] = response.xpath('//div[@class="job-name"]//span[@class="name"]/text()').extract()[0]
            # salary
            item["salary"] = response.xpath('//dd[@class="job_request"]//span[1]/text()').extract()[0]
            # work location
            work_place = response.xpath('//dd[@class="job_request"]//span[2]/text()').extract()[0]
            item["work_place"] = work_place.replace("/", "")
            # work experience
            work_experience = response.xpath('//dd[@class="job_request"]//span[3]/text()').extract()[0]
            item["work_experience"] = work_experience.replace("/", "")
            # education
            education = response.xpath('//dd[@class="job_request"]//span[4]/text()').extract()[0]
            item["education"] = education.replace("/", "")
            # full_time
            item['full_time'] = response.xpath('//dd[@class="job_request"]//span[5]/text()').extract()[0]
            # tags
            tags = response.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()').extract()
            item["tags"] = ",".join(tags)
            # publish_time
            item["publish_time"] = response.xpath('//dd[@class="job_request"]//p[@class="publish_time"]/text()').extract()[0]
            # job perks
            job_temptation = response.xpath('//dd[@class="job-advantage"]/p/text()').extract()
            item["job_temptation"] = ",".join(job_temptation)
            # job description (strip non-breaking spaces)
            job_desc = response.xpath('//dd[@class="job_bt"]/div//p/text()').extract()
            item["job_desc"] = ",".join(job_desc).replace("\xa0", "").strip()
            # job_publisher
            item["job_publisher"] = response.xpath('//div[@class="publisher_name"]//span[@class="name"]/text()').extract()[0]
            # company logo URL
            logo_image = response.xpath('//dl[@class="job_company"]//a/img/@src').extract()[0]
            item["logo_image"] = logo_image.replace("//", "")
            # business field
            field = response.xpath('//ul[@class="c_feature"]//li[1]/text()').extract()
            item["field"] = "".join(field).strip()
            # growth stage
            stage = response.xpath('//ul[@class="c_feature"]//li[2]/text()').extract()
            item["stage"] = "".join(stage).strip()
            # investors
            financeOrg = response.xpath('//ul[@class="c_feature"]//li[3]/p/text()').extract()
            if financeOrg:
                item["financeOrg"] = "".join(financeOrg)
            else:
                item["financeOrg"] = ""
            # company size: with an investor entry present it sits in li[4], otherwise in li[3]
            if financeOrg:
                company_size = response.xpath('//ul[@class="c_feature"]//li[4]/text()').extract()
                item["company_size"] = "".join(company_size).strip()
            else:
                company_size = response.xpath('//ul[@class="c_feature"]//li[3]/text()').extract()
                item["company_size"] = "".join(company_size).strip()
            # company homepage
            item["home"] = response.xpath('//ul[@class="c_feature"]//li/a/@href').extract()[0]
            # crawl time
            item["crawl_time"] = datetime.now()

            yield item
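
    lagou.py imports get_md5 from LaGou.utils.MD5, a helper module the post never shows. A minimal sketch of what it presumably contains (assumed here: a thin wrapper around hashlib.md5 that turns a URL into a hex digest usable as a primary key) would be:

    # LaGou/utils/MD5.py -- not shown in the original post; a minimal sketch
    import hashlib


    def get_md5(url):
        # hashlib works on bytes, so encode str input first
        if isinstance(url, str):
            url = url.encode("utf-8")
        return hashlib.md5(url).hexdigest()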

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import pymysql
    class LagouPipeline(object):
    
        def process_item(self, item, spider):
            # note: a new MySQL connection is opened and closed for every item
            con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801", db="lagou", charset="utf8")
            cur = con.cursor()
            # parameterized insert: pymysql fills the %s placeholders safely
            sql = ("insert into lagouwang(company_name,positon_name,salary,work_place,work_experience,education,full_time,tags,publish_time,job_temptation,job_desc,job_publisher,logo_image,field,stage,financeOrg,company_size,home,crawl_time)"
                   " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            lis = (item["company_name"], item["positon_name"], item["salary"], item["work_place"], item["work_experience"], item["education"], item['full_time'], item["tags"], item["publish_time"], item["job_temptation"], item["job_desc"], item["job_publisher"], item["logo_image"], item["field"], item["stage"], item["financeOrg"], item["company_size"], item["home"], item["crawl_time"])
            cur.execute(sql, lis)
            con.commit()
            cur.close()
            con.close()
    
            return item
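
    Opening a fresh MySQL connection for every item works but is wasteful at scale. A sketch of a variant (not from the original post) that opens one connection per crawl via Scrapy's open_spider/close_spider hooks, writing to the same lagouwang table:

    import pymysql


    class LagouMySQLPipeline(object):
        # hypothetical variant of LagouPipeline: one connection per crawl

        def open_spider(self, spider):
            # called once when the spider starts
            self.con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801",
                                       db="lagou", charset="utf8")
            self.cur = self.con.cursor()

        def close_spider(self, spider):
            # called once when the spider finishes
            self.cur.close()
            self.con.close()

        def process_item(self, item, spider):
            # build the column list and placeholders from the item's own fields
            data = dict(item)
            cols = ",".join(data.keys())
            placeholders = ",".join(["%s"] * len(data))
            self.cur.execute("insert into lagouwang(%s) VALUES (%s)" % (cols, placeholders),
                             list(data.values()))
            self.con.commit()
            return item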

    middlewares.py (random User-Agent switching only; no IP proxy is used, though a possible sketch of one follows the block below)

    import random
    from LaGou.settings import USER_AGENTS
    
    
    class RandomUserAgent(object):
        def process_request(self, request, spider):
            # pick a random User-Agent from settings.USER_AGENTS for each request
            useragent = random.choice(USER_AGENTS)
            request.headers.setdefault("User-Agent", useragent)
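
    If an IP proxy were added, a minimal downloader middleware could look like the following sketch (PROXIES is a hypothetical setting, not in the project's settings.py):

    import random

    from LaGou.settings import PROXIES  # hypothetical list like ["http://1.2.3.4:8080", ...]


    class RandomProxy(object):
        def process_request(self, request, spider):
            # route each request through a randomly chosen proxy
            request.meta["proxy"] = random.choice(PROXIES)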

    settings.py

    BOT_NAME = 'LaGou'
    
    SPIDER_MODULES = ['LaGou.spiders']
    NEWSPIDER_MODULE = 'LaGou.spiders'
    ROBOTSTXT_OBEY = False
    # throttle requests and disable cookies to reduce the chance of being blocked
    DOWNLOAD_DELAY = 5
    COOKIES_ENABLED = False
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
       ]
    DOWNLOADER_MIDDLEWARES = {
          'LaGou.middlewares.RandomUserAgent': 1,
    #    'LaGou.middlewares.MyCustomDownloaderMiddleware': 543,
    }
    ITEM_PIPELINES = {
          #'scrapy_redis.pipelines.RedisPipeline':300,
    
        'LaGou.pipelines.LagouPipeline': 300,
    }

    main.py (used to launch the spider for debugging)

    # coding=utf-8
    from scrapy.cmdline import execute

    execute(["scrapy", "crawl", "lagou"])

    Adding the following to settings.py makes the crawl distributed, with scraped data saved in Redis; see the earlier chapters for how to pull the data back out of Redis (a short sketch also follows the snippet below).

    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    SCHEDULER_PERSIST = True
    ITEM_PIPELINES = {
          'scrapy_redis.pipelines.RedisPipeline':300,
    
        #'LaGou.pipelines.LagouPipeline': 300,
    }
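
    By default scrapy_redis's RedisPipeline serializes each item to JSON and pushes it onto the Redis list named <spider>:items, here lagou:items. A minimal consumer (a sketch, assuming a local Redis and the same lagouwang MySQL table as above) could pop items and insert them into MySQL like this:

    import json

    import pymysql
    import redis

    r = redis.Redis(host="127.0.0.1", port=6379)
    con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801",
                          db="lagou", charset="utf8")
    cur = con.cursor()

    while True:
        # blocking pop from the list RedisPipeline writes to
        _, data = r.blpop("lagou:items")
        item = json.loads(data)
        cols = ",".join(item.keys())
        placeholders = ",".join(["%s"] * len(item))
        cur.execute("insert into lagouwang(%s) VALUES (%s)" % (cols, placeholders),
                    list(item.values()))
        con.commit()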

    Key techniques used: CrawlSpider (LinkExtractor, Rule), content extraction (xpath, extract), string handling (join, replace, strip, split), random User-Agent switching, and so on.

     

  • Original post: https://www.cnblogs.com/huwei934/p/6978320.html