  • Scraping Lagou job data with Scrapy

    It's actually quite simple, but a few small snags cost me a fair amount of time, so here is a brief record for future reference.

    >> scrapy startproject lagou
    >> cd lagou
    >> scrapy genspider lagou_jd www.lagou.com
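
    After these two commands the project should have the usual Scrapy layout, roughly as below (file names may differ slightly across Scrapy versions; the spider file is named after the genspider argument):

    lagou/
        scrapy.cfg
        lagou/
            __init__.py
            items.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                lagou_jd.py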

    Define the item

    Flesh out the definition in items.py:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class LagouItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        job_title = scrapy.Field()
        job_description = scrapy.Field()
        job_url = scrapy.Field()
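
    A scrapy.Item behaves like a dict, which the pipeline below relies on when it calls dict(item); a quick sketch of that behaviour, with made-up field values:

    from lagou.items import LagouItem

    item = LagouItem()
    item["job_title"] = u"产品经理"  # fields are assigned dict-style (sample value)
    item["job_url"] = "http://www.lagou.com/jobs/787409.html"
    print item["job_title"]
    print "job_description" in item  # False until the field is set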

    Flesh out the spider

    # -*- coding: utf-8 -*-
    from scrapy.selector import Selector
    from scrapy.contrib.spiders import CrawlSpider,Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from lagou.items import LagouItem
    import codecs,re
    from bs4 import BeautifulSoup
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class LagoujdSpider(CrawlSpider):
        name = "lagoujd"
        allowed_domains = ["lagou.com"]
        start_urls = (
            'http://www.lagou.com/jobs/787409.html',
        )
    
        rules = [
            Rule(SgmlLinkExtractor(allow=r'jobs/\d+\.html'), callback='parse_lagou', follow=True),
        ]
        
    
    
        def parse_lagou(self, response):  # deliberately renamed: a CrawlSpider must not override the default parse!
            self.SPLIT_DEMAND = re.compile(u'(要求|资格|条件)[::;\n]?')
            self.SPLIT_LINE = re.compile(u'[;;。\n]')
            self.DEMAND = re.compile(u'具备|熟悉|具有|熟练|掌握|良好的|能够|丰富的|以上学历|优秀的|有深入研究|有很强的|工作经历|工作经验|善于|懂得|优先|不少于|不超过|喜欢|较强的.{2,8}能力|相关专业|相关学历|开发经验|实习经验|\d年以上')
    
            item = LagouItem()
            sel = Selector(response)
            try:
                item["job_title"] =sel.xpath("//title/text()").extract()[0].split('-')[0][:-2].strip()
                job_des = sel.xpath('//*[@id="container"]/div[1]/div[1]/dl[1]/dd[2]').extract()[0]
                job_des = BeautifulSoup(job_des).get_text()
                item["job_description"] = self.get_demand(job_des)
                item["job_url"] = response.url
                print item['job_title']
            except Exception,e:
                print e
           # if item.has_key("job_title") and item.has_key("job_description"):
           #     with codecs.open("./output/"+item["job_title"].strip()+".txt",'a','utf-8') as fw:
           #         fw.write(item["job_description"])
           #         print item["job_title"],"done"
            
            
            return item
    
        def get_demand(self,jdstr):
    
            res = []
            if self.SPLIT_DEMAND.search(jdstr):
                pos = self.SPLIT_DEMAND.search(jdstr).span()[1]
                linelist =self.SPLIT_LINE.split(jdstr[pos:])
                for line in linelist:
                    if len(line)<5:continue
                    if re.match(r'\d', line.strip()):
                        res.append(line)
                    elif self.DEMAND.search(line):
                        res.append(line)
                    else:
                        break
            return '\n'.join(res)
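
    To see what get_demand is doing, here is a minimal standalone sketch of the same heuristic outside Scrapy (the sample JD text and the trimmed-down DEMAND pattern are made up for illustration): it finds the "requirements" marker, splits what follows into lines, and keeps lines that start with a number or hit a demand keyword, stopping at the first line that does neither.

    # -*- coding: utf-8 -*-
    import re

    SPLIT_DEMAND = re.compile(u'(要求|资格|条件)[::;\n]?')
    SPLIT_LINE = re.compile(u'[;;。\n]')
    DEMAND = re.compile(u'具备|熟悉|掌握|优先|\d年以上')

    def get_demand(jdstr):
        res = []
        m = SPLIT_DEMAND.search(jdstr)
        if m:
            for line in SPLIT_LINE.split(jdstr[m.span()[1]:]):
                if len(line) < 5:
                    continue
                if re.match(u'\d', line.strip()) or DEMAND.search(line):
                    res.append(line)
                else:
                    break  # the requirements block has ended
        return u'\n'.join(res)

    jd = u"岗位职责:负责产品规划。任职要求:1、3年以上产品经验;2、熟悉Axure;另外福利从优。"
    print get_demand(jd)  # keeps the two numbered requirement lines, drops the rest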

    Store the scraped data as JSON

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import json
    import codecs
    
    class LagouPipeline(object):
        def __init__(self):
            self.file = codecs.open('lagou_jd.json','w',encoding='utf-8')
    
    
        def process_item(self, item, spider):
            line = json.dumps(dict(item), ensure_ascii=False) + '\n'
            self.file.write(line)
            return item
        
    
        def close_spider(self, spider):  # Scrapy calls close_spider automatically; a method named spider_closed would never fire without connecting the signal
            self.file.close()
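
    Since each item is written out as one JSON object per line (JSON Lines), reading the result back is one json.loads per record; a small sketch, assuming the lagou_jd.json produced above:

    # -*- coding: utf-8 -*-
    import json
    import codecs

    for line in codecs.open('lagou_jd.json', encoding='utf-8'):
        jd = json.loads(line)
        print jd['job_title'], jd['job_url']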

    Register the pipeline in settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for lagou project
    #
    # For simplicity, this file contains only the most important settings by
    # default. All the other settings are documented here:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #
    
    BOT_NAME = 'lagou'
    
    SPIDER_MODULES = ['lagou.spiders']
    NEWSPIDER_MODULE = 'lagou.spiders'
    
    ITEM_PIPELINES = {
        'lagou.pipelines.LagouPipeline':300,
    }
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'lagou (+http://www.yourdomain.com)'
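
    If Lagou throttles or blocks the default Scrapy user agent, the standard settings knobs can help; an illustrative sketch (the values here are assumptions, not from the original setup):

    # pretend to be a regular browser and slow the crawl down a bit (illustrative values)
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'
    DOWNLOAD_DELAY = 1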

    Run it, and crawl away!

    >> scrapy crawl lagou_jd
    or
    >> scrapy crawl lagou_jd -o item.json -t json

    Sample output:

    jkmiao@jkmiao-ipin:~/workplace/spiders/lagou$ more lagou_jd.json
    {"job_url": "http://www.lagou.com/jobs/1102051.html", "job_description": "1、具有2年以上互联网产品经验,优秀的交互设计能力,对产品设计有极高的要求,追求极致的用户体验 2、善于观察与学习,具有宽广的视野、对于整体产品规划有自己的见解和理念 3、有优秀缜密的逻辑与思维能力,良好的协调能力、分析、计划及项目管理能力,具备良好的团队精神,沟通能力强 4、熟练使用 Axure 、 visio 、 office 等软件\n5、有成熟的O2O平台类产品设计经验者优先", "job_title": "产品经理"}
    {"job_url": "http://www.lagou.com/jobs/917776.html", "job_description": "1、有2年以上互联网产品规划和体验设计相关经验,熟悉互联网或软件产品整体实现过程,包括从需求分析到产品发布\n2、有完整策划至少2个以上成功、目前尚在运营中的互联网产品设计案例\n3、能通过数据分析等系统性方法深刻理解用户需求并予以满足\n4、执行力强,善于组织协调并推动项目进展\n5、对工作充满热情,富有创新精神,能承受较大的工作压力\n6、有良好的学习能力、良好的沟通能力和团队合作精神,出色的组织能力", "job_title": "产品经理"}

    Create a new script, preprocess.py, for further preprocessing. (Note that this script expects richer fields (job_demand, job_duty, sum_request) than the minimal item defined above; it was evidently written against a later iteration of the spider.)

    #!/usr/bin/env python
    # coding=utf-8
    
    import simplejson as json
    import re
    import sys,codecs
    from collections import defaultdict
    reload(sys)
    sys.setdefaultencoding('utf-8')
    from simhash import Simhash
    
    
    
    def get_top_jobname(jobname,namelist):
        namelist = sorted(namelist)
        dis = [ (Simhash(jobname).distance(Simhash(other)),other) for other in namelist ]
        dis = sorted(dis,key=lambda x:x[0])
        return dis[0]
    
    
    def clean_text(fname='./lagou_jd.json'):
        SPLIT_LINE = re.compile(u'[;;。\n]')
        FILTER_DEMAND = re.compile(u'薪酬|待遇|福利|加入我们|职责|你|成为')
        res = defaultdict(str)
       # fw1 = codecs.open('demands.txt','w','utf-8')
       # fw2 = codecs.open('duty.txt','w','utf-8')
        i=1
        for line in codecs.open(fname):
            jd = json.loads(line)
            if not re.match(u'\d', jd['job_demand'].strip()) or len(jd["job_demand"])<8 or len(jd["job_title"])<2: continue
            if FILTER_DEMAND.search(jd['job_demand']):continue
            
            if len(res.keys())>0:
                top_job = get_top_jobname(jd['job_title'],res.keys())
            else:
                top_job = tuple((0,jd['job_title']))
    
            if top_job[0] < 7:  # if the simhash distance is below 7, merge into the same job title
                if top_job[0]>4:
                    print top_job[0],top_job[1],jd['job_title']
                jd['job_title'] =  top_job[1]
    
    
            jd["job_demand"] = re.sub(ur"xa0","",jd["job_demand"].decode('utf-8'))
          # jd["job_duty"] = re.sub(ur"xa0","",jd["job_duty"].decode('utf-8'))
            jd["sum_request"] = re.sub(ur"xa0|s+"," ",jd["sum_request"].decode('utf-8'))
    
            demand = [ x.strip() for x in jd['job_demand'].split() if len(x)>5]
            if len(demand)<3: continue
           # duty = [x.strip() for x in jd['job_duty'].split() if len(x)>5]
            sum_request = [ x.strip() for x in jd['sum_request'].split() if len(x)>3 ]
            
    
            jd['job_demand'] = '\n'.join(demand)
        #   jd['job_duty'] = '\n'.join(duty)

        #    fw1.write('\n'.join(demand)+'\n')
        #    fw2.write('\n'.join(duty)+'\n')

    
            if not res.has_key(jd["job_title"]):
                res[jd["job_title"]] = ' '.join(sum_request)+'\n'+jd["job_demand"].strip()
            else:
                res[jd['job_title']] += '\n'+'\n'.join(SPLIT_LINE.split(jd['job_demand']))
            i += 1
            if i%100==0:
                print i
        print i,"done"
        print len(res)
        json.dump(res,codecs.open('./lagou_jd_clean.json','w','utf-8'))
    
    
    def get_jds(fname='./lagou_jd_clean.json'):
        res = json.load(codecs.open(fname))
        i = 1
        for k,v in res.iteritems():
            if len(v.split())<16:
                print i,k
                print v
                print "
    ============
    "
                i += 1
                if i>20:
                    break
    
    
    if __name__ == "__main__":
        clean_text()
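
    For intuition, get_top_jobname simply picks the existing title with the smallest Simhash distance; a minimal sketch of that merging behaviour (the titles are made up):

    # -*- coding: utf-8 -*-
    from simhash import Simhash

    seen = [u'产品经理', u'Python开发工程师']
    for title in [u'高级产品经理', u'UI设计师']:
        dis = sorted((Simhash(title).distance(Simhash(s)), s) for s in seen)
        print title, dis[0]  # (distance, closest existing title); distances below 7 get merged above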
    A small step every day is a big step in life! Good luck~
  • Original article: https://www.cnblogs.com/jkmiao/p/4831843.html