  • Scrapy in practice 2: distributed crawling of Lagou job postings (with the free random User-Agent library fake-useragent; for usage see https://github.com/hellysmile/fake-useragent)
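    The downloader middleware further down pulls its User-Agent strings from fake-useragent at runtime. As a quick orientation (this snippet is not part of the original project code), this is roughly how the library is used; ua.random is the attribute the middleware relies on:

from fake_useragent import UserAgent

ua = UserAgent()      # fetches and caches a pool of real browser User-Agent strings
print(ua.random)      # a random User-Agent string from the pool
print(ua.chrome)      # a random Chrome User-Agent string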

    items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # id
    obj_id = scrapy.Field()
    # position name
    positon_name = scrapy.Field()
    # work location
    work_place = scrapy.Field()
    # publish date
    publish_time = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # work experience
    work_experience = scrapy.Field()
    # education
    education = scrapy.Field()
    # full_time
    full_time = scrapy.Field()
    # tags
    tags = scrapy.Field()
    # company name
    company_name = scrapy.Field()
    # # industry
    # industry = scrapy.Field()
    # job perks
    job_temptation = scrapy.Field()
    # job description
    job_desc = scrapy.Field()
    # company logo URL
    logo_image = scrapy.Field()
    # business field
    field = scrapy.Field()
    # development stage
    stage = scrapy.Field()
    # company size
    company_size = scrapy.Field()
    # company homepage
    home = scrapy.Field()
    # job publisher
    job_publisher = scrapy.Field()
    # investors
    financeOrg = scrapy.Field()
    # crawl time
    crawl_time = scrapy.Field()

    spiders/lagou.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from LaGou.items import LagouItem
from LaGou.utils.MD5 import get_md5
from datetime import datetime


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/zhaopin/']
    content_links = LinkExtractor(allow=(r"https://www.lagou.com/jobs/\d+.html"))
    page_links = LinkExtractor(allow=(r"https://www.lagou.com/zhaopin/\d+"))
    rules = (
        Rule(content_links, callback="parse_item", follow=False),
        Rule(page_links, follow=True)
    )

    def parse_item(self, response):
        item = LagouItem()
        # use the MD5 of the job page URL as the ID
        item["obj_id"] = get_md5(response.url)
        # company name
        item["company_name"] = response.xpath('//dl[@class="job_company"]//a/img/@alt').extract()[0]
        # position
        item["positon_name"] = response.xpath('//div[@class="job-name"]//span[@class="name"]/text()').extract()[0]
        # salary
        item["salary"] = response.xpath('//dd[@class="job_request"]//span[1]/text()').extract()[0]
        # work location
        work_place = response.xpath('//dd[@class="job_request"]//span[2]/text()').extract()[0]
        item["work_place"] = work_place.replace("/", "")
        # work experience
        work_experience = response.xpath('//dd[@class="job_request"]//span[3]/text()').extract()[0]
        item["work_experience"] = work_experience.replace("/", "")
        # education
        education = response.xpath('//dd[@class="job_request"]//span[4]/text()').extract()[0]
        item["education"] = education.replace("/", "")
        # full_time
        item['full_time'] = response.xpath('//dd[@class="job_request"]//span[5]/text()').extract()[0]
        # tags
        tags = response.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()').extract()
        item["tags"] = ",".join(tags)
        # publish_time
        item["publish_time"] = response.xpath('//dd[@class="job_request"]//p[@class="publish_time"]/text()').extract()[0]
        # job perks
        job_temptation = response.xpath('//dd[@class="job-advantage"]/p/text()').extract()
        item["job_temptation"] = ",".join(job_temptation)
        # job description
        job_desc = response.xpath('//dd[@class="job_bt"]/div//p/text()').extract()
        item["job_desc"] = ",".join(job_desc).replace("\xa0", "").strip()
        # job publisher
        item["job_publisher"] = response.xpath('//div[@class="publisher_name"]//span[@class="name"]/text()').extract()[0]
        # company logo URL
        logo_image = response.xpath('//dl[@class="job_company"]//a/img/@src').extract()[0]
        item["logo_image"] = logo_image.replace("//", "")
        # business field
        field = response.xpath('//ul[@class="c_feature"]//li[1]/text()').extract()
        item["field"] = "".join(field).strip()
        # development stage
        stage = response.xpath('//ul[@class="c_feature"]//li[2]/text()').extract()
        item["stage"] = "".join(stage).strip()
        # investors
        financeOrg = response.xpath('//ul[@class="c_feature"]//li[3]/p/text()').extract()
        if financeOrg:
            item["financeOrg"] = "".join(financeOrg)
        else:
            item["financeOrg"] = ""
        # company size (the feature list shifts by one entry when there is no investor row)
        if financeOrg:
            company_size = response.xpath('//ul[@class="c_feature"]//li[4]/text()').extract()
            item["company_size"] = "".join(company_size).strip()
        else:
            company_size = response.xpath('//ul[@class="c_feature"]//li[3]/text()').extract()
            item["company_size"] = "".join(company_size).strip()
        # company homepage
        item["home"] = response.xpath('//ul[@class="c_feature"]//li/a/@href').extract()[0]
        # crawl time
        item["crawl_time"] = datetime.now()

        yield item
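    The spider imports get_md5 from LaGou.utils.MD5, a small project-local helper that the post does not show. A minimal sketch of what such a helper usually looks like, assuming it simply hashes the URL (the module path and exact behaviour are inferred from how it is called above):

# LaGou/utils/MD5.py -- assumed contents, not shown in the original post
import hashlib


def get_md5(url):
    """Return the hex MD5 digest of a URL, used as a stable obj_id."""
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()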

    pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class LagouPipeline(object):

    def process_item(self, item, spider):
        # note: a new MySQL connection is opened and closed for every item
        con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801", db="lagou", charset="utf8")
        cur = con.cursor()
        sql = ("insert into lagouwang(obj_id,company_name,positon_name,salary,work_place,work_experience,education,full_time,tags,publish_time,job_temptation,job_desc,job_publisher,logo_image,field,stage,financeOrg,company_size,home,crawl_time) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (item["obj_id"], item["company_name"], item["positon_name"], item["salary"], item["work_place"], item["work_experience"], item["education"], item['full_time'], item["tags"], item["publish_time"], item["job_temptation"], item["job_desc"], item["job_publisher"], item["logo_image"], item["field"], item["stage"], item["financeOrg"], item["company_size"], item["home"], item["crawl_time"])
        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()

        return item
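    LagouPipeline above opens and closes a MySQL connection for every item, which gets expensive on a long crawl. A sketch of an alternative that connects once per spider run via Scrapy's open_spider/close_spider hooks (the class name LagouMySQLPipeline and this restructuring are mine, not the author's; the table and columns are the same as above):

import pymysql


class LagouMySQLPipeline(object):
    """Variant of LagouPipeline that keeps a single MySQL connection open."""

    def open_spider(self, spider):
        # connect once when the spider starts
        self.con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801",
                                   db="lagou", charset="utf8")
        self.cur = self.con.cursor()

    def close_spider(self, spider):
        # clean up when the spider finishes
        self.cur.close()
        self.con.close()

    def process_item(self, item, spider):
        fields = ["obj_id", "company_name", "positon_name", "salary", "work_place",
                  "work_experience", "education", "full_time", "tags", "publish_time",
                  "job_temptation", "job_desc", "job_publisher", "logo_image", "field",
                  "stage", "financeOrg", "company_size", "home", "crawl_time"]
        sql = ("insert into lagouwang(" + ",".join(fields) + ") "
               "VALUES (" + ",".join(["%s"] * len(fields)) + ")")
        self.cur.execute(sql, [item[f] for f in fields])
        self.con.commit()
        return item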

    middlewares.py

from scrapy import signals
import random
# from LaGou.settings import USER_AGENTS
from fake_useragent import UserAgent


class RandomUserAgent(object):
    """Downloader middleware that sets a random User-Agent on every request."""

    def __init__(self):
        # build the fake_useragent lookup once, not on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # previously a static pool was used: random.choice(USER_AGENTS)
        request.headers.setdefault("User-Agent", self.ua.random)

    settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for LaGou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'LaGou'

SPIDER_MODULES = ['LaGou.spiders']
NEWSPIDER_MODULE = 'LaGou.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'LaGou (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Static User-Agent pool, kept commented out as a fallback; fake_useragent is used instead
# USER_AGENTS = [
#     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
#     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
#     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
#     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
#     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
#    ]

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# scrapy_redis: shared dupefilter and scheduler so several crawler instances
# can work off one Redis request queue and deduplicate against each other
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'LaGou.middlewares.LagouSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'LaGou.middlewares.RandomUserAgent': 1,
#    'LaGou.middlewares.MyCustomDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    #'LaGou.pipelines.LagouPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
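    The settings above hand the dupefilter, scheduler and item pipeline over to scrapy_redis, but leave the Redis connection at its default (localhost:6379). When the crawl is spread over several machines, each instance normally needs to point at the same Redis server; a sketch of the extra settings scrapy_redis reads for that (the host value is a placeholder):

# additional settings.py entries for a shared Redis server (values are placeholders)
REDIS_HOST = '192.168.1.100'   # the Redis instance every crawler connects to
REDIS_PORT = 6379
# or equivalently as a single URL:
# REDIS_URL = 'redis://192.168.1.100:6379'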

    Redis data:

    MySQL data:
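    With scrapy_redis.pipelines.RedisPipeline enabled, crawled items are serialized to JSON and pushed into a Redis list (by default named "<spider>:items", i.e. "lagou:items" here) instead of being written to MySQL directly. The MySQL table can then be filled by a separate transfer script; the original post does not show this step, so the following is only a sketch with placeholder connection values:

# standalone script: pop items pushed by RedisPipeline and insert them into MySQL
import json

import pymysql
import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)
con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801", db="lagou", charset="utf8")
cur = con.cursor()

fields = ["obj_id", "company_name", "positon_name", "salary", "work_place",
          "work_experience", "education", "full_time", "tags", "publish_time",
          "job_temptation", "job_desc", "job_publisher", "logo_image", "field",
          "stage", "financeOrg", "company_size", "home", "crawl_time"]
sql = ("insert into lagouwang(" + ",".join(fields) + ") "
       "VALUES (" + ",".join(["%s"] * len(fields)) + ")")

while True:
    # blocking pop: waits until the spider pushes a new item
    _, data = r.blpop("lagou:items")
    item = json.loads(data)
    cur.execute(sql, [item.get(f, "") for f in fields])
    con.commit()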

    Disclaimer: the above is for reference, study, and exchange only! More at: https://github.com/huwei86/spiderlagou

  • Original article: https://www.cnblogs.com/huwei934/p/6985478.html