  • Scrapy in practice 2: distributed crawling of Lagou job postings (now using the free fake-useragent library to rotate User-Agent strings at random; a quick standalone check follows, and usage docs are at https://github.com/hellysmile/fake-useragent)
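    A quick standalone check of fake-useragent (a minimal sketch, assuming the library is installed via pip install fake-useragent):

    from fake_useragent import UserAgent

    ua = UserAgent()      # builds and caches the User-Agent database on first use
    print(ua.random)      # a random real-browser User-Agent string
    print(ua.chrome)      # or a random UA from a specific browser family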

    items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LagouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # id (MD5 of the job detail-page URL)
    obj_id = scrapy.Field()
    # position name
    positon_name = scrapy.Field()
    # work location
    work_place = scrapy.Field()
    # publish date
    publish_time = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # work experience
    work_experience = scrapy.Field()
    # education
    education = scrapy.Field()
    # full_time
    full_time = scrapy.Field()
    # tags
    tags = scrapy.Field()
    # company name
    company_name = scrapy.Field()
    # # industry
    # industry = scrapy.Field()
    # job perks
    job_temptation = scrapy.Field()
    # job description
    job_desc = scrapy.Field()
    # company logo URL
    logo_image = scrapy.Field()
    # business field
    field = scrapy.Field()
    # development stage
    stage = scrapy.Field()
    # company size
    company_size = scrapy.Field()
    # company homepage
    home = scrapy.Field()
    # job publisher
    job_publisher = scrapy.Field()
    # investors
    financeOrg = scrapy.Field()
    # crawl time
    crawl_time = scrapy.Field()

    spiders>lagou.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from LaGou.items import LagouItem
from LaGou.utils.MD5 import get_md5
from datetime import datetime


class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['lagou.com']
    start_urls = ['https://www.lagou.com/zhaopin/']
    # job detail pages and pagination links
    content_links = LinkExtractor(allow=(r"https://www.lagou.com/jobs/\d+\.html"))
    page_links = LinkExtractor(allow=(r"https://www.lagou.com/zhaopin/\d+"))
    rules = (
        Rule(content_links, callback="parse_item", follow=False),
        Rule(page_links, follow=True),
    )

    def parse_item(self, response):
        item = LagouItem()
        # use the MD5 of the job detail-page URL as the ID
        item["obj_id"] = get_md5(response.url)
        # company name
        item["company_name"] = response.xpath('//dl[@class="job_company"]//a/img/@alt').extract()[0]
        # position
        item["positon_name"] = response.xpath('//div[@class="job-name"]//span[@class="name"]/text()').extract()[0]
        # salary
        item["salary"] = response.xpath('//dd[@class="job_request"]//span[1]/text()').extract()[0]
        # work location
        work_place = response.xpath('//dd[@class="job_request"]//span[2]/text()').extract()[0]
        item["work_place"] = work_place.replace("/", "")
        # work experience
        work_experience = response.xpath('//dd[@class="job_request"]//span[3]/text()').extract()[0]
        item["work_experience"] = work_experience.replace("/", "")
        # education
        education = response.xpath('//dd[@class="job_request"]//span[4]/text()').extract()[0]
        item["education"] = education.replace("/", "")
        # full_time
        item['full_time'] = response.xpath('//dd[@class="job_request"]//span[5]/text()').extract()[0]
        # tags
        tags = response.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()').extract()
        item["tags"] = ",".join(tags)
        # publish_time
        item["publish_time"] = response.xpath('//dd[@class="job_request"]//p[@class="publish_time"]/text()').extract()[0]
        # job perks
        job_temptation = response.xpath('//dd[@class="job-advantage"]/p/text()').extract()
        item["job_temptation"] = ",".join(job_temptation)
        # job description
        job_desc = response.xpath('//dd[@class="job_bt"]/div//p/text()').extract()
        item["job_desc"] = ",".join(job_desc).replace("\xa0", "").strip()
        # job_publisher
        item["job_publisher"] = response.xpath('//div[@class="publisher_name"]//span[@class="name"]/text()').extract()[0]
        # company logo URL
        logo_image = response.xpath('//dl[@class="job_company"]//a/img/@src').extract()[0]
        item["logo_image"] = logo_image.replace("//", "")
        # business field
        field = response.xpath('//ul[@class="c_feature"]//li[1]/text()').extract()
        item["field"] = "".join(field).strip()
        # development stage
        stage = response.xpath('//ul[@class="c_feature"]//li[2]/text()').extract()
        item["stage"] = "".join(stage).strip()
        # investors
        financeOrg = response.xpath('//ul[@class="c_feature"]//li[3]/p/text()').extract()
        if financeOrg:
            item["financeOrg"] = "".join(financeOrg)
        else:
            item["financeOrg"] = ""
        # company size (the feature list shifts by one entry when there is no investor item)
        if financeOrg:
            company_size = response.xpath('//ul[@class="c_feature"]//li[4]/text()').extract()
            item["company_size"] = "".join(company_size).strip()
        else:
            company_size = response.xpath('//ul[@class="c_feature"]//li[3]/text()').extract()
            item["company_size"] = "".join(company_size).strip()
        # company homepage
        item["home"] = response.xpath('//ul[@class="c_feature"]//li/a/@href').extract()[0]
        # crawl time
        item["crawl_time"] = datetime.now()

        yield item
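    The spider imports get_md5 from LaGou.utils.MD5, a helper module of the project that is not shown in this post. A minimal sketch of what that helper is assumed to do (hash the detail-page URL into a fixed-length hex string used as obj_id):

    # LaGou/utils/MD5.py -- assumed implementation of the helper imported above
    import hashlib

    def get_md5(url):
        # MD5 operates on bytes, so encode str input first
        if isinstance(url, str):
            url = url.encode("utf-8")
        m = hashlib.md5()
        m.update(url)
        return m.hexdigest()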

    pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymysql


class LagouPipeline(object):

    def process_item(self, item, spider):
        con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801", db="lagou", charset="utf8")
        cur = con.cursor()
        sql = ("insert into lagouwang(obj_id,company_name,positon_name,salary,work_place,work_experience,education,full_time,tags,publish_time,job_temptation,job_desc,job_publisher,logo_image,field,stage,financeOrg,company_size,home,crawl_time) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        lis = (item["obj_id"], item["company_name"], item["positon_name"], item["salary"], item["work_place"], item["work_experience"], item["education"], item['full_time'], item["tags"], item["publish_time"], item["job_temptation"], item["job_desc"], item["job_publisher"], item["logo_image"], item["field"], item["stage"], item["financeOrg"], item["company_size"], item["home"], item["crawl_time"])
        cur.execute(sql, lis)
        con.commit()
        cur.close()
        con.close()

        return item
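    This pipeline opens and closes a MySQL connection for every single item, which gets costly as the crawl grows. A hedged alternative, not part of the original code, reuses one connection for the whole run via Scrapy's open_spider/close_spider hooks (same credentials and table as above; only a few columns shown, the rest follow the same pattern):

    import pymysql

    class LagouMySQLPipeline(object):

        def open_spider(self, spider):
            # one connection for the whole crawl instead of one per item
            self.con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801",
                                       db="lagou", charset="utf8")
            self.cur = self.con.cursor()

        def process_item(self, item, spider):
            sql = ("insert into lagouwang(obj_id,company_name,positon_name,salary) "
                   "VALUES (%s,%s,%s,%s)")
            self.cur.execute(sql, (item["obj_id"], item["company_name"],
                                   item["positon_name"], item["salary"]))
            self.con.commit()
            return item

        def close_spider(self, spider):
            self.cur.close()
            self.con.close()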

    middlewares.py

from scrapy import signals
import random
# from LaGou.settings import USER_AGENTS   # earlier approach: hand-written UA list in settings
from fake_useragent import UserAgent


class RandomUserAgent(object):

    def __init__(self):
        # build the fake-useragent database once, not on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # useragent = random.choice(USER_AGENTS)   # earlier approach
        request.headers.setdefault("User-Agent", self.ua.random)

    settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for LaGou project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'LaGou'

SPIDER_MODULES = ['LaGou.spiders']
NEWSPIDER_MODULE = 'LaGou.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'LaGou (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# USER_AGENTS = [
#     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
#     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
#     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
#     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
#     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
#     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
# ]

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'LaGou.middlewares.LagouSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'LaGou.middlewares.RandomUserAgent': 1,
#    'LaGou.middlewares.MyCustomDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
    #'LaGou.pipelines.LagouPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
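    scrapy_redis falls back to a Redis server on localhost:6379 when nothing else is configured. For an actually distributed crawl, every worker has to point at the same Redis instance; a sketch of the extra settings (host and port here are placeholders, adjust to your environment):

    # Point every worker at the shared Redis server (placeholder values)
    REDIS_HOST = "127.0.0.1"
    REDIS_PORT = 6379
    # or, as a single URL:
    # REDIS_URL = "redis://127.0.0.1:6379"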

    Redis data:

    MySQL data:
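    With the settings above only scrapy_redis.pipelines.RedisPipeline is enabled, so items first land in Redis (by default in the list lagou:items, following the %(spider)s:items naming) rather than in MySQL. One common way to move them into the lagouwang table afterwards is a small standalone script; a hedged sketch, assuming the default key name and the same credentials as pipelines.py:

    # process_items.py -- hypothetical helper, not part of the original post
    import json
    import pymysql
    import redis

    r = redis.StrictRedis(host="127.0.0.1", port=6379)
    con = pymysql.connect(host="127.0.0.1", user="root", passwd="229801",
                          db="lagou", charset="utf8")
    cur = con.cursor()

    while True:
        # BLPOP blocks until an item is available or the timeout expires
        data = r.blpop("lagou:items", timeout=30)
        if data is None:
            break
        item = json.loads(data[1].decode("utf-8"))
        # only a few columns shown; the rest follow the full INSERT in pipelines.py
        cur.execute("insert into lagouwang(obj_id,company_name,positon_name,salary) "
                    "VALUES (%s,%s,%s,%s)",
                    (item["obj_id"], item["company_name"],
                     item["positon_name"], item["salary"]))
        con.commit()

    cur.close()
    con.close()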

    Disclaimer: the above is for reference, learning, and exchange only! More at: https://github.com/huwei86/spiderlagou

  • Original article: https://www.cnblogs.com/huwei934/p/6985478.html