zoukankan      html  css  js  c++  java
  • 腾讯招聘网站爬虫

    1. scrapy startproject QqSpider
    2. cd QqSpider
    3. cd QqSpider
    4. ls
    5. vi items.py
    6.   1 # -*- coding: utf-8 -*-
        2 
        3 # Define here the models for your scraped items
        4 #
        5 # See documentation in:
        6 # http://doc.scrapy.org/en/latest/topics/items.html
        7 
        8 import scrapy
        9 
       10 
       11 class QqspiderItem(scrapy.Item):
       12     # define the fields for your item here like:
       13     positionName = scrapy.Field()
       14     positionLink = scrapy.Field()
       15     positionType = scrapy.Field()
       16     peopleNum = scrapy.Field()
       17     workLocation = scrapy.Field()
       18     publishTime = scrapy.Field()
      ~                                          

      7.vi  pipelines.py

    7.   1 # -*- coding: utf-8 -*-
        2 
        3 # Define your item pipelines here
        4 #
        5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
        6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
        7 
        8 import json
        9 class QqspiderPipeline(object):
       10 
       11 
       12     def __init__(self):
       13         self.filename = open("tencent.json", "w")
       14 
       15     def process_item(self, item, spider):
       16         text = json.dumps(dict(item), ensure_ascii = False)
       17         self.filename.write(text.encode("utf-8"))
       18         return item
       19 
       20     def close_spider(self, spider):
       21         self.filename.close()
       22 

      8.vi settings.py

    8.  42 DEFAULT_REQUEST_HEADERS = {
       43     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
       44     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       45 #   'Accept-Language': 'en',
       46 }

      68 ITEM_PIPELINES = {
       69     'QqSpider.pipelines.QqspiderPipeline': 300,
       70 }



      9.cd spiders

    9. scrapy genspider -t crawl qqcent tencent.com
    10. vi qqcent.py
    11.  1 # -*- coding: utf-8 -*-
        2 import scrapy
        3 from scrapy.linkextractors import LinkExtractor
        4 from scrapy.spiders import CrawlSpider, Rule
        5 from QqSpider.items import QqspiderItem
        6 
        1 # -*- coding: utf-8 -*-
        2 import scrapy
        3 from scrapy.linkextractors import LinkExtractor
        4 from scrapy.spiders import CrawlSpider, Rule
        5 from QqSpider.items import QqspiderItem
        6 
        7 class QqcentSpider(CrawlSpider):
        8     name = 'qqcent'
        9     allowed_domains = ['tencent.com']
       10     start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
       11     pagelink = LinkExtractor(allow=("start=d+"))
       12     rules = [
       13         Rule(pagelink, callback='parseQqcent', follow=True)
       14     ]
       15 
       16     def parseQqcent(self, response):
       17         #i = {}
       18         #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
       19         #i['name'] = response.xpath('//div[@id="name"]').extract()
       20         #i['description'] = response.xpath('//div[@id="description"]').extract()
       21 
       22         for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
       23             item = QqspiderItem()
       24 
       25             item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
       26             item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
       27 
       28             item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
       29 
       30             item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
       31 
       32             item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
       33 
       34             item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
       35 
       36             yield item

      12 scrapy crawl qqcent

    12. vi  tencent.json
  • 相关阅读:
    C#入门分享(三)——C#常量与变量
    C#入门分享(二)——Visual Studio的应用与C#基本语法
    使用Axure设计基于中继器的左侧导航菜单
    关于C#调用非托管DLL,报“内存已损坏的”坑,坑,坑
    SQLserver 查询某个表的字段及字段属性
    关于数据库新建用户提示“用户、组或角色‘’XXX‘’在当前数据库中已已存在”的解决办法
    (转载)sqlserver2008”备份集中的数据库备份与现有的XX数据库不同”解决办法
    sql server 查询表字段的说明备注信息
    (转)软件产品化,对客户意味着什么?
    (转)软件产品化,国内IT人之痛
  • 原文地址:https://www.cnblogs.com/hizf/p/7834525.html
Copyright © 2011-2022 走看看