zoukankan      html  css  js  c++  java
  • 腾讯招聘网站爬虫

    1. scrapy startproject QqSpider
    2. cd QqSpider
    3. cd QqSpider
    4. ls
    5. vi items.py
    6.   1 # -*- coding: utf-8 -*-
        2 
        3 # Define here the models for your scraped items
        4 #
        5 # See documentation in:
        6 # http://doc.scrapy.org/en/latest/topics/items.html
        7 
        8 import scrapy
        9 
       10 
       11 class QqspiderItem(scrapy.Item):
       12     # define the fields for your item here like:
       13     positionName = scrapy.Field()
       14     positionLink = scrapy.Field()
       15     positionType = scrapy.Field()
       16     peopleNum = scrapy.Field()
       17     workLocation = scrapy.Field()
       18     publishTime = scrapy.Field()
      ~                                          

      7.vi  pipelines.py

    7.   1 # -*- coding: utf-8 -*-
        2 
        3 # Define your item pipelines here
        4 #
        5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
        6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
        7 
        8 import json
        9 class QqspiderPipeline(object):
       10 
       11 
       12     def __init__(self):
       13         self.filename = open("tencent.json", "w")
       14 
       15     def process_item(self, item, spider):
       16         text = json.dumps(dict(item), ensure_ascii = False)
       17         self.filename.write(text.encode("utf-8"))
       18         return item
       19 
       20     def close_spider(self, spider):
       21         self.filename.close()
       22 

      8.vi settings.py

    8.  42 DEFAULT_REQUEST_HEADERS = {
       43     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
       44     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       45 #   'Accept-Language': 'en',
       46 }

      68 ITEM_PIPELINES = {
       69     'QqSpider.pipelines.QqspiderPipeline': 300,
       70 }



      9.cd spiders

    9. scrapy genspider -t crawl qqcent tencent.com
    10. vi qqcent.py
    11.  1 # -*- coding: utf-8 -*-
        2 import scrapy
        3 from scrapy.linkextractors import LinkExtractor
        4 from scrapy.spiders import CrawlSpider, Rule
        5 from QqSpider.items import QqspiderItem
        6 
        1 # -*- coding: utf-8 -*-
        2 import scrapy
        3 from scrapy.linkextractors import LinkExtractor
        4 from scrapy.spiders import CrawlSpider, Rule
        5 from QqSpider.items import QqspiderItem
        6 
        7 class QqcentSpider(CrawlSpider):
        8     name = 'qqcent'
        9     allowed_domains = ['tencent.com']
       10     start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
       11     pagelink = LinkExtractor(allow=("start=d+"))
       12     rules = [
       13         Rule(pagelink, callback='parseQqcent', follow=True)
       14     ]
       15 
       16     def parseQqcent(self, response):
       17         #i = {}
       18         #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
       19         #i['name'] = response.xpath('//div[@id="name"]').extract()
       20         #i['description'] = response.xpath('//div[@id="description"]').extract()
       21 
       22         for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
       23             item = QqspiderItem()
       24 
       25             item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
       26             item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
       27 
       28             item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
       29 
       30             item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
       31 
       32             item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
       33 
       34             item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
       35 
       36             yield item

      12 scrapy crawl qqcent

    12. vi  tencent.json
  • 相关阅读:
    C#开发Activex控件疑难杂症
    spring、struts、mybatis、Postgresql集成使用存储过程进行分页
    C#开发Activex控件升级
    通过Maven将Web程序部署到远程Tomcat8服务器的一些注意事项
    分页存储过程Oracle版
    JSP EL表达式(转)
    关于Log4x
    C#类在初始化时的执行顺序
    使用MSMQ 远程队列
    tomcat部署与Context(转)
  • 原文地址:https://www.cnblogs.com/hizf/p/7834525.html
Copyright © 2011-2022 走看看