zoukankan      html  css  js  c++  java
  • scrapy框架

    #创建项目
    scrapy startproject demo
    
    #开一个爬虫项目
    cd demo 
    scrapy genspider first www.baidu.com
    
    #在 settings.py 中设置
    ROBOTSTXT_OBEY = False
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    
    #执行爬虫
    scrapy crawl first --nolog

    #持久化存储(简单)
    scrapy crawl first -o a.json


    #取值
    text = div.xpath('./div[1]//h2/text()')[0].extract()
    word = div.xpath('./a//span/text()').extract_first()
     1   #实例化一个item类型的对象
     2             item = BossproItem()
     3             #
     4             item['title'] = title
     5             item['salary'] = salary
     6             item['company'] = company
     7             
     8             #将item对象提交给管道进行持久化存储
     9             yield item
    10 
    11 
    12 #items.py 文件中
    13 class BossproItem(scrapy.Item):
    14     # define the fields for your item here like:
    15     # name = scrapy.Field()
    16     title = scrapy.Field()
    17     salary = scrapy.Field()
    18     company = scrapy.Field()
    19 
    20 
    21 #pipelines中
    22 class BossproPipeline(object):
    23     fp = None
    24     #只会被执行一次(开始爬虫的时候执行一次)
    25     def open_spider(self,spider):
    26         print('开始爬虫!!!')
    27         self.fp = open('./job.txt','w',encoding='utf-8')
    28     #爬虫文件每提交一次item,该方法会被调用一次
    29     def process_item(self, item, spider):
    30         self.fp.write(item['title']+"\t"+item['salary']+'\t'+item['company']+'\n')
    31         return item
    32     def close_spider(self,spider):
    33         print('爬虫结束!!!')
    34         self.fp.close()
    35 #注意:默认情况下,管道机制并没有开启,需要手动在配置文件(settings.py)中开启;ITEM_PIPELINES 的键必须是实际定义的管道类的完整路径(项目名.pipelines.类名)
    36 
    37 
    38 ITEM_PIPELINES = {
    39    'demo.pipelines.DemoPipeline': 300,
    40 }
    多页爬取数据

    page = 1
    page_model = 'https://www.zhipin.com/c101010100/?query=python&page=%d'
    
    
    class FirstSpider(scrapy.Spider):
        """Scrape Python job listings from zhipin.com over several result pages.

        Every ``<li>`` under the ``job-list`` div produces one ``DemoItem``
        carrying ``title``, ``salary`` and ``company``. After a page has been
        processed, the next page is requested with ``parse`` as its callback.
        """

        # Number of the most recently requested result page.
        page = 1
        # URL template; ``%d`` is replaced by the page number.
        page_model = 'https://www.zhipin.com/c101010100/?query=python&page=%d'
        name = 'first'
        # allowed_domains = ['www.baidu.com']
        start_urls = [
            'https://www.zhipin.com/c101010100/?query=python&page=1',
        ]

        def parse(self, response):
            """Yield one item per job entry, then a request for the next page.

            :param response: the downloaded result page.
            :return: generator of ``DemoItem`` objects followed by at most one
                ``scrapy.Request`` for the following page.
            """
            li_list = response.xpath('//div[@class="job-list"]/ul/li')

            for li in li_list:
                # extract_first() returns None when the node is missing, so
                # any of the three fields may be None.
                title = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/div/text()').extract_first()
                salary = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/span/text()').extract_first()
                company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()

                demoitem = DemoItem()
                demoitem['title'] = title
                demoitem['salary'] = salary
                demoitem['company'] = company

                # Hand the item over to the item pipelines.
                yield demoitem

            # Pagination runs once per page, after all of its items.
            # NOTE(review): the counter is incremented before the URL is built,
            # so with page starting at 1 this bound requests pages 2..6 (six
            # pages in total) -- confirm whether five pages were intended.
            if self.page <= 5:
                self.page += 1
                new_url = self.page_model % self.page
                yield scrapy.Request(url=new_url, callback=self.parse)
  • 相关阅读:
    c语言基础学习10_文件操作02
    c语言_文件操作_FILE结构体小解释
    初识 Swift编程语言(中文版)
    Jquery滑动门实现
    【一步一步走(1)】远程桌面软件VNC的安装与配置
    并查集 路径压缩(具体解释)
    linux中操作java进程
    HDOJ 3944 DP?
    选择排序与冒泡排序
    UVa145 Gondwanaland Telecom
  • 原文地址:https://www.cnblogs.com/ls1997/p/10862794.html
Copyright © 2011-2022 走看看