  • Storing scraped data in a MySQL database with Scrapy

    items.py
    
    import scrapy


    class InsistItem(scrapy.Item):
        # one scraped job posting
        positionname = scrapy.Field()  # job title
        type = scrapy.Field()          # business group (BG) name
        place = scrapy.Field()         # work location
        mian = scrapy.Field()          # job category
        time = scrapy.Field()          # last update time
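
    An Item works like a dict whose keys are restricted to the declared fields; assigning to an undeclared key raises a KeyError. A quick sketch (the values are made up):

    item = InsistItem()
    item['positionname'] = 'Backend Engineer'  # a declared field, so this works
    print(dict(item))                          # {'positionname': 'Backend Engineer'}
    # item['salary'] = 10000                   # would raise KeyError (undeclared field)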
    
    pipelines.py
    
    import pymysql


    class InsistPipeline(object):
        def __init__(self):
            # connection parameters are placeholders; adjust them to your environment
            self.db = pymysql.connect(host='localhost', user='dsuser',
                                      passwd='badpassword', db='dsdb',
                                      charset='utf8', port=3306)
            self.cur = self.db.cursor()

        def process_item(self, item, spider):
            # parameterized INSERT; pymysql handles quoting and escaping
            sql = 'INSERT INTO job(name, type, place, mian, time) VALUES(%s, %s, %s, %s, %s)'
            self.cur.execute(sql, (item['positionname'], item['type'],
                                   item['place'], item['mian'], item['time']))
            self.db.commit()  # commit once per item
            return item

        def close_spider(self, spider):
            self.cur.close()
            self.db.close()
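
    The INSERT in process_item assumes a job table already exists in dsdb. A one-time setup sketch; the column types and lengths here are assumptions, so adjust them to your data:

    import pymysql

    # run once before crawling; connection parameters match the pipeline above
    db = pymysql.connect(host='localhost', user='dsuser', passwd='badpassword',
                         db='dsdb', charset='utf8', port=3306)
    with db.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS job (
                id    INT AUTO_INCREMENT PRIMARY KEY,
                name  VARCHAR(255),  -- item['positionname']
                type  VARCHAR(100),  -- item['type'], the BG name
                place VARCHAR(100),  -- item['place']
                mian  VARCHAR(100),  -- item['mian'], the category
                time  VARCHAR(50)    -- item['time'], kept as text
            ) DEFAULT CHARSET=utf8
        """)
    db.close()

    The pipeline also has to be enabled in the project's settings.py, or Scrapy never calls it:

    # settings.py
    ITEM_PIPELINES = {
        'insist.pipelines.InsistPipeline': 300,  # lower number = earlier in the chain
    }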
    
    insists.py
    # the spider
    import scrapy
    from insist.items import InsistItem
    import json
    class InsistsSpider(scrapy.Spider):
        name = 'insists'
        allowed_domains = ['careers.tencent.com']
        # start_urls = ['https://careers.tencent.com/search.html?index=']
        # the search page is rendered from this JSON API, so request the API directly
        baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
        offset = 1
        start_urls = [baseURL + str(offset)]

        def parse(self, response):
            contents = json.loads(response.text)
            jobs = contents['Data']['Posts']
            for job in jobs:
                item = InsistItem()  # a fresh item per posting, not one shared instance
                item['positionname'] = job['RecruitPostName']
                item['type'] = job['BGName']
                item['place'] = job['LocationName']
                item['mian'] = job['CategoryName']
                item['time'] = job['LastUpdateTime']
                yield item  # hand the item to the pipeline, then keep looping
            # request the next page until six pages have been crawled
            if self.offset <= 5:
                self.offset += 1
                yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
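
    Run the crawl from the project root with scrapy crawl insists, or drive it from a small launcher script. A sketch, assuming the layout above (the module path insist.spiders.insists is an assumption based on the file names):

    # run.py -- hypothetical launcher at the project root
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from insist.spiders.insists import InsistsSpider  # assumed module path

    process = CrawlerProcess(get_project_settings())  # picks up ITEM_PIPELINES
    process.crawl(InsistsSpider)
    process.start()  # blocks until the crawl finishes

    Afterwards, SELECT COUNT(*) FROM job; in the MySQL client should report roughly 60 rows (6 pages x 10 posts), confirming the pipeline wrote the data.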
