zoukankan      html  css  js  c++  java
  • scrapy将爬取的数据存入MySQL数据库

    items.py
    
    import scrapy
    
    
    class InsistItem(scrapy.Item):
        """One job posting scraped from the careers API.

        Every attribute is a ``scrapy.Field`` declaration; the spider in this
        project fills them from the JSON keys noted beside each field.
        """

        # Filled from the posting's 'RecruitPostName' key.
        positionname = scrapy.Field()
        # Filled from the posting's 'BGName' key.
        type = scrapy.Field()
        # Filled from the posting's 'LocationName' key.
        place = scrapy.Field()
        # Filled from the posting's 'CategoryName' key.
        mian = scrapy.Field()
        # Filled from the posting's 'LastUpdateTime' key.
        time = scrapy.Field()
    
    pipelines.py
    
    import json
    import scrapy
    import pymysql
    from scrapy.pipelines.images import ImagesPipeline
    class InsistPipeline(object):
        """Item pipeline that inserts each scraped job posting into the MySQL ``job`` table.

        A single connection and cursor are opened when the pipeline is
        constructed and released in :meth:`close_spider`.
        """

        # Parameterised statement: the driver binds and escapes the values.
        _INSERT_SQL = 'INSERT INTO job(name,type,place,mian,time) VALUES(%s,%s,%s,%s,%s) '

        def __init__(self):
            """Open the database connection and create the cursor used for inserts."""
            # NOTE(review): host and credentials are hard-coded here; consider
            # reading them from the Scrapy settings instead.
            # ``password``/``database`` are the canonical PyMySQL keyword names
            # (``passwd``/``db`` are deprecated aliases).
            self.db = pymysql.connect(
                host='localhost',
                user='dsuser',
                password='badpassword',
                database='dsdb',
                charset='utf8',
                port=3306,
            )
            self.cur = self.db.cursor()

        def process_item(self, item, spider):
            """Insert one item as a row and commit it.

            Args:
                item: mapping providing the keys ``positionname``, ``type``,
                    ``place``, ``mian`` and ``time``.
                spider: the spider that produced the item (unused).

            Returns:
                The same ``item``, so later pipelines can keep processing it.

            Raises:
                Exception: whatever the driver raised while executing or
                    committing; the transaction is rolled back first.
            """
            row = (
                item['positionname'],
                item['type'],
                item['place'],
                item['mian'],
                item['time'],
            )
            try:
                self.cur.execute(self._INSERT_SQL, row)
                self.db.commit()
            except Exception:
                # Undo the partial transaction so the shared connection stays
                # usable for the next item, then let the caller see the error.
                self.db.rollback()
                raise
            return item

        def close_spider(self, spider):
            """Release the cursor and the connection when the spider finishes."""
            try:
                self.cur.close()
            finally:
                # Close the connection even if closing the cursor failed.
                self.db.close()
    
    insists.py
    #爬虫程序
    import scrapy
    from insist.items import InsistItem
    import json
    class InsistsSpider(scrapy.Spider):
        """Spider that reads job postings from the careers JSON API, page by page.

        Each response is parsed as JSON; one ``InsistItem`` is yielded per entry
        in ``Data.Posts`` and then the next page is requested until the page
        counter passes its limit.
        """

        name = 'insists'
        allowed_domains = ['careers.tencent.com']
        # Query endpoint; the page index is appended to this prefix.
        baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
        # Index of the most recently requested page.
        offset = 1
        start_urls = [baseURL + str(offset)]

        def parse(self, response):
            """Yield one item per posting in the response, then request the next page.

            Args:
                response: response whose ``text`` is a JSON document containing
                    a ``Data`` object with a ``Posts`` list.

            Yields:
                ``InsistItem`` objects, followed by at most one
                ``scrapy.Request`` for the following page.
            """
            contents = json.loads(response.text)
            # A null ``Posts`` value is treated as "no postings on this page".
            # NOTE(review): presumably what the API returns past the last page; confirm.
            jobs = contents['Data']['Posts'] or []
            for job in jobs:
                # Build a fresh item per posting: reusing one instance would make
                # every yielded item alias the same object, so later assignments
                # could overwrite data a pipeline has not finished with.
                item = InsistItem()
                item['positionname'] = job['RecruitPostName']
                item['type'] = job['BGName']
                item['place'] = job['LocationName']
                item['mian'] = job['CategoryName']
                item['time'] = job['LastUpdateTime']
                yield item  # the generator resumes here for the next posting

            # With ``offset`` starting at 1 this requests pages 2 through 6.
            if self.offset <= 5:
                self.offset += 1
                url = self.baseURL + str(self.offset)
                yield scrapy.Request(url, callback=self.parse)

  • 相关阅读:
    easyui多选与接收不一致解决方案
    PEP8规范总结
    python2与python3的区别
    双下方法(魔术方法内置方法)
    文件操作
    抽象类,接口类,归一化设计,多态,鸭子类型
    html
    数据库的初识
    body标签中相关标签
    爬虫介绍
  • 原文地址:https://www.cnblogs.com/persistence-ok/p/11647296.html
Copyright © 2011-2022 走看看