zoukankan      html  css  js  c++  java
  • Scrapy 实战 8:将数据异步写入 MySQL

    环境:python3

    爬取网址:腾讯社招(http://hr.tencent.com/position.php?keywords=&tid=0&start=0#a)总共2202条数据

    pipelines.py

     1 from twisted.enterprise import adbapi
     2 import pymysql
     3 import pymysql.cursors
     4 
     5 
     6 class MysqlTwistedPipeline(object):
     7     def __init__(self,dbpool):
     8         self.dbpool=dbpool
     9 
    10     @classmethod
    11     def from_settings(cls,settings):
    12         dbpool=adbapi.ConnectionPool("pymysql",host=settings["MYSQL_HOST"],db=settings["MYSQL_DBNAME"],user=settings["MYSQL_USER"],password=settings["MYSQL_PASSWORD"],charset="utf8", cursorclass=pymysql.cursors.DictCursor,
    13             use_unicode=True)
    14         return cls(dbpool)
    15 
    16     def process_item(self,item,spider):
    17         # 使用twisted将mysql插入变成异步执行
    18         self.dbpool.runInteraction(self.do_insert,item)
    19 
    20 
    21     def do_insert(self,cursor,item):
    22         # 执行具体的插入
    23         # 根据不同的item 构建不同的sql语句并插入到mysql中
    24         insert_sql, params = item.get_insert_sql()
    25         cursor.execute(insert_sql, params)

    items.py

     1 import scrapy
     2 
     3 
     4 class TencentItem(scrapy.Item):
     5    
     6     positionname=scrapy.Field()
     7     positionlink=scrapy.Field()
     8     positionType=scrapy.Field()
     9     positionNum=scrapy.Field()
    10     positionLocation=scrapy.Field()
    11     publishTime=scrapy.Field()
    12 
    13 
    14     def get_insert_sql(self):
    15         insert_sql="""
    16         insert into tencent(positionname,positionlink,positionType,positionNum,positionLocation,publishTime)
    17         VALUES (%s,%s,%s,%s,%s,%s)
    18         
    19         """
    20         params=(
    21             self['positionname'], self['positionlink'], self['positionType'], self['positionNum'],
    22             self['positionLocation'], self['publishTime']
    23         )
    24         return insert_sql,params
    25     

    settings.py

    BOT_NAME = 'tencent'

    SPIDER_MODULES = ['tencent.spiders']
    NEWSPIDER_MODULE = 'tencent.spiders'


    ROBOTSTXT_OBEY = False

    # The next three settings are only needed for distributed crawling;
    # they can be ignored otherwise.
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    SCHEDULER_PERSIST = True


    DOWNLOAD_DELAY = 2

    DEFAULT_REQUEST_HEADERS = {
      'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       #'Accept-Language': 'en',
    }


    ITEM_PIPELINES = {
        # RedisPipeline is only needed for distributed crawling; it can be
        # ignored otherwise.
        'scrapy_redis.pipelines.RedisPipeline': 400,
        'tencent.pipelines.MysqlTwistedPipeline': 300,
    }
    # IP of the distributed master host; only needed for distributed crawling.
    REDIS_HOST = '172.21.118.56'
    # Only needed for distributed crawling.
    REDIS_PORT = 6379


    MYSQL_HOST = "127.0.0.1"
    # Name of your own database.
    MYSQL_DBNAME = "tencent"
    # Placeholder: replace with your MySQL user name.
    MYSQL_USER = "usrername"
    # Placeholder: replace with your MySQL password.
    MYSQL_PASSWORD = "userpassword"

    spiders/Tencent.py

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import Rule
    from scrapy_redis.spiders import RedisCrawlSpider
    from tencent.items import TencentItem
    
    class TencentSpider(RedisCrawlSpider):
        """Crawl spider that follows pagination links and yields one item per table row."""

        name = "Tencent"
        allowed_domains = ["tencent.com"]
        redis_key = 'TencentSpider:start_urls'

        # Pagination links carry a numeric offset ("start=0", "start=10", ...).
        # The digit class needs its backslash: "start=d+" only matches a
        # literal letter "d" and therefore never matches these links.
        page_link = LinkExtractor(allow=(r"start=\d+"))

        rules = [
            Rule(page_link, callback="parseContent", follow=True)
        ]

        def parseContent(self, response):
            """Yield a ``TencentItem`` for every ``even``/``odd`` table row in the page.

            Args:
                response: the downloaded page; must support ``xpath``.

            Yields:
                One populated ``TencentItem`` per matching ``<tr>``.

            Raises:
                IndexError: if a row lacks any of the cells read with ``[0]``.
            """
            rows = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
            for row in rows:
                item = TencentItem()
                item['positionname'] = row.xpath("./td[1]/a/text()").extract()[0]
                item['positionlink'] = row.xpath("./td[1]/a/@href").extract()[0]
                # Store a single string like every other field (the original
                # stored the whole result list); an empty cell yields "".
                item['positionType'] = row.xpath("./td[2]/text()").extract_first(default="")
                item['positionNum'] = row.xpath("./td[3]/text()").extract()[0]
                item['positionLocation'] = row.xpath("./td[4]/text()").extract()[0]
                item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]

                yield item
  • 相关阅读:
    dwSun带你选Python的编辑器/IDE
    ubuntu中文乱码解决
    解决matplotlib中文显示
    1506.01186-Cyclical Learning Rates for Training Neural Networks
    1503.02531-Distilling the Knowledge in a Neural Network.md
    1804.03235-Large scale distributed neural network training through online distillation.md
    mysql导入太慢解决方法
    已某个时间单位(日月周年)来分割时间段
    阿里云邮件推送
    阿里云短信推送服务
  • 原文地址:https://www.cnblogs.com/huwei934/p/7116877.html
Copyright © 2011-2022 走看看