环境:python3
爬取网址:腾讯社招(http://hr.tencent.com/position.php?keywords=&tid=0&start=0#a)总共2202条数据
pipelines.py
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors


class MysqlTwistedPipeline(object):
    """Scrapy item pipeline that stores items in MySQL via a Twisted pool.

    Inserts are handed to a ``twisted.enterprise.adbapi.ConnectionPool`` so
    that the database write does not block the crawl.
    """

    def __init__(self, dbpool):
        """Keep the connection pool used for every insert.

        Args:
            dbpool: an ``adbapi.ConnectionPool`` (see ``from_settings``).
        """
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the project settings.

        Reads ``MYSQL_HOST``, ``MYSQL_DBNAME``, ``MYSQL_USER`` and
        ``MYSQL_PASSWORD`` from ``settings`` and opens a pymysql-backed
        connection pool with them.

        Returns:
            A ``MysqlTwistedPipeline`` bound to the new pool.
        """
        dbpool = adbapi.ConnectionPool(
            "pymysql",
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset="utf8",
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule an asynchronous insert of ``item`` and pass the item on.

        Args:
            item: an item exposing ``get_insert_sql()``.
            spider: the spider that produced the item; its logger is used to
                report failed inserts.

        Returns:
            The unchanged ``item``.
        """
        # Use Twisted to make the MySQL insert run asynchronously.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Without an errback a failed insert would disappear silently.
        query.addErrback(self._handle_error, item, spider)
        # Hand the item back so that any pipeline configured after this one
        # receives the item rather than None.
        return item

    def _handle_error(self, failure, item, spider):
        """Log a failed insert instead of letting the failure go unnoticed."""
        spider.logger.error(
            "MySQL insert failed for %r: %s", item, failure.getErrorMessage()
        )

    def do_insert(self, cursor, item):
        """Perform the actual insert.

        Each item type builds its own SQL statement and parameters through
        ``get_insert_sql()``; they are executed on the cursor supplied by
        ``runInteraction``.
        """
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
items.py
import scrapy


class TencentItem(scrapy.Item):
    """One job posting row, together with the SQL needed to store it."""

    positionname = scrapy.Field()
    positionlink = scrapy.Field()
    positionType = scrapy.Field()
    positionNum = scrapy.Field()
    positionLocation = scrapy.Field()
    publishTime = scrapy.Field()

    def get_insert_sql(self):
        """Return ``(sql, params)`` for inserting this item into ``tencent``.

        ``params`` holds the six field values in the same order as the
        columns named in the statement. A field that has not been set raises
        ``KeyError``.
        """
        statement = """
            insert into tencent(positionname,positionlink,positionType,positionNum,positionLocation,publishTime)
            VALUES (%s,%s,%s,%s,%s,%s)

        """
        # Column order must match the column list in the statement above.
        columns = (
            'positionname',
            'positionlink',
            'positionType',
            'positionNum',
            'positionLocation',
            'publishTime',
        )
        values = tuple(self[column] for column in columns)
        return statement, values
settings.py
# Scrapy settings for the "tencent" project.

BOT_NAME = 'tencent'

SPIDER_MODULES = ['tencent.spiders']
NEWSPIDER_MODULE = 'tencent.spiders'

ROBOTSTXT_OBEY = False

# The next three settings can be ignored when not running distributed.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True

DOWNLOAD_DELAY = 2

DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    # Can be ignored when not running distributed.
    'scrapy_redis.pipelines.RedisPipeline': 400,
    'tencent.pipelines.MysqlTwistedPipeline': 300,
}

# IP of the distributed master host; ignore when not running distributed.
REDIS_HOST = '172.21.118.56'
# Ignore when not running distributed.
REDIS_PORT = 6379

MYSQL_HOST = "127.0.0.1"
# Name of your own database.
MYSQL_DBNAME = "tencent"
# Placeholder: replace with your MySQL user name.
MYSQL_USER = "usrername"
# Placeholder: replace with your MySQL password.
MYSQL_PASSWORD = "userpassword"
spiders/Tencent.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
from tencent.items import TencentItem


class TencentSpider(RedisCrawlSpider):
    """Crawl the paginated job listing and yield one item per table row.

    Start URLs are read from the Redis key named by ``redis_key``.
    """

    name = "Tencent"
    allowed_domains = ["tencent.com"]
    redis_key = 'TencentSpider:start_urls'

    # Follow pagination links such as "...&start=10". The digit class needs
    # its backslash: "start=d+" would only match a literal letter "d".
    page_link = LinkExtractor(allow=r"start=\d+")
    rules = [
        Rule(page_link, callback="parseContent", follow=True),
    ]

    def parseContent(self, response):
        """Yield a ``TencentItem`` for every listing row on ``response``.

        Rows are the ``<tr>`` elements whose class is ``even`` or ``odd``.
        Each field is the first matching text node of its cell; a cell with
        no text yields an empty string instead of raising ``IndexError``.
        """
        rows = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for row in rows:
            item = TencentItem()
            item['positionname'] = row.xpath("./td[1]/a/text()").extract_first(default="")
            item['positionlink'] = row.xpath("./td[1]/a/@href").extract_first(default="")
            # Stored as a single string like the other fields (it used to be
            # the raw list returned by extract()).
            item['positionType'] = row.xpath("./td[2]/text()").extract_first(default="")
            item['positionNum'] = row.xpath("./td[3]/text()").extract_first(default="")
            item['positionLocation'] = row.xpath("./td[4]/text()").extract_first(default="")
            item['publishTime'] = row.xpath("./td[5]/text()").extract_first(default="")
            yield item