主要采用 Ruby 的 Parallel 库提供的多线程方式：
# Downloads the skin images of every hero listed in the remote hero-list JSON,
# storing them in one directory per hero, using a pool of threads from the
# `parallel` gem.
require 'unirest'
require 'open-uri'
require 'openssl'
require 'parallel'
require 'json'

url = 'http://pvp.qq.com/web201605/js/herolist.json'
response = Unirest.get(url)

# Work on the raw response text: `response.body` may already be a parsed
# object when the payload is valid JSON, in which case `force_encoding` would
# not be available on it.
# NOTE(review): relies on Unirest's HttpResponse#raw_body - confirm against
# the installed unirest version.
body = response.raw_body.to_s.dup.force_encoding('utf-8')

# Extract the outermost JSON array, ignoring anything before or after it.
# The brackets must be escaped: an unescaped `[.*]` is a character class that
# matches a single '.' or '*'. `/m` lets `.` span line breaks.
json_text = body[/\[.*\]/m]
raise "no JSON array found in response from #{url}" unless json_text

hero_list = JSON.parse(json_text)

# Reduce every hero to a hash holding only ename, cname and skin_name.
wanted_keys = %w[ename cname skin_name]
hero_list.each do |hero|
  hero.select! { |key, _| wanted_keys.include?(key) }
end

# Downloads one image and stores it as "<path>/<img_name>.jpg".
#
# @param url [String] address of the image to fetch
# @param path [String] existing directory the image is written into
# @param img_name [String] file name without the ".jpg" extension
# @return [Integer] number of bytes written
def download_hero_img(url, path, img_name)
  target = "#{path}/#{img_name}.jpg"
  # URI.open is required on Ruby 3.0+, where Kernel#open no longer accepts URLs.
  # SECURITY(review): certificate verification is disabled here, exactly as in
  # the original script; drop the ssl_verify_mode option if the host's
  # certificate validates correctly.
  img_file = URI.open(url, ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, &:read)
  puts "download #{target}"
  File.binwrite(target, img_file)
end

# Job source for Parallel: hands out heroes one at a time until the list is
# exhausted, then signals the end with Parallel::Stop.
next_hero = -> { hero_list.pop || Parallel::Stop }

# Crawl with 10 threads.
Parallel.map(next_hero, in_threads: 10) do |hero|
  path = "./#{hero['cname']}"
  Dir.mkdir(path) unless Dir.exist?(path)

  # Alternative image base URL left by the original author (unused):
  # "http://game.gtimg.cn/images/yxzj/img201606/heroimg/#{hero['ename']}/#{hero['ename']}"

  # skin_name is a '|'-separated list; an entry without it yields no skins
  # instead of raising NoMethodError and aborting the whole run.
  skin_list = hero['skin_name'].to_s.split('|')

  # Remote skin images are numbered from 1, in the order of the skin names.
  skin_list.each.with_index(1) do |skin, number|
    skin_url = "https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/#{hero['ename']}/#{hero['ename']}-bigskin-#{number}.jpg"
    download_hero_img(skin_url, path, skin)
  end
end
线程参考资料: