初学 Node.js 爬虫,记录一下:
/**
 * Small crawler script.
 *
 * For every relative path in `hrefArr` it downloads the page from BASE_URL,
 * decodes the raw bytes as GB2312, reads the text of every element matching
 * `[height='22']`, and keeps the part after the first ':' of each text.
 * The collected records are written as a JSON array to `hrefInView.json`
 * once every request has finished (successfully or not).
 */
const request = require('request');
const iconv = require('iconv-lite'); // character-set transcoding
const cheerio = require('cheerio');
const fs = require("fs");

// Site root that every entry of hrefArr is appended to.
const BASE_URL = "http://www.yinghexinxi.cn/";

// Upper bound for a single request, so a stalled connection cannot keep the
// script from ever reaching the "all requests finished" state.
const REQUEST_TIMEOUT_MS = 10000;

// Relative paths of the detail pages to crawl.
const hrefArr = [
  'categories/2017627968.html',
  'categories/2017627967.html',
  'categories/2017627966.html',
  'categories/2017627965.html',
  'categories/2017627964.html',
  'categories/2017627963.html',
  'categories/2017627962.html',
  'categories/2017627961.html',
  'categories/2017627960.html',
  'categories/2017627977.html',
  'categories/2017627976.html',
  'categories/2017627975.html',
  'categories/2017627974.html',
  'categories/2017627973.html',
  'categories/2017627972.html',
  'categories/2017627971.html',
  'categories/2017627970.html',
  'categories/2017627969.html',
  'categories/2017627978.html',
  'categories/2017627983.html',
  'categories/2017627986.html',
  'categories/2017627985.html',
  'categories/2017627984.html',
  'categories/2017627982.html',
  'categories/2017627981.html',
  'categories/2017627980.html',
  'categories/2017627979.html',
  'categories/2017627994.html',
  'categories/2017627993.html',
  'categories/2017627992.html',
  'categories/2017627991.html',
  'categories/2017627990.html',
  'categories/2017627989.html',
  'categories/2017627988.html',
  'categories/2017627987.html',
  'categories/2017627950.html',
  'categories/20176281003.html',
  'categories/20176281002.html',
  'categories/20176281001.html',
  'categories/20176281000.html',
  'categories/2017628999.html',
  'categories/2017628998.html',
  'categories/2017628997.html',
  'categories/2017628996.html',
  'categories/2017628995.html',
  'categories/20176281012.html',
  'categories/20176281011.html',
  'categories/20176281010.html',
  'categories/20176281009.html',
  'categories/20176281008.html',
  'categories/20176281007.html',
  'categories/20176281006.html',
  'categories/20176281005.html',
  'categories/20176281004.html',
  'categories/20176281022.html',
  'categories/20176281021.html',
  'categories/20176281020.html',
  'categories/20176281019.html',
  'categories/20176281018.html',
  'categories/20176281017.html',
  'categories/20176281016.html',
  'categories/20176281015.html',
  'categories/20176281014.html',
  'categories/20176281031.html',
  'categories/20176281030.html',
  'categories/20176281029.html',
  'categories/20176281028.html',
  'categories/20176281027.html',
  'categories/20176281026.html',
  'categories/20176281025.html',
  'categories/20176281024.html',
  'categories/20176281023.html',
  'categories/2017627959.html',
  'categories/2017627958.html',
  'categories/2017627957.html',
  'categories/2017627956.html',
  'categories/2017627955.html',
  'categories/2017627954.html',
  'categories/2017627952.html',
  'categories/2017627951.html',
  'categories/2017627949.html',
];

// One slot per entry of hrefArr (same index). Slots of pages that failed to
// download or parse stay empty and are dropped before the file is written,
// so the output order follows hrefArr instead of network timing.
const viewInfo = [];

/**
 * Returns everything after the first ':' of `text`.
 *
 * Slicing at the first colon (rather than taking element 1 of a split) keeps
 * values that contain further colons intact.
 *
 * @param {string} text - Trimmed element text; presumably "label:value".
 * @returns {string|undefined} The value part, or undefined when `text`
 *   contains no ':'.
 */
function valueAfterColon(text) {
  const colonAt = text.indexOf(":");
  return colonAt === -1 ? undefined : text.slice(colonAt + 1);
}

/**
 * Builds one record from a downloaded page.
 *
 * @param {string} href - Absolute URL of the page; its digits form the id.
 * @param {Buffer} body - Raw response bytes (requested with `encoding: null`).
 * @returns {object} Record whose fields are taken, in document order, from the
 *   elements matching `[height='22']`; missing elements yield undefined.
 */
function parseView(href, body) {
  const html = iconv.decode(body, 'gb2312');
  const $ = cheerio.load(html);
  const data = [];
  $("[height='22']").each((index, element) => {
    data.push(valueAfterColon($(element).text().trim()));
  });
  return {
    id: href.replace(/[^0-9]/g, ""),
    name: data[0],
    tel: data[1],
    qq: data[2],
    date: data[3],
    area: data[4],
    email: data[5],
    location: data[6],
    ip: data[7],
    phone: data[8],
  };
}

/**
 * Serialises the collected records and writes them to `hrefInView.json`.
 */
function writeResults() {
  // filter() skips the empty slots left by failed pages.
  const data = JSON.stringify(viewInfo.filter(Boolean));
  console.log("准备写入文件");
  fs.writeFile('hrefInView.json', data, (err) => {
    if (err) {
      console.error(err);
      return;
    }
    console.log("数据写入成功!");
  });
}

// Number of requests whose callback has not run yet. The file is written when
// this reaches zero, instead of after a fixed delay that may fire too early.
let pending = hrefArr.length;

/**
 * Marks one request as finished and writes the file after the last one.
 */
function onPageDone() {
  pending -= 1;
  if (pending === 0) {
    writeResults();
  }
}

if (hrefArr.length === 0) {
  // Nothing to fetch: still produce the (empty) output file.
  writeResults();
}

hrefArr.forEach((path, index) => {
  // Declared per iteration so each callback sees its own URL.
  const href = BASE_URL + path;
  const options = { url: href, encoding: null, timeout: REQUEST_TIMEOUT_MS };

  request.get(options, (err, response, body) => {
    if (err || !body) {
      console.error(`Failed to fetch ${href}:`, err || "empty response body");
    } else {
      try {
        viewInfo[index] = parseView(href, body);
      } catch (parseErr) {
        // One malformed page must not prevent the other records from being saved.
        console.error(`Failed to parse ${href}:`, parseErr);
      }
    }
    onPageDone();
  });
});