zoukankan      html  css  js  c++  java
  • nodejs爬虫

    初学 Node.js 爬虫,记录一下:用 request 抓取页面,用 iconv-lite 把 GB2312 编码的内容转码,再用 cheerio 解析出字段,最后把结果写入 JSON 文件。

    var request = require('request');
    var iconv = require('iconv-lite'); //转码
    var cheerio = require('cheerio');
    var fs = require("fs");
    
    // Relative paths of the pages to crawl, kept in the original crawl order.
    // Each entry is appended to the site's base URL before being requested.
    var hrefArr = [
        "categories/2017627968.html",
        "categories/2017627967.html",
        "categories/2017627966.html",
        "categories/2017627965.html",
        "categories/2017627964.html",
        "categories/2017627963.html",
        "categories/2017627962.html",
        "categories/2017627961.html",
        "categories/2017627960.html",
        "categories/2017627977.html",
        "categories/2017627976.html",
        "categories/2017627975.html",
        "categories/2017627974.html",
        "categories/2017627973.html",
        "categories/2017627972.html",
        "categories/2017627971.html",
        "categories/2017627970.html",
        "categories/2017627969.html",
        "categories/2017627978.html",
        "categories/2017627983.html",
        "categories/2017627986.html",
        "categories/2017627985.html",
        "categories/2017627984.html",
        "categories/2017627982.html",
        "categories/2017627981.html",
        "categories/2017627980.html",
        "categories/2017627979.html",
        "categories/2017627994.html",
        "categories/2017627993.html",
        "categories/2017627992.html",
        "categories/2017627991.html",
        "categories/2017627990.html",
        "categories/2017627989.html",
        "categories/2017627988.html",
        "categories/2017627987.html",
        "categories/2017627950.html",
        "categories/20176281003.html",
        "categories/20176281002.html",
        "categories/20176281001.html",
        "categories/20176281000.html",
        "categories/2017628999.html",
        "categories/2017628998.html",
        "categories/2017628997.html",
        "categories/2017628996.html",
        "categories/2017628995.html",
        "categories/20176281012.html",
        "categories/20176281011.html",
        "categories/20176281010.html",
        "categories/20176281009.html",
        "categories/20176281008.html",
        "categories/20176281007.html",
        "categories/20176281006.html",
        "categories/20176281005.html",
        "categories/20176281004.html",
        "categories/20176281022.html",
        "categories/20176281021.html",
        "categories/20176281020.html",
        "categories/20176281019.html",
        "categories/20176281018.html",
        "categories/20176281017.html",
        "categories/20176281016.html",
        "categories/20176281015.html",
        "categories/20176281014.html",
        "categories/20176281031.html",
        "categories/20176281030.html",
        "categories/20176281029.html",
        "categories/20176281028.html",
        "categories/20176281027.html",
        "categories/20176281026.html",
        "categories/20176281025.html",
        "categories/20176281024.html",
        "categories/20176281023.html",
        "categories/2017627959.html",
        "categories/2017627958.html",
        "categories/2017627957.html",
        "categories/2017627956.html",
        "categories/2017627955.html",
        "categories/2017627954.html",
        "categories/2017627952.html",
        "categories/2017627951.html",
        "categories/2017627949.html"
    ];
    
    // Records scraped so far, one per page that was fetched and parsed.
    // Serialized to hrefInView.json once every request has called back.
    var viewInfo = [];

    // Base URL that each relative path in hrefArr is appended to.
    var BASE_URL = "http://www.yinghexinxi.cn/";

    // Per-request timeout in milliseconds. Without it a hung connection would
    // never call back and the output file would never be written.
    var REQUEST_TIMEOUT_MS = 6000;

    // Number of requests that have not called back yet.
    var pendingRequests = hrefArr.length;

    /**
     * Serialize everything collected in viewInfo and write it to hrefInView.json.
     */
    function writeResults() {
        var data = JSON.stringify(viewInfo);
        console.log("准备写入文件");
        fs.writeFile('hrefInView.json', data, function(err) {
            if (err) {
                return console.error(err);
            }
            console.log("数据写入成功!");
        });
    }

    /**
     * Mark one request as finished (successfully or not) and write the output
     * file when the last one completes. This replaces a fixed delay, which
     * dropped any response that arrived after the timer fired.
     */
    function requestDone() {
        pendingRequests--;
        if (pendingRequests === 0) {
            writeResults();
        }
    }

    /**
     * Build one record from a fetched page.
     *
     * @param {string} href - Absolute URL of the page; its digits become the record id.
     * @param {Buffer} body - Raw response bytes (requested with encoding: null).
     * @returns {Object} Record with id, name, tel, qq, date, area, email, location, ip, phone.
     */
    function parseView(href, body) {
        // The body is decoded as GB2312 before parsing.
        var $ = cheerio.load(iconv.decode(body, 'gb2312'));
        var data = [];

        // Each element with height="22" is split on ":" and the second piece
        // is kept; presumably the text is "label:value" — TODO confirm against
        // the live page markup.
        $("[height='22']").each(function(index, element){
            var info = $(element).text().trim();
            data.push(info.split(":")[1]);
        });

        // NOTE(review): the field mapping relies on the order in which the
        // matched elements appear in the page — confirm it is stable.
        return {
            id: href.replace(/[^0-9]/g,""),
            name: data[0],
            tel: data[1],
            qq: data[2],
            date: data[3],
            area: data[4],
            email: data[5],
            location: data[6],
            ip: data[7],
            phone: data[8],
        };
    }

    /**
     * Fetch one page and append its parsed record to viewInfo.
     *
     * Taking href as a parameter gives every callback its own copy. A `var`
     * declared in the loop body is shared by all callbacks, so every record
     * would end up with the id of the last URL.
     *
     * @param {string} href - Absolute URL of the page to fetch.
     */
    function fetchView(href) {
        request.get({url:href,encoding:null,timeout:REQUEST_TIMEOUT_MS},function(err,response,body){
            if (err || !body) {
                // A failed request has no body to decode; log it and move on
                // instead of crashing inside the callback.
                console.error("request failed: " + href, err);
                return requestDone();
            }
            try {
                viewInfo.push(parseView(href, body));
            } catch (parseErr) {
                console.error("parse failed: " + href, parseErr);
            }
            requestDone();
        });
    }

    if (hrefArr.length === 0) {
        // Nothing to fetch: still produce the (empty) output file.
        writeResults();
    }
    for (var i = 0; i < hrefArr.length; i++) {
        fetchView(BASE_URL + hrefArr[i]);
    }
  • 相关阅读:
    《SPFA算法的优化及应用》——姜碧野(学习笔记)
    hdu 4691 Front compression
    hdu 4690 EBCDIC
    UVA 11478 Halum(用bellman-ford解差分约束)
    UVA 11090 Going in Cycle!!(二分答案+判负环)
    UVA 10537 The Toll! Revisited uva1027 Toll(最短路+数学坑)
    hdu 4674 Trip Advisor(缩点+倍增lca)
    canny算子原理
    二值图像连通区域标记
    C++全局变量的声明和定义
  • 原文地址:https://www.cnblogs.com/muou2125/p/9400958.html
Copyright © 2011-2022 走看看