zoukankan      html  css  js  c++  java
  • nodejs爬虫

    初学nodejs爬虫,记录一下:

    var request = require('request');
    var iconv = require('iconv-lite'); //转码
    var cheerio = require('cheerio');
    var fs = require("fs");
    
    // Relative paths of the category pages to scrape. Each entry is appended
    // to the site base URL ("http://www.yinghexinxi.cn/") when it is requested.
    var hrefArr = [ 'categories/2017627968.html',
    'categories/2017627967.html',
    'categories/2017627966.html',
    'categories/2017627965.html',
    'categories/2017627964.html',
    'categories/2017627963.html',
    'categories/2017627962.html',
    'categories/2017627961.html',
    'categories/2017627960.html',
    'categories/2017627977.html',
    'categories/2017627976.html',
    'categories/2017627975.html',
    'categories/2017627974.html',
    'categories/2017627973.html',
    'categories/2017627972.html',
    'categories/2017627971.html',
    'categories/2017627970.html',
    'categories/2017627969.html',
    'categories/2017627978.html',
    'categories/2017627983.html',
    'categories/2017627986.html',
    'categories/2017627985.html',
    'categories/2017627984.html',
    'categories/2017627982.html',
    'categories/2017627981.html',
    'categories/2017627980.html',
    'categories/2017627979.html',
    'categories/2017627994.html',
    'categories/2017627993.html',
    'categories/2017627992.html',
    'categories/2017627991.html',
    'categories/2017627990.html',
    'categories/2017627989.html',
    'categories/2017627988.html',
    'categories/2017627987.html',
    'categories/2017627950.html',
    'categories/20176281003.html',
    'categories/20176281002.html',
    'categories/20176281001.html',
    'categories/20176281000.html',
    'categories/2017628999.html',
    'categories/2017628998.html',
    'categories/2017628997.html',
    'categories/2017628996.html',
    'categories/2017628995.html',
    'categories/20176281012.html',
    'categories/20176281011.html',
    'categories/20176281010.html',
    'categories/20176281009.html',
    'categories/20176281008.html',
    'categories/20176281007.html',
    'categories/20176281006.html',
    'categories/20176281005.html',
    'categories/20176281004.html',
    'categories/20176281022.html',
    'categories/20176281021.html',
    'categories/20176281020.html',
    'categories/20176281019.html',
    'categories/20176281018.html',
    'categories/20176281017.html',
    'categories/20176281016.html',
    'categories/20176281015.html',
    'categories/20176281014.html',
    'categories/20176281031.html',
    'categories/20176281030.html',
    'categories/20176281029.html',
    'categories/20176281028.html',
    'categories/20176281027.html',
    'categories/20176281026.html',
    'categories/20176281025.html',
    'categories/20176281024.html',
    'categories/20176281023.html',
    'categories/2017627959.html',
    'categories/2017627958.html',
    'categories/2017627957.html',
    'categories/2017627956.html',
    'categories/2017627955.html',
    'categories/2017627954.html',
    'categories/2017627952.html',
    'categories/2017627951.html',
    'categories/2017627949.html' ]
    
    // Records scraped so far, one per page that was fetched successfully.
    // They are pushed in the order responses arrive, not in hrefArr order.
    var viewInfo = [];
    // Number of requests that have not called back yet; the output file is
    // written as soon as this reaches zero.
    var pendingCount = hrefArr.length;

    /**
     * Serialize everything collected in viewInfo and write it to
     * hrefInView.json in the current working directory.
     */
    function writeViewInfo() {
        var data = JSON.stringify(viewInfo);
        console.log("准备写入文件");
        fs.writeFile('hrefInView.json', data, function(err) {
            if (err) {
                return console.error(err);
            }
            console.log("数据写入成功!");
        });
    }

    /**
     * Record that one request has finished (successfully or not) and trigger
     * the file write once every request has reported back.
     */
    function finishOne() {
        pendingCount -= 1;
        if (pendingCount === 0) {
            writeViewInfo();
        }
    }

    /**
     * Build one record from the decoded HTML of a category page.
     *
     * Every element matching [height='22'] is read as "label:value" text and
     * the part between the first and second ":" is kept, in document order.
     * NOTE(review): a value that itself contains ":" is cut at that colon,
     * and text without ":" yields undefined — confirm against the page markup.
     *
     * @param {string} href - Absolute URL of the page; its digits become the id.
     * @param {string} html - Page markup, already decoded to a JS string.
     * @returns {Object} Record with id, name, tel, qq, date, area, email,
     *     location, ip and phone fields (undefined where the page had no value).
     */
    function parseViewInfo(href, html) {
        var $ = cheerio.load(html);
        var data = [];

        $("[height='22']").each(function(index, element) {
            var info = $(element).text().trim();
            data.push(info.split(":")[1]);
        });

        return {
            id: href.replace(/\D/g, ""),
            name: data[0],
            tel: data[1],
            qq: data[2],
            date: data[3],
            area: data[4],
            email: data[5],
            location: data[6],
            ip: data[7],
            phone: data[8]
        };
    }

    if (pendingCount === 0) {
        // Nothing to fetch: still produce the (empty) output file.
        writeViewInfo();
    }

    // forEach gives every request its own `href` binding, so the callback sees
    // the URL it was issued for rather than the last URL of the loop.
    hrefArr.forEach(function(path) {
        var href = "http://www.yinghexinxi.cn/" + path;
        // encoding:null makes `body` a raw Buffer so it can be decoded from
        // gb2312 below; the timeout keeps a hung request from blocking the write.
        request.get({url: href, encoding: null, timeout: 6000}, function(err, response, body) {
            if (err || !body) {
                // Skip this page but keep going; the remaining pages are still saved.
                console.error("Request failed: " + href, err);
            } else {
                viewInfo.push(parseViewInfo(href, iconv.decode(body, 'gb2312')));
            }
            finishOne();
        });
    });
  • 相关阅读:
    string数组批量转换成Int数组
    TCP/IP 、 HTTP 、HTTPS
    静态布局、自适应布局、流式布局、响应式布局、弹性布局等的概念和区别
    Vue源码学习02 初始化模块init.js
    IOS8白屏
    VUE 源码学习01 源码入口
    http状态码
    vue全家桶(Vue+Vue-router+Vuex+axios)(Vue+webpack项目实战系列之二)
    Vue实战Vue-cli项目构建(Vue+webpack系列之一)
    module.exports,exports,export和export default,import与require区别与联系【原创】
  • 原文地址:https://www.cnblogs.com/muou2125/p/9400958.html
Copyright © 2011-2022 走看看