zoukankan      html  css  js  c++  java
  • Node.js 简单的数据采集示例,使用 http+cheerio+iconv-lite

    源码如下:(collect-http.js)

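    // File: collect-http.js
    /**
     * A simple Node.js data-collection (scraping) example using http/https + cheerio + iconv-lite.
     * Install dependencies: npm install cheerio iconv-lite bufferhelper
     * Note on gzip pages: `gzip: true` is an option of the third-party `request` package;
     * Node's built-in http/https modules do not decompress responses, so a gzip-encoded
     * body has to be inflated with zlib before it is decoded.
     */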
    const NmHttps = require('https')
    const NmHttp = require('http')
    const NmFs = require('fs');
    const NmPath = require('path');
    const NmCheerio = require('cheerio')
    // Node 环境当中不支持 GBK 编码,所以需要引用 iconv-lite 模块来转码
    const NmIconv = require('iconv-lite')
    // 图片数据缓冲区
    const NmBufferHelper = require('bufferhelper');
    
    // Demo inputs: the page to scrape and one sample image.
    const url = 'https://www.163.com';
    const imgurl = 'https://pic2020.lianzhixiu.com/2016/1123/19/2.jpg';

    // Start the three demo tasks. Each one is asynchronous, so they run
    // concurrently; the sample image is fetched once per download helper.
    collectHtml(url);
    collectImage(imgurl);
    collectImage2(imgurl);
    
    /**
     * Fetches an HTML page over HTTPS, decodes it to text, saves it to
     * ./collect-http-163.html and hands every image referenced by an
     * <img data-original="..."> attribute to collectImage().
     *
     * @param {string} url - Absolute https:// URL of the page to fetch.
     * @param {string} [fallbackEncoding='GBK'] - Encoding used when the
     *     Content-Type header declares no charset that iconv-lite supports.
     * @returns {void}
     */
    function collectHtml(url, fallbackEncoding = 'GBK') {
        const request = NmHttps.get(url, (res) => {
            const chunks = [];
            res.on('data', (chunk) => {
                chunks.push(chunk);
            });
            res.on('error', (error) => {
                console.error(`collectHtml: response failed for ${url}`, error);
            });
            res.on('end', () => {
                // Prefer the charset the server declares; fall back to the
                // caller-supplied encoding when it is absent or unknown.
                const contentType = res.headers['content-type'] || '';
                const charsetMatch = /charset\s*=\s*["']?([^\s;"']+)/i.exec(contentType);
                const declared = charsetMatch ? charsetMatch[1] : '';
                const encoding = declared && NmIconv.encodingExists(declared)
                    ? declared
                    : fallbackEncoding;
                // iconv-lite's decode() already returns a string.
                const html = NmIconv.decode(Buffer.concat(chunks), encoding);
                const $ = NmCheerio.load(html);
                $('img').each((index, item) => {
                    const source = item.attribs['data-original'];
                    console.log('===', source);
                    if (!source) {
                        return;
                    }
                    // Resolve relative and protocol-relative references
                    // against the page URL before downloading.
                    let resolved;
                    try {
                        resolved = new URL(source, url);
                    } catch (error) {
                        console.error(`collectHtml: skipping invalid image URL ${source}`);
                        return;
                    }
                    // Only http(s) resources can be fetched (skips data: URIs etc.).
                    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
                        return;
                    }
                    collectImage(resolved.href);
                });
                NmFs.writeFile('./collect-http-163.html', html, (error) => {
                    if (error) {
                        console.error('write failed', error);
                        return;
                    }
                    console.log('write success');
                });
            });
        });
        // Without this handler a DNS/TLS/connection failure is thrown as an
        // unhandled 'error' event and terminates the process.
        request.on('error', (error) => {
            console.error(`collectHtml: request failed for ${url}`, error);
        });
    }
    
    /**
     * Downloads an image and streams it to ./img/<file name taken from the URL>.
     *
     * The query string is ignored when deriving the file name. Falsy URLs and
     * URLs that are not http(s) are skipped; failures are logged rather than
     * thrown, so one bad image does not abort the whole run.
     *
     * @param {string} url - Absolute http:// or https:// URL of the image.
     * @returns {void}
     */
    function collectImage(url) {
        if (!url) {
            return;
        }
        // http.get()/https.get() throw synchronously on any other scheme.
        if (!/^https?:\/\//i.test(url)) {
            console.error(`collectImage: skipping unsupported URL ${url}`);
            return;
        }
        const posQuery = url.indexOf('?');
        const pathInfo = NmPath.parse(posQuery > -1 ? url.substring(0, posQuery) : url);
        const saveDir = './img/';
        const savePath = saveDir + pathInfo.base;
        try {
            // The target directory is not guaranteed to exist on a fresh checkout.
            NmFs.mkdirSync(saveDir, { recursive: true });
        } catch (error) {
            console.error(`collectImage: cannot create ${saveDir}`, error);
            return;
        }
        const client = /^https:/i.test(url) ? NmHttps : NmHttp;
        const request = client.get(url, (res) => {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                console.error(`collectImage: unexpected status ${res.statusCode} for ${url}`);
                // Drain the body so the socket is released.
                res.resume();
                return;
            }
            // Open the file only once there is a body to write, so failed
            // requests do not leave empty files behind.
            const stream = NmFs.createWriteStream(savePath);
            stream.on('error', (error) => {
                console.error(`collectImage: cannot write ${savePath}`, error);
                res.destroy();
            });
            res.on('error', (error) => {
                console.error(`collectImage: response failed for ${url}`, error);
                stream.destroy();
            });
            res.pipe(stream);
        });
        request.on('error', (error) => {
            console.error(`collectImage: request failed for ${url}`, error);
        });
    }
    
    /**
     * Downloads an image by buffering the whole response in memory (via the
     * bufferhelper module) and then writing it to
     * ./img/<file name taken from the URL> in one call.
     *
     * The query string is ignored when deriving the file name. Falsy URLs and
     * URLs that are not http(s) are skipped; failures are logged rather than
     * thrown.
     *
     * @param {string} url - Absolute http:// or https:// URL of the image.
     * @returns {void}
     */
    function collectImage2(url) {
        if (!url) {
            return;
        }
        // http.get()/https.get() throw synchronously on any other scheme.
        if (!/^https?:\/\//i.test(url)) {
            console.error(`collectImage2: skipping unsupported URL ${url}`);
            return;
        }
        const posQuery = url.indexOf('?');
        const pathInfo = NmPath.parse(posQuery > -1 ? url.substring(0, posQuery) : url);
        const saveDir = './img/';
        const savePath = saveDir + pathInfo.base;
        // One code path for both schemes; only the client module differs.
        const client = /^https:/i.test(url) ? NmHttps : NmHttp;
        const request = client.get(url, (res) => {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                console.error(`collectImage2: unexpected status ${res.statusCode} for ${url}`);
                // Drain the body so the socket is released.
                res.resume();
                return;
            }
            const buffer = new NmBufferHelper();
            res.on('data', (chunk) => {
                buffer.concat(chunk);
            });
            res.on('end', () => {
                // The target directory is not guaranteed to exist on a fresh checkout.
                NmFs.mkdir(saveDir, { recursive: true }, (mkdirError) => {
                    if (mkdirError) {
                        console.error('==', mkdirError);
                        return;
                    }
                    NmFs.writeFile(savePath, buffer.toBuffer(), (error) => {
                        if (error) {
                            console.error('==', error);
                        }
                    });
                });
            });
            res.on('error', (error) => {
                console.error('===', error);
            });
        });
        // Connection-level failures are emitted on the request, not the response.
        request.on('error', (error) => {
            console.error('===', error);
        });
    }
    

    运行:

    node collect-http.js
    
  • 相关阅读:
    JQuery.Ajax()的data参数类型
    通过拖动表格行进行行排序
    jquery animate()背景色渐变的处理
    JavaScript代码不执行
    Java性能调优笔记
    tika提取pdf信息异常
    Solr字段配置错误
    Oracle查询字符集
    zookeeper中Watcher和Notifications
    zookeeper适用场景:分布式锁实现
  • 原文地址:https://www.cnblogs.com/sochishun/p/14378181.html
Copyright © 2011-2022 走看看