zoukankan      html  css  js  c++  java
  • Node.Js 简单的数据采集示例,使用 http+cheerio+iconv-lite

    源码如下:(collect-http.js)

    // 文件名:collect-http.js
    /**
     * Node.Js 简单的数据采集示例,使用 http+cheerio+iconv-lite
     * 安装依赖:npm install cheerio iconv-lite bufferhelper
     * 注意事项:gzip页面处理 gzip:true
     */
    const NmHttps = require('https')
    const NmHttp = require('http')
    const NmFs = require('fs');
    const NmPath = require('path');
    const NmCheerio = require('cheerio')
    // Node 环境当中不支持 GBK 编码,所以需要引用 iconv-lite 模块来转码
    const NmIconv = require('iconv-lite')
    // 图片数据缓冲区
    const NmBufferHelper = require('bufferhelper');
    
    // Page to scrape and a sample image used by the two download demos.
    const url = 'https://www.163.com';
    const imgurl = 'https://pic2020.lianzhixiu.com/2016/1123/19/2.jpg';
    
    // Kick off the three demos. The functions are declared further down;
    // function declarations are hoisted, so calling them here is fine.
    collectHtml(url);
    collectImage(imgurl);
    collectImage2(imgurl);
    
    /**
     * Fetch an HTML page over HTTPS, download every image referenced by an
     * <img data-original="..."> attribute, and save the decoded markup to
     * ./collect-http-163.html.
     *
     * The body is decoded as GBK (hard-coded), so this assumes the target page
     * is GBK-encoded. Request, response and file-write failures are logged
     * instead of crashing the process.
     *
     * @param {string} url - Absolute https:// URL of the page to fetch.
     * @returns {void}
     */
    function collectHtml(url) {
        NmHttps.get(url, function (res) {
            const chunks = [];
            res.on('data', (chunk) => {
                chunks.push(chunk);
            });
            res.on('error', (error) => {
                console.log('===', error);
            });
            res.on('end', () => {
                // Decode the complete body in one go: decoding chunk by chunk
                // could split a multi-byte character across two chunks.
                const html = NmIconv.decode(Buffer.concat(chunks), 'GBK');
                const $ = NmCheerio.load(html);
                $('img').each(function (index, item) {
                    const src = item.attribs['data-original'];
                    console.log('===', src);
                    if (!src) {
                        return;
                    }
                    // Resolve relative and protocol-relative ("//host/x.jpg")
                    // references against the page URL; http.get() only accepts
                    // absolute URLs and throws synchronously on anything else.
                    let resolved;
                    try {
                        resolved = new URL(src, url);
                    } catch (error) {
                        console.log('===', error);
                        return;
                    }
                    // Skip data:, javascript: and other non-downloadable schemes.
                    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
                        return;
                    }
                    collectImage(resolved.href);
                });
                NmFs.writeFile('./collect-http-163.html', html, (error) => {
                    if (error) {
                        console.log('write failed', error);
                        return;
                    }
                    console.log('write success');
                });
            });
        }).on('error', (error) => {
            // DNS, connection and TLS failures are emitted on the request object.
            console.log('===', error);
        });
    }
    
    /**
     * Download one image and stream it to ./img/<file name>.
     *
     * The file name is the last path segment of the URL with any query string
     * removed. The ./img/ directory is created on demand. Malformed URLs,
     * network errors, non-2xx responses and file-system errors are logged
     * rather than thrown or left as unhandled 'error' events.
     *
     * @param {string} url - Absolute http:// or https:// image URL; falsy
     *     values are ignored.
     * @returns {void}
     */
    function collectImage(url) {
        if (!url) {
            return;
        }
        // Drop the query string so it does not end up in the file name.
        const posQuery = url.indexOf('?');
        const pathInfo = NmPath.parse(posQuery > -1 ? url.substring(0, posQuery) : url);
        const saveDir = './img/';
        const savePath = saveDir + pathInfo.base;
        // Same request logic for both schemes; only the client module differs.
        const client = url.startsWith('https') ? NmHttps : NmHttp;
        try {
            client.get(url, function (res) {
                if (res.statusCode < 200 || res.statusCode >= 300) {
                    // Do not save an error page or redirect body as an image.
                    console.log('===', url, 'HTTP ' + res.statusCode);
                    res.resume(); // drain the body so the socket is released
                    return;
                }
                try {
                    NmFs.mkdirSync(saveDir, { recursive: true });
                } catch (error) {
                    if (error.code !== 'EEXIST') {
                        console.log('===', error);
                        res.resume();
                        return;
                    }
                }
                // Open the file only once there is a body to write, so failed
                // requests do not leave empty files behind.
                const stream = NmFs.createWriteStream(savePath);
                stream.on('error', function (error) {
                    console.log('===', error);
                    res.destroy();
                });
                res.on('error', function (error) {
                    console.log('===', error);
                    stream.destroy();
                });
                res.pipe(stream);
            }).on('error', function (error) {
                // DNS, connection and TLS failures are emitted on the request.
                console.log('===', error);
            });
        } catch (error) {
            // get() throws synchronously for malformed or unsupported URLs.
            console.log('===', error);
        }
    }
    
    /**
     * Download one image by buffering the whole response in memory with the
     * bufferhelper module, then write it to ./img/<file name> in one call.
     *
     * The file name is the last path segment of the URL with any query string
     * removed. The ./img/ directory is created on demand. Malformed URLs,
     * network errors, non-2xx responses and file-system errors are logged
     * rather than thrown or left as unhandled 'error' events.
     *
     * @param {string} url - Absolute http:// or https:// image URL; falsy
     *     values are ignored.
     * @returns {void}
     */
    function collectImage2(url) {
        if (!url) {
            return;
        }
        // Drop the query string so it does not end up in the file name.
        const posQuery = url.indexOf('?');
        const pathInfo = NmPath.parse(posQuery > -1 ? url.substring(0, posQuery) : url);
        const saveDir = './img/';
        const savePath = saveDir + pathInfo.base;
        // Same request logic for both schemes; only the client module differs.
        const client = url.startsWith('https') ? NmHttps : NmHttp;
        try {
            client.get(url, function (res) {
                if (res.statusCode < 200 || res.statusCode >= 300) {
                    // Do not save an error page or redirect body as an image.
                    console.log('===', url, 'HTTP ' + res.statusCode);
                    res.resume(); // drain the body so the socket is released
                    return;
                }
                const buffer = new NmBufferHelper();
                res.on('data', function (chunk) {
                    buffer.concat(chunk);
                });
                res.on('end', function () {
                    NmFs.mkdir(saveDir, { recursive: true }, function (mkdirError) {
                        if (mkdirError && mkdirError.code !== 'EEXIST') {
                            console.log('===', mkdirError);
                            return;
                        }
                        NmFs.writeFile(savePath, buffer.toBuffer(), function (error) {
                            // Logged on success too (error is then null), as a
                            // completion marker.
                            console.log('==', error);
                        });
                    });
                });
                res.on('error', function (error) {
                    console.log('===', error);
                });
            }).on('error', function (error) {
                // DNS, connection and TLS failures are emitted on the request.
                console.log('===', error);
            });
        } catch (error) {
            // get() throws synchronously for malformed or unsupported URLs.
            console.log('===', error);
        }
    }
    

    运行:

    node collect-http.js
    
  • 相关阅读:
    Hadoop Combiner的三次测试...
    加了@Accessors(chain = true),copy实体类出现问题
    zookeeper启动:Could not find or load main class org.apache.zookeeper.server.quorum.
    CentOS7 更新yum源
    SpringBoot外部静态资源的访问
    从背包问题说起——初学者角度看背包问题
    C++ 常用STL数据类型总结归纳 简单易懂 入门 教程 array vector list deque map set stack
    1.4 HTML5新增的表单属性
    1.3 HTML5新增的input类型
    1.2 HTML5新增的多媒体标签
  • 原文地址:https://www.cnblogs.com/sochishun/p/14378181.html
Copyright © 2011-2022 走看看