源码如下:(collect-http.js)
// 文件名:collect-http.js
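/**
 * A simple Node.js data-collection (scraping) example using http/https + cheerio + iconv-lite.
 * Install dependencies: npm install cheerio iconv-lite bufferhelper
 * Note: gzip-compressed pages need extra handling. The `gzip: true` option is
 * specific to the `request` package; the built-in http/https modules do not
 * decompress responses for you.
 */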
const NmHttps = require('https')
const NmHttp = require('http')
const NmFs = require('fs');
const NmPath = require('path');
const NmCheerio = require('cheerio')
// Node 环境当中不支持 GBK 编码,所以需要引用 iconv-lite 模块来转码
const NmIconv = require('iconv-lite')
// 图片数据缓冲区
const NmBufferHelper = require('bufferhelper');
// Demo driver: scrape the sample page, then download the sample image once
// with each of the two image helpers (stream-based and buffer-based).
const url = 'https://www.163.com';
const imgurl = 'https://pic2020.lianzhixiu.com/2016/1123/19/2.jpg';

collectHtml(url);
collectImage(imgurl);
collectImage2(imgurl);
/**
 * Fetch an HTML page, download its lazy-loaded images and save the decoded
 * page to ./collect-http-163.html.
 *
 * The body is decoded with the charset declared in the Content-Type header
 * or, failing that, in a <meta> tag near the top of the document; GBK is the
 * fallback. Every <img> carrying a `data-original` attribute is resolved
 * against the page URL and handed to collectImage().
 *
 * Failures (bad URL, network error, non-2xx status, write error) are logged;
 * nothing is thrown to the caller.
 *
 * @param {string} url - Absolute http(s) URL of the page to fetch.
 * @returns {void}
 */
function collectHtml(url) {
  let pageUrl;
  try {
    pageUrl = new URL(url);
  } catch (error) {
    console.error('collectHtml: invalid url', url, error.message);
    return;
  }
  if (pageUrl.protocol !== 'https:' && pageUrl.protocol !== 'http:') {
    console.error('collectHtml: unsupported protocol', url);
    return;
  }
  // Pick the client that matches the scheme instead of assuming https.
  const client = pageUrl.protocol === 'https:' ? NmHttps : NmHttp;

  const request = client.get(pageUrl, (res) => {
    if (res.statusCode < 200 || res.statusCode >= 300) {
      console.error('collectHtml: unexpected status', res.statusCode, url);
      res.resume(); // drain the body so the socket is released
      return;
    }

    const chunks = [];
    res.on('data', (chunk) => {
      chunks.push(chunk);
    });
    res.on('error', (error) => {
      console.error('collectHtml: response failed', url, error);
    });
    res.on('end', () => {
      if (!res.complete) {
        console.error('collectHtml: response ended before it was complete', url);
        return;
      }
      const data = Buffer.concat(chunks);

      // Charset lookup order: HTTP header, then <meta> tag, then GBK.
      // latin1 maps bytes 1:1, so the ASCII markup can be sniffed safely
      // before the real encoding is known.
      const headerMatch = /charset\s*=\s*"?([\w-]+)/i.exec(res.headers['content-type'] || '');
      const metaMatch = /<meta[^>]*charset\s*=\s*["']?([\w-]+)/i.exec(data.toString('latin1', 0, 2048));
      const charset = [headerMatch, metaMatch]
        .map((match) => match && match[1])
        .find((name) => name && NmIconv.encodingExists(name)) || 'GBK';

      const html = NmIconv.decode(data, charset);
      const $ = NmCheerio.load(html);
      $('img').each((index, item) => {
        const source = (item.attribs['data-original'] || '').trim();
        if (!source) {
          return;
        }
        let imageUrl;
        try {
          // Handles protocol-relative ("//host/a.jpg") and relative paths.
          imageUrl = new URL(source, pageUrl);
        } catch (error) {
          console.error('collectHtml: skipping invalid image url', source);
          return;
        }
        if (imageUrl.protocol !== 'https:' && imageUrl.protocol !== 'http:') {
          return; // e.g. data: URIs cannot be fetched over http(s)
        }
        console.log('===', imageUrl.href);
        collectImage(imageUrl.href);
      });

      NmFs.writeFile('./collect-http-163.html', html, (error) => {
        if (error) {
          console.error('collectHtml: write failed', error);
          return;
        }
        console.log('write success');
      });
    });
  });

  // Without this listener a DNS/connection failure crashes the process.
  request.on('error', (error) => {
    console.error('collectHtml: request failed', url, error);
  });
}
/**
 * Download an image and stream it into ./img/, named after the last segment
 * of the URL path (query string and fragment excluded).
 *
 * The target file is only created once a 2xx response has arrived, so failed
 * requests do not leave empty or bogus files behind. The ./img directory is
 * created on demand. Failures are logged; nothing is thrown to the caller.
 *
 * @param {string} url - Absolute http(s) URL of the image; falsy values are ignored.
 * @returns {void}
 */
function collectImage(url) {
  if (!url) {
    return;
  }
  let imageUrl;
  try {
    imageUrl = new URL(url);
  } catch (error) {
    console.error('collectImage: invalid url', url, error.message);
    return;
  }
  if (imageUrl.protocol !== 'https:' && imageUrl.protocol !== 'http:') {
    console.error('collectImage: unsupported protocol', url);
    return;
  }
  const client = imageUrl.protocol === 'https:' ? NmHttps : NmHttp;

  // A URL with an empty path has no file name; fall back to the host name.
  const fileName = NmPath.posix.basename(imageUrl.pathname) || imageUrl.hostname;
  const saveDir = './img';
  const savePath = saveDir + '/' + fileName;

  const request = client.get(imageUrl, (res) => {
    if (res.statusCode < 200 || res.statusCode >= 300) {
      console.error('collectImage: unexpected status', res.statusCode, url);
      res.resume(); // drain the body so the socket is released
      return;
    }
    try {
      NmFs.mkdirSync(saveDir, { recursive: true });
    } catch (error) {
      console.error('collectImage: cannot create directory', saveDir, error);
      res.resume();
      return;
    }

    const stream = NmFs.createWriteStream(savePath);
    stream.on('error', (error) => {
      console.error('collectImage: write failed', savePath, error);
      res.destroy(); // stop downloading data that can no longer be stored
    });
    res.on('error', (error) => {
      console.error('collectImage: response failed', url, error);
      stream.destroy();
    });
    res.pipe(stream);
  });

  // Without this listener a DNS/connection failure crashes the process.
  request.on('error', (error) => {
    console.error('collectImage: request failed', url, error);
  });
}
/**
 * Download an image by buffering the whole response with the `bufferhelper`
 * module, then write it to ./img/ in one call. The file is named after the
 * last segment of the URL path (query string and fragment excluded).
 *
 * Only complete 2xx responses are written, and the ./img directory is
 * created on demand. Failures are logged; nothing is thrown to the caller.
 *
 * @param {string} url - Absolute http(s) URL of the image; falsy values are ignored.
 * @returns {void}
 */
function collectImage2(url) {
  if (!url) {
    return;
  }
  let imageUrl;
  try {
    imageUrl = new URL(url);
  } catch (error) {
    console.error('collectImage2: invalid url', url, error.message);
    return;
  }
  if (imageUrl.protocol !== 'https:' && imageUrl.protocol !== 'http:') {
    console.error('collectImage2: unsupported protocol', url);
    return;
  }
  // One code path for both schemes; only the client module differs.
  const client = imageUrl.protocol === 'https:' ? NmHttps : NmHttp;

  // A URL with an empty path has no file name; fall back to the host name.
  const fileName = NmPath.posix.basename(imageUrl.pathname) || imageUrl.hostname;
  const saveDir = './img';
  const savePath = saveDir + '/' + fileName;

  const request = client.get(imageUrl, (res) => {
    if (res.statusCode < 200 || res.statusCode >= 300) {
      console.error('collectImage2: unexpected status', res.statusCode, url);
      res.resume(); // drain the body so the socket is released
      return;
    }

    const buffer = new NmBufferHelper();
    res.on('data', (chunk) => {
      buffer.concat(chunk);
    });
    res.on('error', (error) => {
      console.error('collectImage2: response failed', url, error);
    });
    res.on('end', () => {
      if (!res.complete) {
        // The connection dropped mid-transfer; do not save a truncated image.
        console.error('collectImage2: response ended before it was complete', url);
        return;
      }
      NmFs.mkdir(saveDir, { recursive: true }, (mkdirError) => {
        if (mkdirError) {
          console.error('collectImage2: cannot create directory', saveDir, mkdirError);
          return;
        }
        NmFs.writeFile(savePath, buffer.toBuffer(), (writeError) => {
          if (writeError) {
            console.error('collectImage2: write failed', savePath, writeError);
          }
        });
      });
    });
  });

  // Without this listener a DNS/connection failure crashes the process.
  request.on('error', (error) => {
    console.error('collectImage2: request failed', url, error);
  });
}
运行:
node collect-http.js