zoukankan      html  css  js  c++  java
  • node爬虫进阶版

    手写了一个方便爬虫的小库:

    const url = require('url')
    const glib = require('zlib')
    
    // Default request headers, used for any header the caller does not supply.
    // Only gzip is advertised because gzip is the only content-encoding this
    // library decodes; advertising deflate/br would let the server answer with
    // a body the caller receives still compressed.
    const _default_headers = {
        'Accept-Encoding': 'gzip',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    }
    
    //options(url,method,header)--http头部信息 isDebug--是否开启调试状态
    module.exports = function(options, isDebug) {
        if(typeof options === "string") {
            options = {
                url: options,
                method: 'GET',
                headers: {}
            }
        } else {
            options = options || {}
            options.method = options.method || 'GET'
            options.headers = options.headers || {}
        }
        options.headers = Object.assign(_default_headers, options.headers)
        
        
        function debug(msg) {
            if(isDebug) {
                console.log(msg)
            }
        }
    
        return new Promise((resolve, reject) => {
            req(options)
    
            function req(options) {
                //判断是http还是https
                let urlObj = url.parse(options.url)
                let mod = null
                port = 0
    
                if(urlObj.protocol == 'https:') {
                    mod = require('https')
                    port = 443
                } else {
                    mod = require('http')
                    port = 80
                }
    
                let _req_options = {
                    hostname: urlObj.hostname,
                    port,
                    path: urlObj.path,
                    method: options.method,
                    headers: options.headers
                }
                //开始模拟,爬取信息
                let req_obj = mod.request(_req_options, (res) => {
                    if(res.statusCode!==200) {
                        //如果是重定向则重新在请求
                        if(res.statusCode == 301 || res.statusCode === 302) {
                            options.url = res.headers.location
                            debug('重定向: '+res.headers.location)
                            req(options)
                        } else {
                            reject(res.statusCode)
                        }
                    } else {
                        //statusCode是200时接受data buffer
                        let data = []
                        res.on('data', buffer => {
                            data.push(buffer)
                        })
                        res.on('end', () =>{
                            let buffer = Buffer.concat(data)
                            //判断是否传输有误
                            if (res.headers['content-length'] != buffer.length) {
                                debug('收到数据有误,正在重新获取')
                                req(options)
                            }
                            //判断是否有用gzip
                            else if (res.headers['content-encoding'] && res.headers['content-encoding'].includes('gzip')) {
                               buffer = glib.gunzip(buffer, (err,data) => {
                                   debug('gzip解压完成并成功返回')
                                   resolve(data)
                               })
                            } else {
                                debug('成功返回')
                                resolve(buffer)                     
                            }
                        })
                    }
                })
                req_obj.on('error', err => {
                    debug('爬虫失败')
                    reject(err)
                })
                req_obj.end()
            }
        })
    }

    将这个模块 require 进来,传入 url 字符串或 options 对象,就可以得到一个 Promise,它 resolve 的值就是爬取到的响应内容(Buffer)。

    举个例子:

    我要爬个bilibili的视频:

    const url = require('url')
    const fs = require('fs')
    
    /**
     * Download a resource over http/https and stream it into a file located
     * in this script's directory.
     *
     * @param {string|Object} options - Either a URL string, or an object with
     *   `url`, an optional `method` (defaults to 'GET'), optional `headers`
     *   and an optional `timeout` in milliseconds (defaults to 2000).
     * @param {Object} [headers] - Request headers; merged over `options.headers`.
     * @param {string} fileName - Name of the output file, created (or
     *   truncated) under `__dirname`.
     * @returns {Promise<void>} Resolves once the file has been fully written.
     *   Rejects with an Error on a non-2xx status (the code is exposed as
     *   `err.statusCode`), on socket timeout, or on network / file-system errors.
     */
    function getVideo(options, headers, fileName) {
        if(typeof options === "string") {
            options = { url: options }
        }
        // Copy so the caller's object is left untouched
        options = Object.assign({}, options)
        options.method = options.method || 'GET'
        options.timeout = options.timeout || 2000
        // Keep headers given via options; the explicit `headers` argument wins on conflict
        options.headers = Object.assign({}, options.headers, headers)

        return new Promise((resolve, reject) => {
            // Pick the transport from the URL scheme
            const urlObj = url.parse(options.url)
            const isHttps = urlObj.protocol === 'https:'
            const mod = isHttps ? require('https') : require('http')

            const _req_options = {
                hostname: urlObj.hostname,
                // Honour an explicit port in the URL, otherwise use the scheme default
                port: urlObj.port || (isHttps ? 443 : 80),
                path: urlObj.path,
                method: options.method,
                headers: options.headers,
                timeout: options.timeout
            }

            const req_obj = mod.request(_req_options, (res) => {
                // Do not save an error page as if it were the video
                if (res.statusCode < 200 || res.statusCode >= 300) {
                    res.resume()
                    const statusError = new Error('Unexpected HTTP status ' + res.statusCode)
                    statusError.statusCode = res.statusCode
                    reject(statusError)
                    return
                }

                // Output path of the video
                const filePath = `${__dirname}/${fileName}`;
                // NaN when the server sent no content-length
                const total = Number(res.headers['content-length'])
                let received = 0

                // The default 'w' flag truncates an existing file, so a stale
                // copy does not have to be deleted first.
                const file = fs.createWriteStream(filePath)

                res.on('data', chunk => {
                    // Count bytes in memory instead of stat-ing the file on every chunk
                    received += chunk.length
                    const mb = (received / 1024 / 1024).toFixed(2)
                    if (total > 0) {
                        console.log(`已下载${mb}MB,完成${(received / total * 100).toFixed(2)}%`)
                    } else {
                        console.log(`已下载${mb}MB`)
                    }
                })
                res.on('error', err => {
                    file.destroy()
                    reject(err)
                })
                res.on('aborted', () => {
                    file.destroy()
                    reject(new Error('Response aborted before completion'))
                })
                file.on('error', err => {
                    res.destroy()
                    reject(err)
                })
                // 'finish' fires after all data has been flushed to the file
                file.on('finish', () => {
                    resolve()
                })
                res.pipe(file)
            })
            // The `timeout` option only makes the request emit 'timeout';
            // the request has to be torn down explicitly.
            req_obj.on('timeout', () => {
                const timeoutError = new Error('Request timed out after ' + options.timeout + 'ms')
                reject(timeoutError)
                req_obj.destroy(timeoutError)
            })
            req_obj.on('error', err => {
                reject(err)
            })
            req_obj.end()
        })
    }
    
    // Name of the output file
    const fileName = '1.flv'
    // Video URL (signed link with an `expires` query parameter)
    const videoUrl = 'https://cn-sdyt-cu-v-05.acgvideo.com/upgcxcode/66/83/34548366/34548366-1-64.flv?expires=1545405600&platform=pc&ssig=ElhY4A2e-U4R2m8EI1eiGQ&oi=1928611810&nfa=uTIiNt+AQjcYULykM2EttA==&dynamic=1&hfa=2116953847&hfb=Yjk5ZmZjM2M1YzY4ZjAwYTMzMTIzYmIyNWY4ODJkNWI=&trid=45c5fdc464354b71bf599c224b7df8ea&nfb=maPYqpoel5MI3qOUX6YpRA==&nfc=1';
    // Request headers sent along with the download
    const header = {
        'Origin': 'https://www.bilibili.com',
        'Referer': 'https://www.bilibili.com/video/av21061574',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    
    getVideo(videoUrl, header, fileName).then(res => {
        console.log('写入成功');
    }).catch(err => {
        // Report the failure instead of leaving an unhandled rejection
        console.error('写入失败', err);
        process.exitCode = 1;
    })
  • 相关阅读:
    在线工具TOOL收藏
    HtmlDocument [代码碎片笔记]
    ChromiumWebBrowser [链接]
    PHP [开发汇总]
    Discuz[技术文献]
    [jvm] -- 监控和调优常用命令工具篇
    [jvm] -- 常用内存参数配置篇
    [日常摘要] -- 事务的隔离级别篇
    [日常摘要] -- ThreadLocal篇
    [日常摘要] -- zookeeper篇
  • 原文地址:https://www.cnblogs.com/amiezhang/p/8562095.html
Copyright © 2011-2022 走看看