zoukankan      html  css  js  c++  java
  • nodejs爬虫简单实现

    const fs = require('fs');
    const URL = require('url')
    const gbk = require('gbk')
    const { JSDOM } = require('jsdom')
    
    /**
     * Minimal image scraper: fetches a listing page, finds the images inside
     * `.Left_list_cont2`, and stores each one under `img/`.
     *
     * `Getdata.a` counts every request attempt made (including retries) and is
     * printed whenever an attempt fails.
     */
    class Getdata {
        /**
         * Chooses the transport module for a URL and extracts the request target.
         *
         * @param {string} url - Absolute URL to request.
         * @returns {{http: object, hostname: ?string, port: ?string, path: ?string}}
         *   The `http` or `https` module plus the pieces needed for `request()`.
         */
        static http(url) {
            const parsed = URL.parse(url);
            // Anything that is not plain http goes over https, as before.
            const transport = parsed.protocol === 'http:' ? require('http') : require('https');
            return {
                http: transport,
                hostname: parsed.hostname,
                // Keep an explicit port (e.g. :8080); null falls back to the default.
                port: parsed.port,
                path: parsed.path
            };
        }

        /**
         * Performs one GET attempt and retries on failure until `retries` runs out.
         *
         * @param {string} url - Absolute URL to download.
         * @param {number} retries - How many further attempts are allowed after this one.
         * @param {function(Buffer): void} onBody - Called once with the full body
         *   of the first 200 response.
         */
        static download(url, retries, onBody) {
            Getdata.a += 1;
            const app = Getdata.http(url);
            // A single attempt may report both a bad response and a socket error;
            // this flag makes sure it is only handled once.
            let settled = false;

            const fail = (...details) => {
                if (settled) {
                    return;
                }
                settled = true;
                console.log(Getdata.a);
                console.log(...details);
                if (retries > 0) {
                    Getdata.download(url, retries - 1, onBody);
                } else {
                    console.error(`giving up on ${url}`);
                }
            };

            const req = app.http.request({
                hostname: app.hostname,
                port: app.port,
                path: app.path
            }, (res) => {
                if (res.statusCode !== 200) {
                    // Discard the unwanted body so the socket is released.
                    res.resume();
                    fail(res.statusCode, res.headers);
                    return;
                }
                const chunks = [];
                res.on('data', (chunk) => {
                    chunks.push(chunk);
                }).on('end', () => {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    onBody(Buffer.concat(chunks));
                });
            });
            // Without a listener a DNS/socket failure would crash the process.
            req.on('error', (err) => fail(err.message));
            req.end();
        }

        /**
         * Downloads a listing page and saves every image found in it.
         *
         * @param {string} url - Address of the listing page.
         * @param {string} Filename - Prefix for the saved files; the image index is appended.
         * @param {number} [retries=3] - Extra attempts allowed when the page request fails.
         */
        get(url, Filename, retries = 3) {
            Getdata.download(url, retries, (body) => {
                // The raw bytes are passed on so jsdom can detect the page encoding itself.
                const dom = new JSDOM(body);
                const images = dom.window.document.querySelectorAll('.Left_list_cont2 img');

                for (let i = 0; i < images.length; i++) {
                    const src = images[i].getAttribute('data-original');
                    if (!src) {
                        // No lazy-load source on this <img>; nothing to download.
                        continue;
                    }
                    // Resolve relative and protocol-relative sources against the page URL.
                    this.set(URL.resolve(url, src), `${Filename}${i}`, 'jpg');
                }
            });
        }

        /**
         * Downloads one resource and writes it to `img/<Filename>.<kz>`.
         *
         * @param {string} url - Address of the resource.
         * @param {string} Filename - File name without extension.
         * @param {string} [kz='html'] - File extension.
         * @param {number} [retries=3] - Extra attempts allowed when the request fails.
         */
        set(url, Filename, kz = 'html', retries = 3) {
            Getdata.download(url, retries, (body) => {
                const target = `img/${Filename}.${kz}`;
                fs.writeFile(target, body, (err) => {
                    if (err) {
                        console.error(`failed to write ${target}: ${err.message}`);
                        return;
                    }
                    console.log('成功了');
                });
            });
        }
    }
    Getdata.a = 0;
  • 相关阅读:
    如何检索某个字段在sqlserver中的哪个些存储过程中?很简单的SQL语句。
    如何去掉HTML代码来获取纯文本?
    ajax实现跨域请求
    ajax实现跨域提交
    lab3
    Lab2
    hw2
    Homework1
    Lab1
    CS61B_学习计划和进程
  • 原文地址:https://www.cnblogs.com/kjtt/p/10861601.html
Copyright © 2011-2022 走看看