zoukankan      html  css  js  c++  java
  • 使用nodejs的puppeteer库爬取瓜子二手车网站

    const puppeteer = require('puppeteer');
    
    (async () => {
        const fs = require("fs");
        const rootUrl = 'https://www.guazi.com'
        const workPath = './contents';
    
        if (!fs.existsSync(workPath)) {
            fs.mkdirSync(workPath)
        }
        const browser = await (puppeteer.launch({ headless: false }));
        const page = await browser.newPage()
        await page.setViewport({  1128, height: 736 });
        await page.setRequestInterception(true); // 拦截器
        page.on('request', request => { //拦截图片
            if (request.resourceType() === 'image')
                request.abort();
            else
                request.continue();
        });
    
        await page.goto("https://www.guazi.com/fuzhou/buy")
    
        const m_cityList = await page.evaluate(() => { //           获取所有城市
            const elements = Array.from(document.querySelectorAll('.all-city dl'))
            return elements.map(s => {
                let dd = s.getElementsByTagName("dd").item(0)
                let ddList = []
                for (let i = 0; i < dd.getElementsByTagName("a").length; i++) {
                    ddList.push({
                        "cityName": dd.getElementsByTagName("a").item(i).innerHTML,
                        "url": dd.getElementsByTagName("a").item(i).getAttribute("href")
                    })
                }
                return ddList
            })
        })
        //数组扁平化
        const flattenNew = arr => arr.reduce((prev, next) => Object.prototype.toString.call(next) == '[object Array]' ? prev.concat(flattenNew(next)) : prev.concat(next), [])
        const cityList = flattenNew(m_cityList)
        console.log("城市列表爬取完毕")
        await page.waitFor(2000 + Math.round(Math.random() * 100))
        for (let i = 0; i < cityList.length; i++) {
            await page.waitFor(1000 + Math.round(Math.random() * 100))
            await page.goto(rootUrl + cityList[i].url)
            console.log("跳转到" + cityList[i].cityName)
            console.log("开始爬取" + cityList[i].cityName + "的所有二手车品牌")
            try {
                let brandList = await page.evaluate(() => {  //品牌
                    let Array = []
                    const dl = document.querySelectorAll('.screen').item(0).getElementsByTagName("dl")
                    const div = dl.item(0).getElementsByTagName("dd").item(0).getElementsByTagName("div").item(1)
                    const ul = div.getElementsByTagName("ul")
                    for (let i = 0; i < ul.length; i++) {
                        let li = ul.item(i).getElementsByTagName("li")
                        for (let j = 0; j < li.length; j++) {
                            let a = li.item(j).getElementsByTagName("p").item(0).getElementsByTagName("a")
                            for (let k = 0; k < a.length; k++) {
                                Array.push({
                                    "brand": a.item(k).innerHTML,
                                    "url": a.item(k).getAttribute("href")
                                })
                            }
                        }
                    }
                    return Array
                })
    
    
                console.log(cityList[i].cityName + "的所有二手车品牌爬取完毕")
    
                for (let j = 0; j < brandList.length; j++) {
    
                    console.log("开始爬取" + cityList[i].cityName + "-" + brandList[j].brand + "的所有车系")
                    await page.waitFor(1000 + Math.round(Math.random() * 100))
                    await page.goto(rootUrl + brandList[j].url)
    
    
                    try {
                        const carTypeList = await page.evaluate(() => {  //车型
                            let Array = []
                            const dl = document.querySelectorAll('.screen').item(0).getElementsByTagName("dl")
                            const div = dl.item(1).getElementsByTagName("dd").item(0).getElementsByTagName("div").item(1)
                            const li = div.getElementsByTagName("ul").item(0).getElementsByTagName("li")
    
                            for (let j = 0; j < li.length; j++) {
                                let a = li.item(j).getElementsByTagName("p").item(0).getElementsByTagName("a")
                                for (let k = 0; k < a.length; k++) {
                                    Array.push({
                                        "carType": a.item(k).innerHTML.replace(/s*/g, ""),
                                        "url": a.item(k).getAttribute("href")
                                    })
                                }
                            }
                            return Array
                        })
    
                        console.log(cityList[i].cityName + "-" + brandList[j].brand + "的所有车系爬取完毕")
                        for (let k = 0; k < carTypeList.length; k++) {
                            await page.waitFor(1000 + Math.round(Math.random() * 100))
                            console.log("开始爬取" + cityList[i].cityName + "-" + brandList[j].brand + "-" + carTypeList[k].carType + "的所有二手车")
                            let newUrl = rootUrl + carTypeList[k].url
    
                            pathArray = newUrl.split("/") //拿到第一页url,得到后面的页面的url
                            let urlArray = []
    
                            try {
                                await page.goto(newUrl)
                                const pageNum = await page.evaluate(() => {   //获取总页数
                                    let li = document.querySelectorAll("ul.pageLink").item(0).getElementsByTagName("li")
                                    let liNum = li.length
                                    return li.item(li.length - 2).getElementsByTagName("a").item(0).getElementsByTagName("span").item(0).innerHTML
                                })
    
                                for (let i = 1; i <= pageNum; i++) { //将所有的页面存入数组中
                                    urlArray.push(newUrl.replace(new RegExp("/" + pathArray[pathArray.length - 1], 'g'), "/o" + i + "/" + pathArray[pathArray.length - 1]))
                                }
                                
                            } catch (error) {
                                console.log(cityList[i].cityName + "-" + brandList[j].brand + "-" + carTypeList[k].carType + "的所有二手车列表爬取失败,该车型可能只有少量或者没有")
                            }
                            if (urlArray.length != 0) {
                                for (let i = 0; i < urlArray.length; i++) {
    
                                    await page.goto(urlArray[i]);
    
                                    const list = await page.evaluate(() => {
                                        let carArray = []
                                        let li = document.querySelectorAll("ul.carlist").item(0).getElementsByTagName("li")
                                        for (let i = 0; i < li.length; i++) {
                                            a = li.item(i).getElementsByTagName("a").item(0)
                                            carArray.push({
                                                "url": a.getAttribute("href"),
                                                "imgUrl": a.getElementsByTagName("img").item(0).getAttribute("src"),
                                                "carName": a.getElementsByTagName("h2").item(0).innerHTML,
                                                "carData": (a.getElementsByTagName("div").item(0).innerHTML).replace(new RegExp('<span class="icon-pad">', 'g'), "").replace(new RegExp('</span>', 'g'), ""),
                                                "price": a.getElementsByTagName("div").item(1).getElementsByTagName("p").item(0).innerHTML.replace(new RegExp('<span>', 'g'), "").replace(new RegExp('</span>', 'g'), "").replace(/s*/g, "")
                                            })
    
                                        }
                                        return carArray
                                    })
                                    await page.waitFor(500 + Math.round(Math.random() * 100))
                                    console.log(list)
                                }
                            }else{
                                try {
                                   
                                    const list = await page.evaluate(() => {
                                        let carArray = []
                                        let li = document.querySelectorAll("ul.carlist").item(0).getElementsByTagName("li")
                                        console.log("该车型少量")
                                        for (let i = 0; i < li.length; i++) {
                                            
                                            a = li.item(i).getElementsByTagName("a").item(0)
                                            carArray.push({
                                                "url": a.getAttribute("href"),
                                                "imgUrl": a.getElementsByTagName("img").item(0).getAttribute("src"),
                                                "carName": a.getElementsByTagName("h2").item(0).innerHTML,
                                                "carData": (a.getElementsByTagName("div").item(0).innerHTML).replace(new RegExp('<span class="icon-pad">', 'g'), "").replace(new RegExp('</span>', 'g'), ""),
                                                "price": a.getElementsByTagName("div").item(1).getElementsByTagName("p").item(0).innerHTML.replace(new RegExp('<span>', 'g'), "").replace(new RegExp('</span>', 'g'), "").replace(/s*/g, "")
                                            })
        
                                        }
                                        return carArray
                                    })
                                    await page.waitFor(500 + Math.round(Math.random() * 100))
                                    console.log(list)
                                } catch (error) {
                                    console.log("该车型没有")
                                }
    
                            }
    
                        }
    
                    } catch (error) {
                        console.log(cityList[i].cityName + "-" + brandList[i].brand + "的所有车系爬取失败")
                    }
    
                }
            } catch (error) {
                console.log(cityList[i].cityName + "二手车品牌爬取失败")
            }
            await page.waitFor(1000 + Math.round(Math.random() * 100))
        }
    })();

    时间比较赶,先附上代码和运行截图

    有兴趣的可以 看一下项目地址

    https://gitee.com/xu_hui_hong/nodejs_puppeteer_guazi2

  • 相关阅读:
    时间类型:datetime,timestamp,date,time,year
    字符串类型:char,varchar,text,enum,set
    RHEL7安装ZABBIX 3.2
    Go-06-数据类型、常量、运算符
    GO-05-数据类型
    GO-04-变量
    GO-03-基础
    GO-02-helloworld
    Ambassador-09-prefix正则表达式
    Ambassador-08-跨域
  • 原文地址:https://www.cnblogs.com/BlackFungus/p/12199043.html
Copyright © 2011-2022 走看看