一、准备工作
1、使用tornado部署后端服务
架构图:
1、config.py 配置端口
2、application配置路由
3、添加Handler处理响应请求
4、启动服务
打开server.py,右击选择Run ‘server’。
2、使用nuxt开发前端页面
1、使用脚手架create-nuxt-app创建项目
# Scaffold a project named "test" with create-nuxt-app, enter it and run its "dev" script.
npx create-nuxt-app test
cd test
npm run dev
2、目录结构
3、修改nuxt.config.js扩展路由
// Register an extra route named 'index.html' so that the path /index.html renders pages/index.vue.
router: { extendRoutes(routes, resolve){ routes.push({ name: 'index.html', path: '/index.html', component: resolve(__dirname, 'pages/index.vue'), }); } }
4、修改pages/index.vue文件
使用asyncData方法在页面组件加载之前调用接口获取数据,模板进行展示,如下:
5、编译部署
# From the project directory, run the "build" script and then the "start" script.
cd test
npm run build
npm run start
6、页面展示
至此准备工作结束,接下来我们爬取静态数据。
3、使用puppeteer爬取网页
1、获取HTML内容
/**
 * Navigate the given page to a URL and return the rendered HTML.
 *
 * Puppeteer `waitUntil` values:
 *   load             - the page's load event has fired
 *   domcontentloaded - the page's DOMContentLoaded event has fired
 *   networkidle0     - no network connections for at least 500 ms
 *   networkidle2     - no more than 2 network connections for at least 500 ms
 *
 * @param {string} url - Address to open.
 * @param {object} page - Page object exposing `goto` and `content`.
 * @returns {Promise<string>} HTML content reported by the page.
 */
async function fetch(url, page) {
  const navigation = { waitUntil: 'networkidle0' };
  await page.goto(url, navigation);
  return page.content();
}
2、提取css、js、img内容
解析HTML属于CPU计算,不必异步处理
/**
 * Resolve a resource reference found in a page into the absolute URL to
 * download and the local file it should be saved to.
 *
 * @param {URL} base - Parsed URL of the page that contains the reference.
 * @param {string} src - Raw attribute value (`href` or `src`).
 * @returns {{filename: string, urlpath: string} | null} The resource, or null
 *   when the reference is empty, points to another site, or cannot be parsed.
 */
function resolve_resource(base, src) {
  if (!src || src.startsWith('#')) {
    return null;
  }
  // Absolute http(s) and protocol-relative references are not mirrored.
  if (/^(https?:)?\/\//i.test(src)) {
    return null;
  }
  let resolved;
  try {
    // Let the URL parser handle "./", "../" and root-relative paths; manual
    // string concatenation produced "host//dir/file" style addresses.
    resolved = new URL(src, base);
  } catch (err) {
    return null; // not a usable reference
  }
  // Anything that is not on the page's own origin (data:, mailto:, ...) is skipped.
  if (resolved.origin !== base.origin) {
    return null;
  }
  resolved.hash = '';
  const urlpath = resolved.href;
  return { filename: url2path(urlpath), urlpath: urlpath };
}

/**
 * Queue a resource unless it has already been saved or is already queued.
 *
 * @param {Array<{filename: string}>} seen - Resources already written.
 * @param {Array<{filename: string}>} waiting - Resources waiting for download.
 * @param {{filename: string, urlpath: string}} resource - Candidate resource.
 */
function enqueue(seen, waiting, resource) {
  const sameFile = (item) => item.filename === resource.filename;
  if (!seen.some(sameFile) && !waiting.some(sameFile)) {
    waiting.push(resource);
  }
}

/**
 * Collect the same-site css, js and image references of a page and append
 * them to the waitting_stylesheets / waitting_scripts / waitting_images queues.
 * Parsing is CPU-bound, so the function is synchronous.
 *
 * @param {string} url - Address of the page the HTML was loaded from.
 * @param {string} html - HTML source of the page.
 */
function extract_urls(url, html) {
  const base = new URL(url);
  const $ = cheerio.load(html);

  for (const stylesheet of $('link')) {
    const resource = resolve_resource(base, stylesheet.attribs.href || '');
    if (resource) {
      enqueue(seen_stylesheets, waitting_stylesheets, resource);
    }
  }

  for (const script of $('script')) {
    const resource = resolve_resource(base, script.attribs.src || '');
    if (resource) {
      enqueue(seen_scripts, waitting_scripts, resource);
    }
  }

  for (const image of $('img')) {
    const src = image.attribs.src || '';
    // Only .jpg / .png images are mirrored; the dot must be escaped so that
    // names such as "xjpg" do not match.
    if (!/\.(jpg|png)$/.test(src)) {
      continue;
    }
    const resource = resolve_resource(base, src);
    if (resource) {
      enqueue(seen_images, waitting_images, resource);
    }
  }
}
3、保存下载
使用Promise封装了异步写入,判断目录是否存在递归创建。
/**
 * Ensure that a directory exists, creating missing parent directories as needed.
 *
 * @param {string} filepath - Directory path to create.
 * @returns {boolean} Always true once the path exists.
 * @throws {Error} Any error raised by fs.mkdirSync (e.g. a parent is a regular file).
 */
function mkdirs(filepath) {
  if (fs.existsSync(filepath)) {
    return true;
  }
  // `recursive: true` creates the whole chain of missing directories.
  fs.mkdirSync(filepath, { recursive: true });
  return true;
}

/**
 * Write data to a file, creating its parent directory first.
 *
 * @param {string} filename - Destination file path.
 * @param {string|Buffer} data - Content to write.
 * @returns {Promise<string>} Resolves with `filename` after the write finishes;
 *   rejects with the underlying error when the directory or file cannot be written.
 */
function write(filename, data) {
  return new Promise((resolve, reject) => {
    mkdirs(path.dirname(filename));
    fs.writeFile(filename, data, (err) => {
      if (err) {
        reject(err);
        return; // do not fall through to resolve after a failure
      }
      resolve(filename);
    });
  });
}
4、完整代码
'use strict';

const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');
const path = require('path');

// Resources that have already been written to disk.
const seen_htmls = [];
const seen_scripts = [];
const seen_stylesheets = [];
const seen_images = [];

// Work queues: pages to crawl and the resources discovered on them.
let waitting_htmls = ["http://localhost:3000/index.html"];
let waitting_scripts = [];
let waitting_stylesheets = [];
let waitting_images = [];

/**
 * Navigate the page to a URL and return the rendered HTML.
 *
 * Puppeteer `waitUntil` values:
 *   load             - the page's load event has fired
 *   domcontentloaded - the page's DOMContentLoaded event has fired
 *   networkidle0     - no network connections for at least 500 ms
 *   networkidle2     - no more than 2 network connections for at least 500 ms
 *
 * @param {string} url - Address to open.
 * @param {object} page - Puppeteer page.
 * @returns {Promise<string>} HTML content of the page.
 */
async function fetch(url, page) {
  await page.goto(url, { waitUntil: 'networkidle0' });
  return page.content();
}

/**
 * Map a URL to a local path under ./dist/<hostname>/.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Absolute local file path.
 */
function url2path(url) {
  const parsed = new URL(url);
  const segments = parsed.pathname.split('/');
  // A URL ending in "/" names a directory; store its content as index.html
  // instead of trying to write a file over the directory itself.
  if (parsed.pathname.endsWith('/')) {
    segments.push('index.html');
  }
  return path.resolve(__dirname, './dist', parsed.hostname, ...segments);
}

/**
 * Resolve a resource reference found in a page into the absolute URL to
 * download and the local file it should be saved to.
 *
 * @param {URL} base - Parsed URL of the page that contains the reference.
 * @param {string} src - Raw attribute value (`href` or `src`).
 * @returns {{filename: string, urlpath: string} | null} The resource, or null
 *   when the reference is empty, points to another site, or cannot be parsed.
 */
function resolve_resource(base, src) {
  if (!src || src.startsWith('#')) {
    return null;
  }
  // Absolute http(s) and protocol-relative references are not mirrored.
  if (/^(https?:)?\/\//i.test(src)) {
    return null;
  }
  let resolved;
  try {
    // Let the URL parser handle "./", "../" and root-relative paths; manual
    // string concatenation produced "host//dir/file" style addresses.
    resolved = new URL(src, base);
  } catch (err) {
    return null; // not a usable reference
  }
  // Anything that is not on the page's own origin (data:, mailto:, ...) is skipped.
  if (resolved.origin !== base.origin) {
    return null;
  }
  resolved.hash = '';
  const urlpath = resolved.href;
  return { filename: url2path(urlpath), urlpath: urlpath };
}

/**
 * Queue a resource unless it has already been saved or is already queued.
 *
 * @param {Array<{filename: string}>} seen - Resources already written.
 * @param {Array<{filename: string}>} waiting - Resources waiting for download.
 * @param {{filename: string, urlpath: string}} resource - Candidate resource.
 */
function enqueue(seen, waiting, resource) {
  const sameFile = (item) => item.filename === resource.filename;
  if (!seen.some(sameFile) && !waiting.some(sameFile)) {
    waiting.push(resource);
  }
}

/**
 * Collect the same-site css, js and image references of a page and append
 * them to the waiting queues. Parsing is CPU-bound, so this is synchronous.
 *
 * @param {string} url - Address of the page the HTML was loaded from.
 * @param {string} html - HTML source of the page.
 */
function extract_urls(url, html) {
  const base = new URL(url);
  const $ = cheerio.load(html);

  for (const stylesheet of $('link')) {
    const resource = resolve_resource(base, stylesheet.attribs.href || '');
    if (resource) {
      enqueue(seen_stylesheets, waitting_stylesheets, resource);
    }
  }

  for (const script of $('script')) {
    const resource = resolve_resource(base, script.attribs.src || '');
    if (resource) {
      enqueue(seen_scripts, waitting_scripts, resource);
    }
  }

  for (const image of $('img')) {
    const src = image.attribs.src || '';
    // Only .jpg / .png images are mirrored; the dot must be escaped so that
    // names such as "xjpg" do not match.
    if (!/\.(jpg|png)$/.test(src)) {
      continue;
    }
    const resource = resolve_resource(base, src);
    if (resource) {
      enqueue(seen_images, waitting_images, resource);
    }
  }
}

/**
 * Ensure that a directory exists, creating missing parent directories as needed.
 *
 * @param {string} filepath - Directory path to create.
 * @returns {boolean} Always true once the path exists.
 */
function mkdirs(filepath) {
  if (fs.existsSync(filepath)) {
    return true;
  }
  fs.mkdirSync(filepath, { recursive: true });
  return true;
}

/**
 * Write data to a file, creating its parent directory first.
 *
 * @param {string} filename - Destination file path.
 * @param {string|Buffer} data - Content to write.
 * @returns {Promise<string>} Resolves with `filename`; rejects on I/O errors.
 */
function write(filename, data) {
  return new Promise((resolve, reject) => {
    mkdirs(path.dirname(filename));
    fs.writeFile(filename, data, (err) => {
      if (err) {
        reject(err);
        return; // do not fall through to resolve after a failure
      }
      resolve(filename);
    });
  });
}

/**
 * Download one queued resource and save it. Failures are logged and do not
 * abort the crawl.
 *
 * @param {{filename: string, urlpath: string}} resource - Resource to fetch.
 * @param {Array<object>} seen - List that records successfully saved resources.
 * @returns {Promise<void>}
 */
async function download(resource, seen) {
  try {
    // Always fetch raw bytes: <link> may point at binary files (e.g. favicon.ico)
    // and the default response type would decode or JSON-parse the body.
    const res = await axios.get(resource.urlpath, { responseType: 'arraybuffer' });
    const data = res && res.status === 200 ? res.data : '';
    const filename = await write(resource.filename, data);
    console.log(`${filename} 写入完成`);
    seen.push(Object.assign({}, resource));
  } catch (err) {
    console.log(err);
  }
}

(async () => {
  const browser = await puppeteer.launch({headless: false});
  try {
    const page = await browser.newPage();

    for (const start_url of waitting_htmls) {
      const html = await fetch(start_url, page);

      // html: wait for the write so the browser is not closed mid-flight
      try {
        const filename = await write(url2path(start_url), html);
        console.log(`${filename} 写入完成`);
        seen_htmls.push(start_url);
      } catch (err) {
        console.log(err);
      }

      // parse html and fill the resource queues
      extract_urls(start_url, html);

      // css
      for (const stylesheet of waitting_stylesheets) {
        await download(stylesheet, seen_stylesheets);
      }
      waitting_stylesheets = [];

      // js
      for (const script of waitting_scripts) {
        await download(script, seen_scripts);
      }
      waitting_scripts = [];

      // image
      for (const image of waitting_images) {
        await download(image, seen_images);
      }
      waitting_images = [];
    }
    waitting_htmls = [];

    await page.close();
  } finally {
    // Release the browser even when navigation or parsing throws.
    await browser.close();
  }
})().catch((err) => {
  console.log(err);
  process.exitCode = 1;
});
5、执行展示
成功抓取到所有静态资源文件,就差最后部署。
4、使用express进行部署
const express = require('express'); const app = express(); // app.use(express.static('dist/app2.jg.eastmoney.com')); app.use(express.static('dist/localhost')); app.listen(3001);
使用node server.js启动express,打开http://localhost:3001。由于页面已经被静态化,网页不会再向后端请求http://localhost:8089/api/test,从而降低服务器压力。