zoukankan      html  css  js  c++  java
  • 关于动态页面静态化的技术探索

    一、准备工作

    1、使用tornado部署后端服务

    架构图:

    1、config.py 配置端口

    2、application配置路由

    3、添加Handler处理响应请求

    4、启动服务

    打开server.py,右击选择Run ‘server’。

    2、使用nuxt开发前端页面

    1、使用脚手架create-nuxt-app创建项目

    npx create-nuxt-app test

    cd test

    npm run dev

    2、目录结构

    3、修改nuxt.config.js扩展路由

      // Router section of the Nuxt config: adds one hand-written route on top
      // of the routes already present in `routes`.
      router: {
        // `routes` is the route list to extend; `resolve` turns path parts into
        // the component location stored on the new route.
        extendRoutes(routes, resolve){
          // Expose pages/index.vue under the literal path /index.html.
          routes.push({
            name: 'index.html',
            path: '/index.html',
            component: resolve(__dirname, 'pages/index.vue'),
          });
        }
      }

    4、修改pages/index.vue文件

    使用asyncData方法在页面组件加载之前调用接口获取数据,模板进行展示,如下:

    5、编译部署

    cd test

    npm run build

    npm run start

    6、页面展示

    前期准备工作结束,接下来我们爬取静态数据。

    3、使用puppeteer爬取网页

    1、获取HTML内容

    /**
     * Open `url` in `page` and hand back the rendered document as HTML text.
     *
     * @param {string} url - Address to navigate to.
     * @param {object} page - Object exposing async `goto(url, options)` and
     *   `content()`; the crawler passes a Puppeteer page.
     * @returns {Promise<string>} The value `page.content()` resolves to.
     */
    async function fetch(url, page){
      // Possible `waitUntil` values:
      //   load             - the page's load event has fired
      //   domcontentloaded - the page's DOMContentLoaded event has fired
      //   networkidle0     - no network connections for at least 500 ms
      //   networkidle2     - no more than 2 network connections for at least 500 ms
      const navigation = { waitUntil: 'networkidle0' };
      await page.goto(url, navigation);
      const markup = await page.content();
      return markup;
    }

    2、提取css、js、img内容

    解析HTML属于CPU计算,不必异步处理

    /**
     * Scan a rendered page for same-site stylesheets, scripts and images and
     * queue the ones that still need downloading.
     *
     * Reads `href` from every <link> and `src` from every <script> and <img>
     * (images only when the value ends in .jpg or .png). Each kept reference is
     * resolved against the page URL and pushed as `{ filename, urlpath }` onto
     * waitting_stylesheets / waitting_scripts / waitting_images, unless an entry
     * with the same filename is already in the matching seen_* or waitting_* list.
     *
     * @param {string} url - Absolute URL the HTML was loaded from.
     * @param {string} html - Markup of that page.
     * @returns {void}
     */
    function extract_urls(url, html){
      const base = new URL(url);
      const $ = cheerio.load(html);

      // Turn a raw attribute value into { filename, urlpath }, or null to skip it.
      const resolve_asset = (src) => {
        if(!src) return null;
        // References that spell out an http(s) URL are not mirrored.
        if(/^https?:\/\//i.test(src)) return null;
        let target;
        try{
          // Standard URL resolution handles "/abs", "rel", "./rel" and "../rel"
          // without producing doubled slashes.
          target = new URL(src, base);
        }
        catch(err){
          // Unparseable reference: nothing to download.
          return null;
        }
        // Protocol-relative ("//cdn...") and data:/blob: references point
        // outside the crawled site.
        if(target.origin !== base.origin) return null;
        return { filename: url2path(target.href), urlpath: target.href };
      };

      // Queue an asset unless the same local file is already downloaded or pending.
      const enqueue = (asset, seen, waiting) => {
        if(!asset) return;
        const same_file = (entry) => entry.filename === asset.filename;
        if(seen.some(same_file) || waiting.some(same_file)) return;
        waiting.push(asset);
      };

      for(const stylesheet of $('link')){
        enqueue(resolve_asset(stylesheet.attribs.href), seen_stylesheets, waitting_stylesheets);
      }

      for(const script of $('script')){
        enqueue(resolve_asset(script.attribs.src), seen_scripts, waitting_scripts);
      }

      for(const image of $('img')){
        const src = image.attribs.src || '';
        // Only .jpg/.png images are mirrored; the dot is matched literally.
        if(!/\.(jpg|png)$/.test(src)) continue;
        enqueue(resolve_asset(src), seen_images, waitting_images);
      }
    }

    3、保存下载

    使用Promise封装异步写入;写入前先判断目录是否存在,不存在则递归创建。

    /**
     * Make sure the directory `filepath` exists, creating missing ancestors.
     *
     * @param {string} filepath - Directory path to create.
     * @returns {boolean} Always `true`; creation failures surface as exceptions
     *   thrown by `fs.mkdirSync`.
     */
    function mkdirs(filepath) {
      // An already existing path is left untouched.
      if (fs.existsSync(filepath)) {
        return true;
      }
      // `recursive: true` creates every missing ancestor in a single call, so
      // no hand-written recursion (and no missing return value) is needed.
      fs.mkdirSync(filepath, { recursive: true });
      return true;
    }
    
    /**
     * Write `data` to `filename`, creating the parent directory first.
     *
     * @param {string} filename - Destination file path.
     * @param {string|Buffer} data - Content handed to `fs.writeFile`.
     * @returns {Promise<string>} Resolves with `filename` once the file is
     *   written; rejects with the error from `mkdirs` or `fs.writeFile`.
     */
    function write(filename, data) {
      return new Promise((resolve, reject) => {
        // A throw here (directory creation failed) rejects the promise.
        mkdirs(path.dirname(filename));
        fs.writeFile(filename, data, (err) => {
          if (err) {
            reject(err);
            // Stop here so the failure path never reaches resolve().
            return;
          }
          resolve(filename);
        });
      });
    }

    4、完整代码

    'use strict';
    const puppeteer = require('puppeteer');
    const cheerio = require('cheerio');
    const axios = require('axios');
    const fs = require('fs');
    const path = require('path');
    
    // Resources already written to disk. seen_htmls holds page URLs; the other
    // three hold { filename, urlpath } records and are consulted by
    // extract_urls to avoid queueing the same file again.
    let seen_htmls = [];
    let seen_scripts = [];
    let seen_stylesheets = [];
    let seen_images = [];
    
    // Work queues. waitting_htmls is seeded with the page(s) to crawl; the
    // asset queues are filled by extract_urls and reset to [] by the main loop
    // after each page (hence `let`, not `const`).
    let waitting_htmls = ["http://localhost:3000/index.html"];
    let waitting_scripts = [];
    let waitting_stylesheets = [];
    let waitting_images = [];
    
    /**
     * Open `url` in `page` and hand back the rendered document as HTML text.
     *
     * @param {string} url - Address to navigate to.
     * @param {object} page - Object exposing async `goto(url, options)` and
     *   `content()`; the crawler passes a Puppeteer page.
     * @returns {Promise<string>} The value `page.content()` resolves to.
     */
    async function fetch(url, page){
      // Possible `waitUntil` values:
      //   load             - the page's load event has fired
      //   domcontentloaded - the page's DOMContentLoaded event has fired
      //   networkidle0     - no network connections for at least 500 ms
      //   networkidle2     - no more than 2 network connections for at least 500 ms
      const navigation = { waitUntil: 'networkidle0' };
      await page.goto(url, navigation);
      const markup = await page.content();
      return markup;
    }
    
    /**
     * Map a URL to the local file it is mirrored to:
     * <script dir>/dist/<hostname>/<path segments>.
     *
     * The port, query string and fragment are not part of the result. A path
     * ending in "/" (including the site root) maps to index.html inside that
     * directory, so the result always names a file rather than a directory.
     *
     * @param {string} url - Absolute URL.
     * @returns {string} Absolute local file path.
     */
    function url2path(url){
      const { hostname, pathname } = new URL(url);
      const segments = pathname.split('/');
      // Without this, "http://host/" would resolve to the dist/<host> directory
      // itself and writing the page there would fail.
      if(pathname.endsWith('/')){
        segments.push('index.html');
      }
      // path.resolve ignores the empty segments produced by split('/').
      return path.resolve(__dirname, './dist', hostname, ...segments);
    }
    /**
     * Scan a rendered page for same-site stylesheets, scripts and images and
     * queue the ones that still need downloading.
     *
     * Reads `href` from every <link> and `src` from every <script> and <img>
     * (images only when the value ends in .jpg or .png). Each kept reference is
     * resolved against the page URL and pushed as `{ filename, urlpath }` onto
     * waitting_stylesheets / waitting_scripts / waitting_images, unless an entry
     * with the same filename is already in the matching seen_* or waitting_* list.
     *
     * @param {string} url - Absolute URL the HTML was loaded from.
     * @param {string} html - Markup of that page.
     * @returns {void}
     */
    function extract_urls(url, html){
      const base = new URL(url);
      const $ = cheerio.load(html);

      // Turn a raw attribute value into { filename, urlpath }, or null to skip it.
      const resolve_asset = (src) => {
        if(!src) return null;
        // References that spell out an http(s) URL are not mirrored.
        if(/^https?:\/\//i.test(src)) return null;
        let target;
        try{
          // Standard URL resolution handles "/abs", "rel", "./rel" and "../rel"
          // without producing doubled slashes.
          target = new URL(src, base);
        }
        catch(err){
          // Unparseable reference: nothing to download.
          return null;
        }
        // Protocol-relative ("//cdn...") and data:/blob: references point
        // outside the crawled site.
        if(target.origin !== base.origin) return null;
        return { filename: url2path(target.href), urlpath: target.href };
      };

      // Queue an asset unless the same local file is already downloaded or pending.
      const enqueue = (asset, seen, waiting) => {
        if(!asset) return;
        const same_file = (entry) => entry.filename === asset.filename;
        if(seen.some(same_file) || waiting.some(same_file)) return;
        waiting.push(asset);
      };

      for(const stylesheet of $('link')){
        enqueue(resolve_asset(stylesheet.attribs.href), seen_stylesheets, waitting_stylesheets);
      }

      for(const script of $('script')){
        enqueue(resolve_asset(script.attribs.src), seen_scripts, waitting_scripts);
      }

      for(const image of $('img')){
        const src = image.attribs.src || '';
        // Only .jpg/.png images are mirrored; the dot is matched literally.
        if(!/\.(jpg|png)$/.test(src)) continue;
        enqueue(resolve_asset(src), seen_images, waitting_images);
      }
    }
    
    /**
     * Make sure the directory `filepath` exists, creating missing ancestors.
     *
     * @param {string} filepath - Directory path to create.
     * @returns {boolean} Always `true`; creation failures surface as exceptions
     *   thrown by `fs.mkdirSync`.
     */
    function mkdirs(filepath) {
      // An already existing path is left untouched.
      if (fs.existsSync(filepath)) {
        return true;
      }
      // `recursive: true` creates every missing ancestor in a single call, so
      // no hand-written recursion (and no missing return value) is needed.
      fs.mkdirSync(filepath, { recursive: true });
      return true;
    }
    
    /**
     * Write `data` to `filename`, creating the parent directory first.
     *
     * @param {string} filename - Destination file path.
     * @param {string|Buffer} data - Content handed to `fs.writeFile`.
     * @returns {Promise<string>} Resolves with `filename` once the file is
     *   written; rejects with the error from `mkdirs` or `fs.writeFile`.
     */
    function write(filename, data) {
      return new Promise((resolve, reject) => {
        // A throw here (directory creation failed) rejects the promise.
        mkdirs(path.dirname(filename));
        fs.writeFile(filename, data, (err) => {
          if (err) {
            reject(err);
            // Stop here so the failure path never reaches resolve().
            return;
          }
          resolve(filename);
        });
      });
    }
    
    /**
     * Crawl entry point. For every URL in waitting_htmls: render it in the
     * browser, save the HTML, collect its asset references with extract_urls,
     * then download and save the queued stylesheets, scripts and images.
     *
     * A failed asset download or file write is logged and skipped. A failure
     * to render a page stops the crawl; the browser is closed either way.
     */
    (async () => {
      // Download every queued asset and save it; on success a copy of the
      // record is added to `seen`. `options` is forwarded to axios.get.
      const download_all = async (waiting, seen, options) => {
        for(const asset of waiting){
          try{
            const res = await axios.get(asset.urlpath, options);
            // Any answer other than 200 is stored as an empty file.
            const resource = res && res.status === 200 ? res.data : '';
            const filename = await write(asset.filename, resource);
            console.log(filename + ' 写入完成');
            seen.push(Object.assign({}, asset));
          }
          catch(err){
            // One broken asset must not abort the rest of the crawl.
            console.log(err);
          }
        }
      };

      const browser = await puppeteer.launch({headless: false});
      try{
        const page = await browser.newPage();
        for(const start_url of waitting_htmls){
          const html = await fetch(start_url, page);
          // html
          try{
            const filename = await write(url2path(start_url), html);
            console.log(filename + ' 写入完成');
            seen_htmls.push(start_url);
          }
          catch(err){
            console.log(err);
          }
          // Parse the page and fill the three asset queues.
          extract_urls(start_url, html);
          // css
          await download_all(waitting_stylesheets, seen_stylesheets);
          waitting_stylesheets = [];
          // js
          await download_all(waitting_scripts, seen_scripts);
          waitting_scripts = [];
          // Images are binary, so request raw bytes instead of decoded text.
          await download_all(waitting_images, seen_images, {
            responseType: 'arraybuffer'
          });
          waitting_images = [];
        }
        waitting_htmls = [];
        await page.close();
      }
      finally{
        // Always release the browser process, even when a step above throws.
        await browser.close();
      }
    })().catch((err) => {
      console.log(err);
      process.exitCode = 1;
    });

    5、执行展示

    成功抓取到所有静态资源文件,就差最后部署。

    4、使用express进行部署

    // Static file server for the crawled snapshot, listening on port 3001.
    const path = require('path');
    const express = require('express');
    
    const app = express();
    
    // The snapshot root is resolved against this script's directory rather
    // than the process working directory, so the server finds the files no
    // matter where `node` is started from. To serve a snapshot of another
    // host, replace 'localhost' with that folder name under dist/
    // (for example 'app2.jg.eastmoney.com').
    app.use(express.static(path.join(__dirname, 'dist', 'localhost')));
    
    app.listen(3001);

    使用node server.js启动express,打开http://localhost:3001,网页不会向后端请求http://localhost:8089/api/test,降低服务器压力。

  • 相关阅读:
    Windows 科研软件推荐
    有关Python 包 (package) 的基本知识
    《Using Python to Access Web Data》Week4 Programs that Surf the Web 课堂笔记
    Coursera助学金申请模板
    《Using Databases with Python》 Week2 Basic Structured Query Language 课堂笔记
    Jupyter 解决单个变量输出问题
    解决 pandas 中打印 DataFrame 行列显示不全的问题
    《Using Python to Access Web Data》 Week3 Networks and Sockets 课堂笔记
    缓存击穿及解决方案
    jvm垃圾收集器
  • 原文地址:https://www.cnblogs.com/yejing-snake/p/14277189.html
Copyright © 2011-2022 走看看