zoukankan      html  css  js  c++  java
  • 关于动态页面静态化的技术探索

    一、准备工作

    1、使用tornado部署后端服务

    架构图:

    1、config.py 配置端口

    2、application配置路由

    3、添加Handler处理响应请求

    4、启动服务

    打开server.py,右击选择Run ‘server’。

    2、使用nuxt开发前端页面

    1、使用脚手架create-nuxt-app创建项目

    npx create-nuxt-app test

    cd test

    npm run dev

    2、目录结构

    3、修改nuxt.config.js扩展路由

      // Router section of the Nuxt config: registers one extra route on top of
      // the routes passed in as `routes`.
      router: {
        // routes:  route list to extend (mutated in place via push).
        // resolve: helper used here to build the component path from __dirname.
        extendRoutes(routes, resolve){
          // Serve pages/index.vue at the explicit path /index.html as well.
          routes.push({
            name: 'index.html',
            path: '/index.html',
            component: resolve(__dirname, 'pages/index.vue'),
          });
        }
      }

    4、修改pages/index.vue文件

    使用asyncData方法在页面组件加载之前调用接口获取数据,模板进行展示,如下:

    5、编译部署

    cd test

    npm run build

    npm run start

    6、页面展示

    准备工作结束,接下来我们爬取静态数据。

    3、使用puppeteer爬取网页

    1、获取HTML内容

    /**
     * Load a page in the given browser tab and return its rendered HTML.
     *
     * @param {string} url  Address to navigate to.
     * @param {object} page Browser page exposing goto() and content().
     * @returns {Promise<string>} Serialized HTML of the page after navigation.
     */
    async function fetch(url, page){
      // waitUntil choices:
      //   load             - the page's load event has fired
      //   domcontentloaded - the page's DOMContentLoaded event has fired
      //   networkidle0     - no network connections for at least 500 ms
      //   networkidle2     - no more than 2 network connections for at least 500 ms
      const navigation = { waitUntil: 'networkidle0' };
      await page.goto(url, navigation);
      return page.content();
    }

    2、提取css、js、img内容

    解析HTML属于CPU计算,不必异步处理

    /**
     * Queue the same-site CSS, JS and image files referenced by a page.
     *
     * Reads every <link href>, <script src> and <img src> (images: .jpg/.png
     * only) from `html`, resolves each reference against `url`, and appends a
     * `{ filename, urlpath }` entry to waitting_stylesheets, waitting_scripts or
     * waitting_images. References that are absolute (http://, https://),
     * protocol-relative (//host/...) or that resolve outside the page's origin
     * are skipped. A file already present in the matching seen_* list or
     * already queued is not queued again.
     *
     * @param {string} url  Address the HTML was fetched from; base for relative references.
     * @param {string} html Markup to scan.
     */
    function extract_urls(url, html){
      const base = new URL(url);
      const $ = cheerio.load(html);

      // The three asset kinds differ only in the attribute read, the lists
      // used and an optional filter, so one routine serves all of them.
      const collect = (elements, attribute, seen, waiting, accepts = () => true) => {
        for(const element of elements){
          const src = element.attribs[attribute] || '';
          if(!src || !accepts(src)) continue;
          // Absolute and protocol-relative references are external resources.
          if(/^(https?:)?\/\//i.test(src)) continue;
          // Let the URL parser resolve relative references; string
          // concatenation produced "//" in the download address.
          const resolved = new URL(src, base);
          // data:, blob: and similar schemes do not name a file on this site.
          if(resolved.origin !== base.origin) continue;
          const urlpath = resolved.href;
          const filename = url2path(urlpath);
          const same_file = (entry) => entry.filename === filename;
          if(seen.some(same_file) || waiting.some(same_file)) continue;
          waiting.push({ filename, urlpath });
        }
      };

      collect($('link'), 'href', seen_stylesheets, waitting_stylesheets);
      collect($('script'), 'src', seen_scripts, waitting_scripts);
      // The dot is escaped so only a real ".jpg"/".png" suffix qualifies.
      collect($('img'), 'src', seen_images, waitting_images, (src) => /\.(jpg|png)$/.test(src));
    }

    3、保存下载

    使用Promise封装异步写入;写入前判断目录是否存在,不存在则递归创建。

    /**
     * Ensure a directory exists, creating any missing parent directories.
     *
     * @param {string} filepath Directory path to create.
     * @returns {boolean} Always true; file-system errors are thrown by fs.mkdirSync.
     */
    function mkdirs(filepath) {
      if (!fs.existsSync(filepath)) {
        // recursive: true builds the whole chain in one call and does not fail
        // if the directory appears between the check above and this call.
        fs.mkdirSync(filepath, { recursive: true });
      }
      return true;
    }
    
    /**
     * Write data to a file, creating the parent directory first.
     *
     * @param {string} filename Destination file path.
     * @param {string|Buffer} data Content passed to fs.writeFile.
     * @returns {Promise<string>} Resolves with `filename` once the write has
     *   finished; rejects with the error from mkdirs or fs.writeFile.
     */
    function write(filename, data) {
      return new Promise((resolve, reject) => {
        // An exception thrown here by mkdirs becomes a rejection of the promise.
        mkdirs(path.dirname(filename));
        fs.writeFile(filename, data, (err) => {
          if (err) {
            reject(err);
            // Stop here so a failed write never reaches resolve().
            return;
          }
          resolve(filename);
        });
      });
    }

    4、完整代码

    'use strict';
    const puppeteer = require('puppeteer');
    const cheerio = require('cheerio');
    const axios = require('axios');
    const fs = require('fs');
    const path = require('path');
    
    // Resources already written to disk. seen_htmls holds page URLs; the other
    // three hold { filename, urlpath } records and are consulted by
    // extract_urls to avoid queueing the same file again.
    let seen_htmls = [];
    let seen_scripts = [];
    let seen_stylesheets = [];
    let seen_images = [];
    
    // Resources still to be downloaded. waitting_htmls is the list of start
    // pages; the other three are filled by extract_urls and emptied by the
    // main routine after each page has been processed.
    let waitting_htmls = ["http://localhost:3000/index.html"];
    let waitting_scripts = [];
    let waitting_stylesheets = [];
    let waitting_images = [];
    
    /**
     * Load a page in the given browser tab and return its rendered HTML.
     *
     * @param {string} url  Address to navigate to.
     * @param {object} page Browser page exposing goto() and content().
     * @returns {Promise<string>} Serialized HTML of the page after navigation.
     */
    async function fetch(url, page){
      // waitUntil choices:
      //   load             - the page's load event has fired
      //   domcontentloaded - the page's DOMContentLoaded event has fired
      //   networkidle0     - no network connections for at least 500 ms
      //   networkidle2     - no more than 2 network connections for at least 500 ms
      const navigation = { waitUntil: 'networkidle0' };
      await page.goto(url, navigation);
      return page.content();
    }
    
    /**
     * Map a URL to the local file path under which its content is stored.
     *
     * The result is <__dirname>/dist/<hostname>/<path segments>; the port and
     * query string are not part of the path. A URL whose path ends in "/"
     * names a directory, so it is stored as index.html inside that directory
     * rather than as a file that would block the directory from being created.
     *
     * @param {string} url Absolute URL.
     * @returns {string} Absolute local file path.
     */
    function url2path(url){
      const { hostname, pathname } = new URL(url);
      const segments = pathname.split('/');
      if(pathname.endsWith('/')){
        segments.push('index.html');
      }
      // Empty segments produced by split('/') are ignored by path.resolve.
      return path.resolve(__dirname, './dist', hostname, ...segments);
    }
    /**
     * Queue the same-site CSS, JS and image files referenced by a page.
     *
     * Reads every <link href>, <script src> and <img src> (images: .jpg/.png
     * only) from `html`, resolves each reference against `url`, and appends a
     * `{ filename, urlpath }` entry to waitting_stylesheets, waitting_scripts or
     * waitting_images. References that are absolute (http://, https://),
     * protocol-relative (//host/...) or that resolve outside the page's origin
     * are skipped. A file already present in the matching seen_* list or
     * already queued is not queued again.
     *
     * @param {string} url  Address the HTML was fetched from; base for relative references.
     * @param {string} html Markup to scan.
     */
    function extract_urls(url, html){
      const base = new URL(url);
      const $ = cheerio.load(html);

      // The three asset kinds differ only in the attribute read, the lists
      // used and an optional filter, so one routine serves all of them.
      const collect = (elements, attribute, seen, waiting, accepts = () => true) => {
        for(const element of elements){
          const src = element.attribs[attribute] || '';
          if(!src || !accepts(src)) continue;
          // Absolute and protocol-relative references are external resources.
          if(/^(https?:)?\/\//i.test(src)) continue;
          // Let the URL parser resolve relative references; string
          // concatenation produced "//" in the download address.
          const resolved = new URL(src, base);
          // data:, blob: and similar schemes do not name a file on this site.
          if(resolved.origin !== base.origin) continue;
          const urlpath = resolved.href;
          const filename = url2path(urlpath);
          const same_file = (entry) => entry.filename === filename;
          if(seen.some(same_file) || waiting.some(same_file)) continue;
          waiting.push({ filename, urlpath });
        }
      };

      collect($('link'), 'href', seen_stylesheets, waitting_stylesheets);
      collect($('script'), 'src', seen_scripts, waitting_scripts);
      // The dot is escaped so only a real ".jpg"/".png" suffix qualifies.
      collect($('img'), 'src', seen_images, waitting_images, (src) => /\.(jpg|png)$/.test(src));
    }
    
    /**
     * Ensure a directory exists, creating any missing parent directories.
     *
     * @param {string} filepath Directory path to create.
     * @returns {boolean} Always true; file-system errors are thrown by fs.mkdirSync.
     */
    function mkdirs(filepath) {
      if (!fs.existsSync(filepath)) {
        // recursive: true builds the whole chain in one call and does not fail
        // if the directory appears between the check above and this call.
        fs.mkdirSync(filepath, { recursive: true });
      }
      return true;
    }
    
    /**
     * Write data to a file, creating the parent directory first.
     *
     * @param {string} filename Destination file path.
     * @param {string|Buffer} data Content passed to fs.writeFile.
     * @returns {Promise<string>} Resolves with `filename` once the write has
     *   finished; rejects with the error from mkdirs or fs.writeFile.
     */
    function write(filename, data) {
      return new Promise((resolve, reject) => {
        // An exception thrown here by mkdirs becomes a rejection of the promise.
        mkdirs(path.dirname(filename));
        fs.writeFile(filename, data, (err) => {
          if (err) {
            reject(err);
            // Stop here so a failed write never reaches resolve().
            return;
          }
          resolve(filename);
        });
      });
    }
    
    // Entry point: render each start page in a browser, save its HTML, then
    // download the stylesheets, scripts and images that page references.
    (async () => {
      /**
       * Download every queued asset, write it to disk and record it as seen.
       * Each write is awaited so the seen list is current before the next page
       * is scanned. A failure on one asset is logged and the rest still run.
       *
       * @param {Array<{filename: string, urlpath: string}>} waiting Assets to fetch.
       * @param {Array<{filename: string, urlpath: string}>} seen List that receives a copy of each saved asset.
       * @param {object} [request_options] Extra axios request options.
       */
      const download_all = async (waiting, seen, request_options = {}) => {
        for(const asset of waiting){
          try{
            let resource = '';
            const res = await axios.get(asset.urlpath, request_options);
            if(res && res.status === 200){
              resource = res.data;
            }
            const filename = await write(asset.filename, resource);
            console.log(filename + ' 写入完成');
            seen.push(Object.assign({}, asset));
          }
          catch(err){
            console.log(err);
          }
        }
      };

      const browser = await puppeteer.launch({headless: false});
      try{
        const page = await browser.newPage();
        for(let start_url of waitting_htmls){
          let html = await fetch(start_url, page);
          // html
          try{
            const filename = await write(url2path(start_url), html);
            console.log(filename + ' 写入完成');
            seen_htmls.push(start_url);
          }
          catch(err){
            console.log(err);
          }
          // Parse the page; this fills the three waitting_* queues.
          extract_urls(start_url, html);
          // css
          await download_all(waitting_stylesheets, seen_stylesheets);
          waitting_stylesheets = [];
          // js
          await download_all(waitting_scripts, seen_scripts);
          waitting_scripts = [];
          // image: fetched as raw bytes so binary data is written unmodified
          await download_all(waitting_images, seen_images, {
            responseType: 'arraybuffer'
          });
          waitting_images = [];
        }
        waitting_htmls = [];
        await page.close();
      }
      catch(err){
        // A navigation or browser failure ends the crawl; report it and signal
        // the failure through the exit code instead of an unhandled rejection.
        console.log(err);
        process.exitCode = 1;
      }
      finally{
        // Always release the browser, even when the crawl failed part-way.
        await browser.close();
      }
    })();

    5、执行展示

    成功抓取到所有静态资源文件,就差最后部署。

    4、使用express进行部署

    // Static file server for the crawled snapshot.
    const express = require('express');

    // Folder the crawler writes for http://localhost:... pages. A crawl of
    // another host ends up in its own folder, e.g. 'dist/app2.jg.eastmoney.com'.
    const static_root = 'dist/localhost';
    const port = 3001;

    const app = express();
    const serve_snapshot = express.static(static_root);
    app.use(serve_snapshot);

    app.listen(port);

    使用node server.js启动express,打开http://localhost:3001,网页不会向后端请求http://localhost:8089/api/test,降低服务器压力。

  • 相关阅读:
    MapReduce -- 统计天气信息
    设计模式--策略模式
    基于物品的协同过滤(二)
    Hadoop应用开发,常见错误
    基于物品的协同过滤(一)
    基于用户的协同过滤
    MapReduce开发程序,运行环境配置
    3DES加解密 C语言
    js获取对象位置的方法
    icheck.js的一个简单demo
  • 原文地址:https://www.cnblogs.com/yejing-snake/p/14277189.html
Copyright © 2011-2022 走看看