zoukankan      html  css  js  c++  java
  • ui爬虫工具-未完成

    抓取页面的一块ui,将属于ui的html、css分离出来,需要配合浏览器机器人抓取html

    const TinyCss=require('./utils/TinyCss')
    var getCssText = require("./utils/getCssText");
    var getText = require("./utils/getText");
    var setText = require("./utils/setText");
    //ui 爬虫
    /**
     * Demo entry point of the UI crawler.
     *
     * Reads one HTML file and one CSS file, asks TinyCss for the stylesheet
     * reduced to what the `.note-list-wrapper` block needs, and writes the
     * result to `./src/testmin.css`.
     *
     * @returns {Promise<void>} settles once the reduced stylesheet has been handed to setText
     */
    async function init() {
        const htmlText=await getText('./src/test.html');
        const cssText=await getText('./src/test.css');
        // Alternative input: pull the stylesheet of a live page instead of a local file.
        // const cssText=await getCssText('https://cloud.baidu.com/product/bcd/search.html?keyword=%E5%85%AB%E6%88%92%E7%AE%97%E5%91%BD','html')
        const app=new TinyCss([htmlText],cssText);
        const css=app.getTinyAst('.note-list-wrapper').toString();
        // Awaited so a failed write is reported through the same promise
        // (harmless if setText turns out to be synchronous).
        await setText('./src/testmin.css',css);
    }
    // Never leave the promise floating: report the failure and signal it via the exit code.
    init().catch((err)=>{
        console.error(err);
        process.exitCode=1;
    });

    TinyCss.js

    //TinyCss.js
    const Api=require('./Api');
    //解析成语法树
    const compiler = require('vue-template-compiler');
    const postcss  = require('postcss');
    const querySelectorList=require('./querySelectorList')
    
    //构建出一个css语法树和多个html语法树,分析css的使用率。
    /**
     * Reduces a stylesheet to the rules that are actually used by a set of
     * HTML documents (optionally: by one UI block inside those documents).
     *
     * The stylesheet is parsed once with postcss; every HTML source is compiled
     * with vue-template-compiler and matched against the rules through
     * querySelectorList. Unused rules, unused @keyframes and comments are
     * detached from `cssAst` in place.
     */
    class TinyCss{
        /**
         * @param {string[]} htmlTextArr - HTML sources to measure CSS usage against
         * @param {string} cssText - stylesheet source handed to postcss.parse
         */
        constructor(htmlTextArr,cssText){

            // HTML sources; each one is compiled to its own syntax tree on demand
            this.htmlTextArr=htmlTextArr;

            // The single CSS syntax tree
            this.cssAst=postcss.parse(cssText);
            // All style rules except the ones whose parent at-rule name contains
            // "keyframes" (the from/to/percentage steps are not selectors)
            this.cssList=Api.depthSearch(this.cssAst,'nodes').filter((node)=>{
                return node.type==='rule'&&!/keyframes/.test(node.parent.name);
            });

            // Cached outputs
            this.bigMap=null;
            this.map=null;
            this.uiMap=null;
            this.uiMapSelector=null;
            this.data=null;
            this.emptyCss=null;
            this.emptyKeyFrames=null;
        }

        /**
         * Removes the first occurrence of `item` from `arr` in place.
         * Does nothing when `item` is not in the array.
         *
         * @param {*} item - element to remove (compared with ===)
         * @param {Array} arr - array to mutate
         */
        removeObj(item,arr){
            const index=arr.indexOf(item);
            if(index>-1){
                arr.splice(index,1);
            }
        }

        /**
         * Compiles one HTML source and wraps it in a selector matcher bound to
         * this instance's rule list.
         *
         * @param {string} htmlText - HTML source
         * @returns {querySelectorList} matcher for that document
         */
        createMatcher(htmlText){
            const htmlAst=compiler.compile(htmlText).ast;
            return new querySelectorList(htmlAst,this.cssList);
        }

        /**
         * Appends one per-document column to a rule-major matrix.
         *
         * @param {number[][]} map - matrix indexed as map[ruleIndex][documentIndex]
         * @param {number[]} column - one value per rule for the current document
         */
        appendColumn(map,column){
            column.forEach((value,ruleIndex)=>{
                if(!map[ruleIndex]){
                    map[ruleIndex]=[];
                }
                map[ruleIndex].push(value);
            });
        }

        /**
         * Full hit matrices, one per HTML source, exactly as returned by
         * querySelectorList#analysis. Cached after the first call.
         *
         * @returns {number[][][]} bigMap[documentIndex][ruleIndex][nodeIndex]
         */
        getBigMap(){
            if(this.bigMap){
                return this.bigMap;
            }
            const map=this.htmlTextArr.map((htmlText)=>this.createMatcher(htmlText).analysis());
            this.bigMap=map;
            return map;
        }

        /**
         * Hit counts per rule and document. Cached after the first call.
         *
         * @returns {number[][]} map[ruleIndex][documentIndex] = number of matched nodes
         */
        getMap(){
            if(this.map){
                return this.map;
            }
            const map=[];
            for(const htmlText of this.htmlTextArr){
                const counts=this.createMatcher(htmlText).analysis().map((row)=>{
                    // Initial value 0: a document without element nodes yields empty
                    // rows, and reduce() without a seed throws on an empty array.
                    return row.reduce((sum,hit)=>sum+hit,0);
                });
                this.appendColumn(map,counts);
            }
            this.map=map;
            return map;
        }

        /**
         * Hit counts per rule and document, restricted to the nodes selected by
         * `selector` together with what querySelectorAndChild adds to them.
         * The result is cached per selector: asking for a different selector
         * recomputes instead of returning the previous selector's numbers.
         *
         * @param {string} selector - selector of the UI block of interest
         * @returns {number[][]} uiMap[ruleIndex][documentIndex]
         */
        getUiMap(selector){
            if(this.uiMap&&this.uiMapSelector===selector){
                return this.uiMap;
            }
            const map=[];
            for(const htmlText of this.htmlTextArr){
                const matcher=this.createMatcher(htmlText);
                const uiArr=matcher.querySelectorAndChild(selector);
                const counts=matcher.analysis().map((row)=>{
                    // Count only nodes hit by the rule AND inside the UI block
                    return row.filter((hit,k)=>hit===1&&uiArr[k]===1).length;
                });
                this.appendColumn(map,counts);
            }
            this.uiMap=map;
            this.uiMapSelector=selector;
            return map;
        }

        /**
         * Detaches every rule that matches nothing in any document and returns
         * the selectors of the detached rules.
         *
         * Computed once: because the AST is mutated by the first call, later
         * calls return the first result regardless of `selector`.
         *
         * @param {string} [selector] - when given, usage is measured inside that UI block only
         * @returns {string[]} selectors of the removed rules
         */
        getEmptyCss(selector){
            if(this.emptyCss){
                return this.emptyCss;
            }
            const cssList=this.cssList;
            const data=[];
            const map=selector?this.getUiMap(selector):this.getMap();
            for(let i=0;i<map.length;i++){
                // Any count above 0 means the rule is used; all zeros means it is dead
                if(map[i].every((n)=>n===0)){
                    // Detach the rule from the AST
                    this.removeObj(cssList[i],cssList[i].parent.nodes);
                    data.push(cssList[i].selector);
                }
            }

            this.emptyCss=data;
            return data;
        }

        /**
         * Detaches every @keyframes at-rule whose name is not referenced by any
         * remaining `animation*` declaration. Computed once and cached.
         *
         * @returns {string[]} descriptions of the removed at-rules, e.g. "@keyframes spin"
         */
        getEmptyKeyFrames(){
            if(this.emptyKeyFrames){
                return this.emptyKeyFrames;
            }
            const nodes=Api.depthSearch(this.cssAst,'nodes');
            // Tokenise on whitespace AND commas so that a list such as
            // "fade 1s,spin 2s" still exposes both animation names.
            const usedTokens=new Set();
            nodes.forEach((node)=>{
                if(node.type==='decl'&&/animation/.test(node.prop)){
                    node.value.split(/[\s,]+/).forEach((token)=>usedTokens.add(token));
                }
            });
            const delArr=nodes.filter((node)=>{
                return node.type==='atrule'&&/keyframes/.test(node.name)&&!usedTokens.has(node.params);
            });
            const emptyKeyFrames=[];
            delArr.forEach((node)=>{
                // Detach the at-rule from the AST
                this.removeObj(node,node.parent.nodes);
                emptyKeyFrames.push('@'+node.name+' '+node.params);
            });
            this.emptyKeyFrames=emptyKeyFrames;
            return emptyKeyFrames;
        }

        /**
         * Detaches every comment node from the CSS AST.
         */
        removeComment(){
            const commentArr=Api.depthSearch(this.cssAst,'nodes').filter((node)=>node.type==='comment');
            commentArr.forEach((node)=>{
                this.removeObj(node,node.parent.nodes);
            });
        }

        /**
         * Runs the whole reduction (unused rules, unused keyframes, comments)
         * and returns the mutated CSS AST.
         *
         * @param {string} [selector] - restrict usage analysis to this UI block
         * @returns {object} the postcss root held in `cssAst`
         */
        getTinyAst(selector){
            this.getEmptyCss(selector);
            this.getEmptyKeyFrames();
            this.removeComment();

            return this.cssAst;
        }
    }
    module.exports=TinyCss;
    querySelectorList.js
    //querySelectorList.js
    const Api=require('./Api');
    //命中规则
    
    /*css rule矩阵,3*6
    行对应selector['.id','.class1','.class2']
    列对应html节点 ['body','body div','body div div','body div p','body div span','body div span a']
    [
        [0,0,0,0,1,0],
        [0,0,0,0,1,0],
        [0,0,0,0,1,0]
    ]
    */
    /**
     * Matches CSS selectors against the element nodes of one HTML syntax tree.
     *
     * Every query result is a bitmap (array of 0/1) with one entry per element
     * node of `htmlList`; `analysis()` stacks the bitmaps of all rules into a
     * rule x node matrix.
     *
     * Matching is deliberately approximate: attribute selectors only test for
     * the presence of the attribute, pseudo-classes/elements always match, and
     * only the first class of a compound selector is checked. The
     * approximations err on the side of reporting a rule as used.
     */
    class querySelectorList{

        /**
         * @param {object} htmlAst - root of the HTML syntax tree (nodes with `type`, `tag`, `attrsMap`, `parent`)
         * @param {Array<{selector: string}>} cssList - rules whose selectors are analysed
         */
        constructor(htmlAst,cssList){

            // Selector -> matched nodes. Created without a prototype so that a
            // selector text such as "constructor" cannot hit an inherited key.
            this.selectotCache=Object.create(null);

            // HTML syntax tree and the flat list of its element nodes (type 1)
            this.htmlAst=htmlAst;
            this.htmlList=Api.depthSearch(this.htmlAst).filter((node)=>node.type===1);
            // node -> position in htmlList, so bitmap building needs no indexOf scan
            this.htmlIndex=new Map(this.htmlList.map((node,index)=>[node,index]));

            // Rules to analyse
            this.cssList=cssList;
        }

        /**
         * Builds the rule x node hit matrix.
         *
         * @returns {number[][]} one bitmap per entry of `cssList`
         */
        analysis(){
            return this.cssList.map((rule)=>this.querySelector(rule.selector));
        }

        /**
         * Bitmap of the nodes matched by `selector` plus all element nodes
         * inside them.
         *
         * Relies on `htmlList` listing a node's descendants directly after the
         * node, and on Api.depthSearch(node) including the node itself —
         * NOTE(review): both inferred from how the original loop counted; confirm against Api.
         *
         * @param {string} selector - selector (may be a comma separated list)
         * @returns {number[]} bitmap over `htmlList`
         */
        querySelectorAndChild(selector){
            const arr=this.querySelector(selector);
            for(let i=0;i<arr.length;i++){
                if(arr[i]!==1){
                    continue;
                }
                // Size of the subtree rooted at the matched node itself
                const size=Api.depthSearch(this.htmlList[i]).filter((node)=>node.type===1).length;
                const end=Math.min(i+size,arr.length);
                for(let k=i+1;k<end;k++){
                    arr[k]=1;
                }
                // Skip what was just marked; never move backwards
                i=Math.max(i,end-1);
            }
            return arr;
        }

        /**
         * Bitmap for a selector that may be a comma separated list: the union
         * of the bitmaps of its parts.
         *
         * @param {string} selector
         * @returns {number[]} bitmap over `htmlList`
         */
        querySelector(selector){
            if(!/,/.test(selector)){
                return this.queryOneSelector(selector);
            }
            const data=new Array(this.htmlList.length).fill(0);
            selector.split(',').forEach((part)=>{
                this.queryOneSelector(part).forEach((hit,k)=>{
                    // Only ever set bits: a miss of a later part must not erase
                    // the hit of an earlier one.
                    if(hit===1){
                        data[k]=1;
                    }
                });
            });
            return data;
        }

        /**
         * Bitmap for a single (comma free) complex selector.
         *
         * The selector is split into compound selectors and the combinators
         * between them; each prefix is resolved once and memoised in
         * `selectotCache`. An empty selector matches nothing.
         *
         * @param {string} selector
         * @returns {number[]} bitmap over `htmlList`
         */
        queryOneSelector(selector){
            // Trim and collapse whitespace (newlines/tabs would otherwise defeat the splitter)
            const normalized=selector.trim().replace(/\s+/g,' ');

            // [compound, combinator, compound, combinator, ...]. A combinator run
            // is not split when followed by a digit (e.g. "2n+1") or by a pseudo
            // selector (" :hover", " ::before").
            const selectorArr=[];
            const splitter=/(.+?)([ >~+]+(?!\d)(?! *:)|$)/ig;
            let piece=splitter.exec(normalized);
            while(piece!==null){
                selectorArr.push(piece[1],piece[2]);
                piece=splitter.exec(normalized);
            }
            this.selectorArr=selectorArr;

            // Resolve every prefix from left to right, filling the cache
            let preSelector='';
            for(let i=0;i<selectorArr.length;i=i+2){
                const exec=selectorArr[i-1]||'';
                const curSelector=selectorArr[i];

                this.setSelectotCache(preSelector,exec,curSelector);
                preSelector=preSelector+exec+curSelector;
            }

            const arr=new Array(this.htmlList.length).fill(0);
            (this.selectotCache[preSelector]||[]).forEach((node)=>{
                const index=this.htmlIndex.get(node);
                if(index!==undefined){
                    arr[index]=1;
                }
            });
            return arr;
        }

        /**
         * Resolves `preSelector + exec + curSelector` and stores the matched
         * nodes in `selectotCache` (no-op when already cached).
         *
         * @param {string} preSelector - already resolved prefix ('' for the first compound)
         * @param {string} exec - combinator text between prefix and `curSelector`
         * @param {string} curSelector - compound selector to apply
         */
        setSelectotCache(preSelector,exec,curSelector){

            const nextSelector=preSelector+exec+curSelector;
            // Already cached
            if(this.selectotCache[nextSelector]){return;}
            if(!preSelector&&!exec){
                this.selectotCache[curSelector]=this.breadthHit(curSelector,this.htmlAst);
                return;
            }
            const arr=this.selectotCache[preSelector]||[];

            let search=null;
            if(/^ +$/.test(exec)){
                search=(node)=>this.breadthHit(curSelector,node);
            }else if(/^ *> *$/.test(exec)){
                search=(node)=>this.childHit(curSelector,node);
            }else if(/^ *\+ *$/.test(exec)){
                search=(node)=>this.sublingHit(curSelector,node);
            }else if(/^ *~ *$/.test(exec)){
                search=(node)=>this.sublingsHit(curSelector,node);
            }

            if(search===null){
                // Unknown combinator: report it and treat the selector as matching nothing
                this.selectotCache[nextSelector]=[];
                console.log('exec异常:'+exec)
                return;
            }
            this.selectotCache[nextSelector]=arr.reduce((found,node)=>found.concat(search(node)),[]);
        }

        /**
         * css rule `element + element`: matches among the nodes returned by
         * Api.nextSublingSearch.
         * NOTE(review): for a node without parent the node itself is tested —
         * kept from the original; confirm this fallback is intended.
         *
         * @param {string} tag - compound selector
         * @param {object} astNode - node the combinator starts from
         * @returns {object[]} matching element nodes
         */
        sublingHit(tag,astNode){
            const candidates=astNode.parent?Api.nextSublingSearch(astNode,astNode.parent):[astNode];
            return candidates.filter((node)=>node.type===1&&this.hitNode(tag,node));
        }

        /**
         * css rule `element ~ element`: matches among the nodes returned by
         * Api.nextSublingsSearch. A node without parent has no siblings.
         *
         * @param {string} tag - compound selector
         * @param {object} astNode - node the combinator starts from
         * @returns {object[]} matching element nodes
         */
        sublingsHit(tag,astNode){
            if(!astNode.parent){
                return [];
            }
            // Arrow function: the callback needs the instance as `this`
            return Api.nextSublingsSearch(astNode,astNode.parent).filter((node)=>{
                return node.type===1&&this.hitNode(tag,node);
            });
        }

        /**
         * css rule `element element`: matches among the nodes returned by
         * Api.breadthSearch for `astNode`.
         *
         * @param {string} tag - compound selector
         * @param {object} astNode - subtree root
         * @returns {object[]} matching element nodes
         */
        breadthHit(tag,astNode){
            return Api.breadthSearch(astNode).filter((node)=>{
                return node.type===1&&this.hitNode(tag,node);
            });
        }

        /**
         * css rule `element > element`: matches among the nodes returned by
         * Api.childSearch for `astNode`.
         *
         * @param {string} tag - compound selector
         * @param {object} astNode - parent node
         * @returns {object[]} matching element nodes
         */
        childHit(tag,astNode){
            return Api.childSearch(astNode).filter((node)=>{
                return node.type===1&&this.hitNode(tag,node);
            });
        }

        /**
         * Tests one compound selector against one element node.
         *
         * @param {string} selector - compound selector (tag, .class, #id, [attr], :pseudo)
         * @param {object} astNode - element node with `tag` and `attrsMap`
         * @returns {boolean} true when every recognised part of the selector matches
         */
        hitNode(selector,astNode) {
            if(selector==='*'){
                return true;
            }
            if(/:root/.test(selector)){
                return astNode.tag==='html';
            }
            const attrsMap=astNode.attrsMap||{};
            const checks=[];

            // tag (digits and hyphens allowed: h1, my-element)
            const tagMatch=/^([a-z][\w-]*)/i.exec(selector);
            if(tagMatch){
                checks.push(astNode.tag===tagMatch[1]);
            }
            // class (first class of the compound only)
            const classMatch=/\.([\w-]+)/.exec(selector);
            if(classMatch){
                const classNames=attrsMap.class?attrsMap.class.split(/\s+/):[];
                checks.push(classNames.includes(classMatch[1]));
            }
            // id (hyphens allowed: #main-nav)
            const idMatch=/#([\w-]+)/.exec(selector);
            if(idMatch){
                checks.push(attrsMap.id===idMatch[1]);
            }
            // attribute: only the presence of the attribute is tested
            const attrMatch=/\[([\w-]+)(~=|=|\|=)?(\w+)?\]/.exec(selector);
            if(attrMatch){
                checks.push(Object.prototype.hasOwnProperty.call(attrsMap,attrMatch[1]));
            }
            // pseudo-class / pseudo-element: state cannot be known statically, assume a match
            if(/(:.+)/.test(selector)){
                checks.push(true);
            }
            if(checks.length===0){
                console.log(selector,this.selectorArr,'css 解析异常')
            }
            return checks.every((item)=>item);
        }
    }
    module.exports=querySelectorList;
  • 相关阅读:
    2.GO-可变参数函数、匿名函数和函数变量
    1.Go-copy函数、sort排序、双向链表、list操作和双向循环链表
    第四章、Go-面向“对象”
    第三章、Go-内建容器
    第二章、Go-基础语法
    第一章、Go安装与Goland破解
    arthas使用分享
    redis如何后台启动
    java.io.IOException: Could not locate executable nullinwinutils.exe in the Hadoop binaries
    安装redis出现cc adlist.o /bin/sh:1:cc:not found的解决方法
  • 原文地址:https://www.cnblogs.com/caoke/p/11269720.html
Copyright © 2011-2022 走看看