抓取页面中的一块 UI,将属于该 UI 的 HTML、CSS 分离出来;需要配合浏览器机器人抓取 HTML。
// UI crawler entry script: reads one HTML file and one CSS file, keeps only
// the CSS rules that apply to a chosen UI fragment, and writes the result out.
const TinyCss = require('./utils/TinyCss');
const getCssText = require('./utils/getCssText');
const getText = require('./utils/getText');
const setText = require('./utils/setText');

/**
 * Runs the extraction for the bundled sample files.
 *
 * Reads `./src/test.html` and `./src/test.css`, asks TinyCss for the reduced
 * stylesheet of the `.note-list-wrapper` fragment, and writes it to
 * `./src/testmin.css`.
 *
 * @returns {Promise<void>} Settles once the output has been handed to setText.
 */
async function init() {
  // The two reads are independent, so they are issued together.
  const [htmlText, cssText] = await Promise.all([
    getText('./src/test.html'),
    getText('./src/test.css'),
  ]);
  // Alternative CSS source kept from the original script (getCssText is
  // presumably a URL-based loader - confirm against ./utils/getCssText):
  // const cssText = await getCssText('https://cloud.baidu.com/product/bcd/search.html?keyword=%E5%85%AB%E6%88%92%E7%AE%97%E5%91%BD', 'html');

  const app = new TinyCss([htmlText], cssText);
  const css = app.getTinyAst('.note-list-wrapper').toString();

  // Awaited so that a failing write rejects init() instead of being lost;
  // awaiting is harmless if setText turns out to be synchronous.
  await setText('./src/testmin.css', css);
}

// Never leave the promise floating: report the failure and signal it to the shell.
init().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
TinyCss.js
//TinyCss.js
const Api = require('./Api');
// Parsers that turn source text into syntax trees.
const compiler = require('vue-template-compiler');
const postcss = require('postcss');
const querySelectorList = require('./querySelectorList');

/**
 * Builds one CSS syntax tree and matches it against several HTML documents
 * in order to find out which CSS rules are actually used.
 *
 * The CSS tree (`cssAst`) is mutated in place by the `getEmpty*` /
 * `removeComment` methods; `getTinyAst` returns that same, pruned tree.
 */
class TinyCss {
  /**
   * @param {string[]} htmlTextArr HTML sources to test the rules against.
   * @param {string} cssText Stylesheet source; parsed once with postcss.
   */
  constructor(htmlTextArr, cssText) {
    // One HTML source per analysed page.
    this.htmlTextArr = htmlTextArr;
    // A single CSS syntax tree shared by all analyses.
    this.cssAst = postcss.parse(cssText);
    // Every style rule except the steps that live inside @keyframes.
    this.cssList = Api.depthSearch(this.cssAst, 'nodes').filter(
      (node) => node.type === 'rule' && !/keyframes/.test(node.parent.name)
    );
    // Cached results.
    this.bigMap = null;
    this.map = null;
    this.uiMap = null;
    // Selector the cached uiMap was computed for.
    this.uiMapSelector = null;
    this.data = null;
    this.emptyCss = null;
    this.emptyKeyFrames = null;
  }

  /**
   * Removes the first occurrence of `item` from `arr` (in place).
   *
   * @param {*} item Element to remove, compared with `===`.
   * @param {Array} arr Array to remove it from.
   */
  removeObj(item, arr) {
    const index = arr.indexOf(item);
    if (index > -1) {
      arr.splice(index, 1);
    }
  }

  /**
   * Compiles one HTML source and wraps it in a selector matcher bound to
   * this instance's rule list.
   *
   * @param {string} htmlText HTML source.
   * @returns {querySelectorList} Matcher for that document.
   */
  createMatcher(htmlText) {
    const htmlAst = compiler.compile(htmlText).ast;
    return new querySelectorList(htmlAst, this.cssList);
  }

  /**
   * Full hit matrices: one entry per HTML document, each being the result of
   * `querySelectorList#analysis()` (one 0/1 row per CSS rule).
   *
   * @returns {Array} Cached after the first call.
   */
  getBigMap() {
    if (this.bigMap) {
      return this.bigMap;
    }
    const map = this.htmlTextArr.map((htmlText) => this.createMatcher(htmlText).analysis());
    this.bigMap = map;
    return map;
  }

  /**
   * Condensed matrix: `map[ruleIndex][htmlIndex]` is the number of nodes of
   * that document hit by that rule.
   *
   * @returns {number[][]} Cached after the first call.
   */
  getMap() {
    if (this.map) {
      return this.map;
    }
    const map = [];
    for (const htmlText of this.htmlTextArr) {
      // Initial value 0 keeps reduce from throwing on an empty row.
      const counts = this.createMatcher(htmlText)
        .analysis()
        .map((row) => row.reduce((sum, bit) => sum + bit, 0));
      counts.forEach((count, ruleIndex) => {
        if (!map[ruleIndex]) map[ruleIndex] = [];
        map[ruleIndex].push(count);
      });
    }
    this.map = map;
    return map;
  }

  /**
   * Like `getMap`, but only counts hits on nodes that belong to the UI
   * fragment selected by `selector` (as reported by
   * `querySelectorList#querySelectorAndChild`).
   *
   * The result is cached per selector: asking for a different selector
   * recomputes instead of returning the previous fragment's data.
   *
   * @param {string} selector CSS selector of the fragment root(s).
   * @returns {number[][]} `map[ruleIndex][htmlIndex]` hit counts.
   */
  getUiMap(selector) {
    if (this.uiMap && this.uiMapSelector === selector) {
      return this.uiMap;
    }
    const map = [];
    for (const htmlText of this.htmlTextArr) {
      const matcher = this.createMatcher(htmlText);
      const uiArr = matcher.querySelectorAndChild(selector);
      const counts = matcher.analysis().map((row) => {
        let hits = 0;
        for (let k = 0; k < row.length; k++) {
          if (row[k] === 1 && uiArr[k] === 1) {
            hits++;
          }
        }
        return hits;
      });
      counts.forEach((count, ruleIndex) => {
        if (!map[ruleIndex]) map[ruleIndex] = [];
        map[ruleIndex].push(count);
      });
    }
    this.uiMap = map;
    this.uiMapSelector = selector;
    return map;
  }

  /**
   * Removes unused rules from the CSS tree and returns their selectors.
   *
   * A rule is unused when its hit count is 0 for every HTML document. With a
   * `selector`, usage is measured inside that UI fragment only.
   *
   * One-shot: the tree is pruned on the first call and the same list is
   * returned afterwards, whatever `selector` is passed later.
   *
   * @param {string} [selector] Optional fragment selector.
   * @returns {string[]} Selectors of the removed rules.
   */
  getEmptyCss(selector) {
    if (this.emptyCss) {
      return this.emptyCss;
    }
    const cssList = this.cssList;
    const data = [];
    const map = selector ? this.getUiMap(selector) : this.getMap();
    for (let i = 0; i < map.length; i++) {
      // Any count above 0 means the rule is used; all zeros means unused.
      if (map[i].every((n) => n === 0)) {
        // Detach the rule from the tree.
        this.removeObj(cssList[i], cssList[i].parent.nodes);
        data.push(cssList[i].selector);
      }
    }
    this.emptyCss = data;
    return data;
  }

  /**
   * Removes @keyframes blocks that no remaining `animation*` declaration
   * refers to and returns their descriptions (`@name params`).
   *
   * @returns {string[]} Cached after the first call.
   */
  getEmptyKeyFrames() {
    if (this.emptyKeyFrames) {
      return this.emptyKeyFrames;
    }
    const allNodes = Api.depthSearch(this.cssAst, 'nodes');
    const keyframesList = allNodes.filter(
      (node) => node.type === 'atrule' && /keyframes/.test(node.name)
    );
    // Collect every token used in animation declarations. Splitting on
    // whitespace and commas also covers lists such as "a 1s,b 2s".
    const usedTokens = new Set();
    allNodes
      .filter((node) => node.type === 'decl' && /animation/.test(node.prop))
      .forEach((decl) => {
        decl.value.split(/[\s,]+/).forEach((token) => usedTokens.add(token));
      });

    const emptyKeyFrames = [];
    keyframesList
      .filter((node) => !usedTokens.has(node.params))
      .forEach((node) => {
        // Detach the at-rule from the tree.
        this.removeObj(node, node.parent.nodes);
        emptyKeyFrames.push('@' + node.name + ' ' + node.params);
      });
    this.emptyKeyFrames = emptyKeyFrames;
    return emptyKeyFrames;
  }

  /**
   * Removes every comment node from the CSS tree.
   */
  removeComment() {
    Api.depthSearch(this.cssAst, 'nodes')
      .filter((node) => node.type === 'comment')
      .forEach((node) => {
        this.removeObj(node, node.parent.nodes);
      });
  }

  /**
   * Prunes unused rules, unused keyframes and comments, then returns the
   * (mutated) CSS tree.
   *
   * @param {string} [selector] Optional fragment selector, see getEmptyCss.
   * @returns {object} The postcss tree created in the constructor.
   */
  getTinyAst(selector) {
    this.getEmptyCss(selector);
    this.getEmptyKeyFrames();
    this.removeComment();
    return this.cssAst;
  }
}

module.exports = TinyCss;
querySelectorList.js
//querySelectorList.js
const Api = require('./Api');

/*
 * Hit rules.
 *
 * The result of analysis() is a CSS-rule matrix, e.g. 3*6:
 *   rows    -> selectors  ['.id', '.class1', '.class2']
 *   columns -> html nodes ['body', 'body div', 'body div div',
 *                          'body div p', 'body div span', 'body div span a']
 *   [
 *     [0,0,0,0,1,0],
 *     [0,0,0,0,1,0],
 *     [0,0,0,0,1,0]
 *   ]
 */
class querySelectorList {
  /**
   * @param {object} htmlAst Root of the HTML syntax tree (nodes expose
   *   `type`, `tag`, `attrsMap`, `parent`).
   * @param {Array} cssList CSS rule nodes; only `.selector` is read here.
   */
  constructor(htmlAst, cssList) {
    // History of selector lookups: selector text -> matching AST nodes.
    // Prototype-free so that selector text can never collide with
    // inherited keys such as "constructor".
    this.selectotCache = Object.create(null);
    // Element nodes (type === 1) of the HTML tree; these are the matrix columns.
    this.htmlAst = htmlAst;
    this.htmlList = Api.depthSearch(this.htmlAst).filter((node) => node.type === 1);
    // node -> column index, so hits can be placed without a linear search.
    this.htmlIndex = new Map();
    this.htmlList.forEach((node, index) => {
      this.htmlIndex.set(node, index);
    });
    // CSS rules; these are the matrix rows.
    this.cssList = cssList;
  }

  /**
   * Builds the hit matrix: one 0/1 row per CSS rule, one column per element.
   *
   * @returns {number[][]}
   */
  analysis() {
    return this.cssList.map((rule) => this.querySelector(rule.selector));
  }

  /**
   * Marks the elements matched by `selector` together with their descendants.
   *
   * NOTE(review): relies on Api.depthSearch returning the node itself
   * followed by its descendants in the same order as `htmlList`, so that a
   * subtree occupies a contiguous run of columns - confirm against ./Api.
   *
   * @param {string} selector
   * @returns {number[]} 0/1 per element.
   */
  querySelectorAndChild(selector) {
    const arr = this.querySelector(selector);
    for (let i = 0; i < arr.length; i++) {
      if (arr[i] !== 1) {
        continue;
      }
      // Size of the subtree rooted at the matched element itself
      // (the column index, not the 0/1 flag, selects the node).
      const subtreeSize = Api.depthSearch(this.htmlList[i]).filter((node) => node.type === 1).length;
      for (let k = 1; k < subtreeSize && i + 1 < arr.length; k++) {
        i++;
        arr[i] = 1;
      }
    }
    return arr;
  }

  /**
   * Resolves a selector that may be a comma-separated list; an element is
   * hit when ANY of the listed selectors hits it.
   *
   * Commas are split naively, so lists inside functional pseudo-classes are
   * not understood.
   *
   * @param {string} selector
   * @returns {number[]} 0/1 per element.
   */
  querySelector(selector) {
    if (!/,/.test(selector)) {
      return this.queryOneSelector(selector);
    }
    const merged = new Array(this.htmlList.length).fill(0);
    selector.split(',').forEach((part) => {
      this.queryOneSelector(part).forEach((bit, k) => {
        // Only ever switch bits on: a miss must not erase an earlier hit.
        if (bit === 1) {
          merged[k] = 1;
        }
      });
    });
    return merged;
  }

  /**
   * Resolves a single (comma-free) selector.
   *
   * @param {string} selector
   * @returns {number[]} 0/1 per element.
   */
  queryOneSelector(selector) {
    // Trim and collapse whitespace (including newlines) so the pieces found
    // below always concatenate back to the cache key.
    const normalized = selector.trim().replace(/\s+/g, ' ');

    // Split into [compound, combinator, compound, combinator, ...].
    // A run of " >~+" is a combinator unless a digit or a pseudo ":" follows.
    const selectorArr = [];
    normalized.replace(/(.+?)([ >~+]+(?!\d)(?! *:)|$)/ig, (m, p1, p2) => {
      selectorArr.push(p1, p2);
    });
    this.selectorArr = selectorArr;

    // Resolve left to right, caching every prefix.
    let preSelector = '';
    for (let i = 0; i < selectorArr.length; i += 2) {
      const exec = selectorArr[i - 1] || '';
      const curSelector = selectorArr[i];
      this.setSelectotCache(preSelector, exec, curSelector);
      preSelector = preSelector + exec + curSelector;
    }

    const arr = new Array(this.htmlList.length).fill(0);
    // An empty selector never reaches the cache; treat it as "no hits".
    const hits = this.selectotCache[normalized] || [];
    hits.forEach((node) => {
      const index = this.htmlIndex.get(node);
      if (index !== undefined) {
        arr[index] = 1;
      }
    });
    return arr;
  }

  /**
   * Resolves `preSelector + exec + curSelector` from the cached result of
   * `preSelector` and stores it in the cache.
   *
   * @param {string} preSelector Already-resolved prefix ('' for the first part).
   * @param {string} exec Combinator text between prefix and current part.
   * @param {string} curSelector Compound selector to apply next.
   */
  setSelectotCache(preSelector, exec, curSelector) {
    const nextSelector = preSelector + exec + curSelector;
    // Already cached.
    if (this.selectotCache[nextSelector]) {
      return;
    }
    if (!preSelector && !exec) {
      this.selectotCache[curSelector] = this.breadthHit(curSelector, this.htmlAst);
      return;
    }

    let search = null;
    if (/^ +$/.test(exec)) {
      search = (node) => this.breadthHit(curSelector, node);
    } else if (/^ *> *$/.test(exec)) {
      search = (node) => this.childHit(curSelector, node);
    } else if (/^ *\+ *$/.test(exec)) {
      search = (node) => this.sublingHit(curSelector, node);
    } else if (/^ *~ *$/.test(exec)) {
      search = (node) => this.sublingsHit(curSelector, node);
    }

    const found = [];
    if (search) {
      (this.selectotCache[preSelector] || []).forEach((node) => {
        search(node).forEach((hit) => found.push(hit));
      });
    } else {
      console.log('exec异常:' + exec);
    }
    this.selectotCache[nextSelector] = found;
  }

  /**
   * css_rule: element+element. A node without a parent has no siblings.
   *
   * @returns {Array} Matching element nodes.
   */
  sublingHit(tag, astNode) {
    if (!astNode.parent) {
      return [];
    }
    return Api.nextSublingSearch(astNode, astNode.parent).filter(
      (node) => node.type === 1 && this.hitNode(tag, node)
    );
  }

  /**
   * css_rule: element~element. A node without a parent has no siblings.
   *
   * @returns {Array} Matching element nodes.
   */
  sublingsHit(tag, astNode) {
    if (!astNode.parent) {
      return [];
    }
    // Arrow callback: `this` must stay bound to the instance.
    return Api.nextSublingsSearch(astNode, astNode.parent).filter(
      (node) => node.type === 1 && this.hitNode(tag, node)
    );
  }

  /**
   * css_rule: element element.
   *
   * @returns {Array} Matching element nodes.
   */
  breadthHit(tag, astNode) {
    return Api.breadthSearch(astNode).filter(
      (node) => node.type === 1 && this.hitNode(tag, node)
    );
  }

  /**
   * css_rule: element>element.
   *
   * @returns {Array} Matching element nodes.
   */
  childHit(tag, astNode) {
    return Api.childSearch(astNode).filter(
      (node) => node.type === 1 && this.hitNode(tag, node)
    );
  }

  /**
   * Tests whether one compound selector hits one AST node.
   *
   * Checks the tag, the first class, the id and the mere presence of the
   * first attribute. Pseudo-classes/elements always count as a hit, and a
   * selector with no recognisable part is logged and counted as a hit, so
   * the check errs on the side of keeping rules.
   *
   * @param {string} selector Compound selector (no combinators).
   * @param {object} astNode HTML AST node.
   * @returns {boolean}
   */
  hitNode(selector, astNode) {
    if (selector === '*') {
      return true;
    }
    if (/:root/.test(selector)) {
      return astNode.tag === 'html';
    }

    const attrs = astNode.attrsMap || {};
    const checks = [];

    // tag - digits and hyphens allowed after the first letter (h1, my-el).
    const tagMatch = /^([a-z][\w-]*)/i.exec(selector);
    if (tagMatch) {
      checks.push(String(astNode.tag).toLowerCase() === tagMatch[1].toLowerCase());
    }
    // class
    const classMatch = /\.([\w-]+)/.exec(selector);
    if (classMatch) {
      checks.push(Boolean(attrs.class) && attrs.class.split(/\s+/).indexOf(classMatch[1]) > -1);
    }
    // id
    const idMatch = /#([\w-]+)/.exec(selector);
    if (idMatch) {
      checks.push(attrs.id === idMatch[1]);
    }
    // attribute - only the presence of the attribute is checked,
    // whatever operator or value follows the name.
    const attrMatch = /\[\s*([\w-]+)[^\]]*\]/.exec(selector);
    if (attrMatch) {
      checks.push(Object.prototype.hasOwnProperty.call(attrs, attrMatch[1]));
    }
    // pseudo selectors are not evaluated
    if (/(:.+)/.test(selector)) {
      checks.push(true);
    }

    if (checks.length === 0) {
      console.log(selector, this.selectorArr, 'css 解析异常');
    }
    return checks.every((item) => item);
  }
}

module.exports = querySelectorList;