博主的个人网站(http://www.johnnyzen.cn/)每学期都需要更新所展示的课程静态信息。由于课程数量多,手动抓取又十分繁琐,一直很想把这件事自动化。这不,今天终于有点时间了,便把之前用 Node.js 写爬虫的思路迁移到了前端 JavaScript 上,同时改进了抓取数据的算法,比起之前的版本,自然是更加灵活高效了。
声明:如读者需引用本文,请在文章显著处注明出处,或者与博主取得联系,以示对劳动成果的尊重,非常感谢 0.0
/**
 * Scrapes course entries out of the timetable cells of the current page.
 *
 * Each matching cell's `innerText` is split into lines. Consecutive lines
 * that are not separators form one "raw course unit"; a unit with exactly
 * 4 or 6 lines is parsed as [name, schedule, teacher, location, ...].
 *
 * @param {string} seletorForTds CSS selector matching the timetable cells
 *     (for example "#Table1 td").
 */
var Course = function (seletorForTds) {
    // Cells whose raw text is not longer than this are treated as headers or
    // empty slots and skipped.
    var MIN_CELL_TEXT_LENGTH = 20;

    // Chinese numeral -> number, used for the weekday character after "周".
    var CHINESE_DIGITS = {
        "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9
    };

    // Regex literals (not `new RegExp("...\d...")`): inside a string literal
    // "\d" collapses to "d", which silently breaks the pattern.
    // None of them carries the `g` flag, so `exec` keeps no state between calls.
    var WEEKDAY_PATTERN = /周([一二三四五六七])/;      // e.g. "周一"
    var PERIODS_PATTERN = /第(\d+(?:,\d+)*)节/;        // e.g. "第1,2节"
    var WEEKS_PATTERN = /第(\d+)(?:-(\d+))?周/;        // e.g. "第1-16周" or "第3周"
    var CHANGE_MARKER_PATTERN = /[((][换调]/;         // e.g. "(调0123)", "(换0123)"

    var toInt = function (text) {
        return parseInt(text, 10);
    };

    /**
     * Converts a single Chinese numeral character to a number.
     * @param {string} chiNum One character such as "三".
     * @returns {number|undefined} The digit, or undefined when unrecognised.
     */
    var chineseToEnglishNumber = function (chiNum) {
        return Object.prototype.hasOwnProperty.call(CHINESE_DIGITS, chiNum)
            ? CHINESE_DIGITS[chiNum]
            : undefined;
    };

    /**
     * Tells whether a line separates two course units: a blank line or a
     * "(调…" / "(换…" marker line.
     */
    var isStopItem = function (item) {
        return item.trim() === "" || CHANGE_MARKER_PATTERN.test(item);
    };

    /**
     * Splits one cell into raw (still unparsed) course units.
     * @param {{innerText: string}} cell A timetable cell.
     * @returns {string[][]} One array of consecutive non-separator lines per unit.
     */
    function generateCelltoCourseRawUnits(cell) {
        var lines = cell.innerText.split(/\r?\n/);
        var units = [];
        var current = [];
        for (var i = 0; i < lines.length; i++) {
            if (isStopItem(lines[i])) {
                if (current.length > 0) {
                    units.push(current);
                    current = [];
                }
            } else {
                current.push(lines[i]);
            }
        }
        if (current.length > 0) {
            units.push(current);
        }
        return units;
    }

    /**
     * Parses one raw unit into a course object.
     * @param {string[]} unit Raw unit lines.
     * @returns {Object|null} The course, or null when the unit does not have
     *     4 or 6 lines and is therefore not a recognised course layout.
     */
    var parseCourseUnit = function (unit) {
        if (unit.length !== 4 && unit.length !== 6) {
            return null;
        }
        var schedule = unit[1];
        var course = {};
        course.name = unit[0];

        var weekday = WEEKDAY_PATTERN.exec(schedule);
        course.week_index = weekday ? chineseToEnglishNumber(weekday[1]) : undefined;

        // Best effort: a schedule line without period info leaves
        // `course_index` unset instead of aborting the whole scrape.
        var periods = PERIODS_PATTERN.exec(schedule);
        if (periods) {
            course.course_index = periods[1].split(",").map(toInt);
        } else {
            console.log('[generateCourses] no class periods found in:', schedule);
        }

        // Same best-effort policy for the week range; a single week
        // ("第3周") yields week_start only.
        var weeks = WEEKS_PATTERN.exec(schedule);
        if (!weeks) {
            console.log('[generateCourses] no week range found in:', schedule);
        }
        course.week_start = weeks ? toInt(weeks[1]) : undefined;
        course.week_end = weeks && weeks[2] !== undefined ? toInt(weeks[2]) : undefined;

        course.teacher = unit[2];
        course.location = unit[3];
        return course;
    };

    /**
     * Appends every parsable unit to `courses`.
     * @param {string[][]} CourseRawUnits Raw units of one cell.
     * @param {Object[]} courses Accumulator; mutated and returned.
     * @returns {Object[]} The same `courses` array.
     */
    var generateCourses = function (CourseRawUnits, courses) {
        for (var i = 0; i < CourseRawUnits.length; i++) {
            var course = parseCourseUnit(CourseRawUnits[i]);
            if (course !== null) {
                courses.push(course);
            }
        }
        return courses;
    };

    /**
     * Parses all cells that look like course cells (text longer than
     * MIN_CELL_TEXT_LENGTH) into a flat list of courses, in cell order.
     * @param {ArrayLike<{innerText: string}>} $tds Candidate cells.
     * @returns {Object[]} Parsed courses.
     */
    var tdsItemsToCourses = function ($tds) {
        var courses = [];
        for (var j = 0; j < $tds.length; j++) {
            if ($tds[j].innerText.length > MIN_CELL_TEXT_LENGTH) {
                generateCourses(generateCelltoCourseRawUnits($tds[j]), courses);
            }
        }
        return courses;
    };

    /**
     * Queries the document with the configured selector and parses the
     * matched cells.
     * @returns {Object[]} Courses with name, week_index, course_index,
     *     week_start, week_end, teacher and location.
     */
    this.load = function () {
        return tdsItemsToCourses(document.querySelectorAll(seletorForTds));
    };
};

/**
 * Reads the student's profile labels and timetable from the current page.
 *
 * @param {Object} seletorOption CSS selectors: `sno`, `sname`, `college`,
 *     `profession`, `clazz` (label elements) and `courseTable` (timetable cells).
 */
var Student = function (seletorOption) {
    var that = this;

    /**
     * Returns the text of the first element matching `selector` with the
     * leading label removed and surrounding whitespace trimmed.
     * @throws {Error} When no element matches the selector.
     */
    var readField = function (selector, labelPattern) {
        var node = document.querySelectorAll(selector)[0];
        if (!node) {
            throw new Error('Student: no element matches selector "' + selector + '"');
        }
        return node.innerText.replace(labelPattern, "").trim();
    };

    /**
     * @returns {Object} sno, sname, college, profession, clazz and courses.
     */
    that.load = function () {
        // Labels may use either the ASCII or the full-width colon.
        return {
            sno: readField(seletorOption.sno, /学号[::]/gi),
            sname: readField(seletorOption.sname, /姓名[::]/gi),
            college: readField(seletorOption.college, /学院[::]/gi),
            profession: readField(seletorOption.profession, /专业[::]/gi),
            clazz: readField(seletorOption.clazz, /行政班[::]/gi),
            courses: (new Course(seletorOption.courseTable)).load()
        };
    };

    /**
     * @returns {string} JSON text of a fresh `load()` result.
     */
    that.stringify = function () {
        return JSON.stringify(that.load());
    };
};

// demo
var seletorOption = {
    sno: "#Label5",
    sname: "#Label6",
    college: "#Label7",
    profession: "#Label8",
    clazz: "#Label9",
    courseTable: "#Table1 td"
};
var student = (new Student(seletorOption));