zoukankan      html  css  js  c++  java
  • 第一阶段冲刺4

    今天终于实现了爬虫,成功爬取到了一部分信息。

    代码:

    # -*- coding:utf-8 -*-

    import requests

    from bs4 import BeautifulSoup

    import bs4

    from selenium import webdriver

    from time import sleep

    import time

    from PIL import Image

    class Login(object):
        """Drive a headless Chrome session against the tiedao.vatuu.com login page.

        Creating an instance starts the browser, opens the login page and
        stores the value of the first cookie reported by the driver in
        ``self.cookie``.  ``self.headers`` and ``self.url`` are prepared for
        the score query.
        """

        # Directory that receives the full-page screenshot and the cropped image.
        SCREENSHOT_DIR = r"C:\Users\kang\Desktop\pachong"

        def __init__(self):
            """Start Chrome, load the login page and capture the first cookie.

            Raises:
                RuntimeError: if the driver reports no cookies after the page
                    load; the browser is shut down before raising.
            """
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')
            options.add_argument('--ignore-certificate-errors')
            self.driver = webdriver.Chrome(options=options)
            self.driver.maximize_window()
            # Browser window: 1920 px wide, 1080 px high.
            self.driver.set_window_size(1920, 1080)
            self.driver.get('http://tiedao.vatuu.com/service/login.html?returnUrl=return')

            cookie = self.driver.get_cookies()
            print(type(cookie))
            if not cookie:
                # Do not leave a headless Chrome process behind on failure.
                self.driver.quit()
                raise RuntimeError('login page did not set any cookie')
            first_cookie = cookie[0]
            print(first_cookie['value'])
            self.cookie = first_cookie['value']
            print(type(self.cookie))

            self.headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                'Connection': 'keep-alive',
                # A Cookie header needs "name=value"; the bare value alone is not valid.
                'Cookie': first_cookie['name'] + '=' + self.cookie,
                'Host': 'tiedao.vatuu.com',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko)Chrome/72.0.3626.121 Safari/537.36',
            }
            print(self.headers)
            self.url = ('http://tiedao.vatuu.com/vatuu/StudentScoreInfoAction'
                        '?setAction=studentScoreQuery&viewType=studentScore'
                        '&orderType=submitDate&orderValue=desc')

        def time_format(self):
            """Return the current local time formatted as ``YYYYmmddHHMMSS``."""
            return time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

        def cut(self):
            """Screenshot the page and save a crop around the ``randomPhoto`` element.

            The full screenshot is written to ``full.png`` inside
            ``SCREENSHOT_DIR``; the cropped image is written next to it under a
            timestamp-based file name.
            """
            full_path = self.SCREENSHOT_DIR + "\\full.png"
            # Capture the whole page first.
            self.driver.save_screenshot(full_path)

            # Element to cut out of the screenshot.  The "id" locator string is
            # used because find_element_by_id no longer exists in Selenium 4.
            element = self.driver.find_element("id", "randomPhoto")
            print(element.location)
            print(element.size)

            # Top-left corner of the element.
            left = element.location['x']
            top = element.location['y']
            # Bottom-right corner, derived from the element's width and height.
            right = left + element.size['width']
            bottom = top + element.size['height']

            target = self.SCREENSHOT_DIR + "\\" + self.time_format() + ".png"
            with Image.open(full_path) as picture:
                # Image.crop takes one 4-tuple: (left, upper, right, lower).
                # NOTE(review): the right edge is pulled in by 30 px exactly as
                # the original code did; the reason is not visible here.
                cropped = picture.crop((left, top, right - 30, bottom))
                cropped.save(target)

        def login(self):
            """Save the cropped image, prompt for credentials and submit the form.

            The user name, password and verification code are read from
            standard input and typed into the page's form fields.
            """
            self.cut()
            user_box = self.driver.find_element("id", 'username')
            password_box = self.driver.find_element("id", 'password')
            captcha_box = self.driver.find_element("id", 'ranstring')
            submit_button = self.driver.find_element("id", 'submit2')

            name = input("请输入:username")
            password = input("请输入:password")
            ran = input("请输入:验证码")

            user_box.send_keys(name)
            password_box.send_keys(password)
            captcha_box.send_keys(ran)
            submit_button.click()

                 

                 

    def get_soup(url,cookies=''):
        """Fetch *url* through ``get_html`` and parse the text with the lxml parser.

        Args:
            url: Address of the page to download.
            cookies: Cookies forwarded to ``get_html``; any falsy value is
                forwarded as an empty string.

        Returns:
            A ``BeautifulSoup`` object built from whatever ``get_html`` returned.
        """
        forwarded = cookies if cookies else ''
        return BeautifulSoup(get_html(url, forwarded), 'lxml')

                 

    def html_utf(content):
        """Decode the bytes in *content* as UTF-8 and return the text."""
        return str(content, encoding='utf-8')

    def get_html(url,cookies):
        """Download *url* and return its body decoded as UTF-8.

        Args:
            url: Address to request with HTTP GET (5 second timeout).
            cookies: Cookies passed straight to ``requests.get``.

        Returns:
            The decoded page text, or the literal string ``"ERROR"`` when the
            request fails, the server answers with an error status, or the
            body is not valid UTF-8.
        """
        try:
            r = requests.get(url, timeout=5, cookies=cookies)
            r.raise_for_status()
            return html_utf(r.content)
        except (requests.RequestException, UnicodeDecodeError):
            # Narrower than a bare ``except`` so Ctrl-C and programming errors
            # are no longer silently turned into "ERROR".
            return "ERROR"

    def score(url,cookie):
        """Fetch the score page and print the rows of the table with id ``table3``.

        Header rows (those containing ``<th>`` cells) are printed as they are
        met.  The non-empty ``<td>`` texts of every other row are printed and
        also collected, then printed a second time with ``||`` separators.

        Args:
            url: Address of the score query page.
            cookie: Value sent as the ``JSESSIONID`` cookie.
        """
        soup = get_soup(url, {'JSESSIONID': cookie})
        table = soup.find('table', attrs={'id': 'table3'})
        print(table)
        if table is None:
            # Without this guard the next line fails with AttributeError.
            print('未找到成绩表格(id=table3),可能是请求失败或尚未登录')
            return

        contents = []
        for tr in table.find_all('tr'):
            ths = tr.find_all('th')
            if ths:
                for th in ths:
                    # ``.string`` is None when the cell holds nested tags.
                    print((th.string or '').strip(), ' | ', end='')
                print('')
                continue

            # Cells without a single text child are skipped.
            content = [td.string.strip()
                       for td in tr.find_all('td')
                       if td.string is not None]
            for cell_text in content:
                print(cell_text, ' | ', end='')
            contents.append(content)
            print('')

        for con in contents:
            for c in con:
                print(c, '||', end='')
            print('')

    def course(url,cookie):
        """Fetch the timetable page and print one line per weekday and period.

        Every ``<td>`` of the table with class ``table_border`` is flattened
        to its text.  The flat list is then read at index ``j*8+i`` for
        weekday ``i`` (1-7) and period ``j`` (1-12).
        NOTE(review): this indexing presumes 8 cells per table row, with row 0
        and column 0 holding labels — confirm against the real page.

        Args:
            url: Address of the timetable page.
            cookie: Value sent as the ``JSESSIONID`` cookie.

        Returns:
            The list of formatted entries, ordered by weekday and then period.
            Empty when the table is missing.
        """
        soup = get_soup(url, {'JSESSIONID': cookie})
        table = soup.find('table', attrs={'class': 'table_border'})
        if table is None:
            # Without this guard the next line fails with AttributeError.
            print('未找到课程表(class=table_border),可能是请求失败或尚未登录')
            return []

        contents = []
        for tr in table.find_all('tr'):
            for td in tr.find_all('td'):
                # Join every text node inside the cell into one string.
                contents.append(''.join(td.find_all(text=True)))
                print('+++++++++++++++++++++++++++++')

        # classList: entries ordered by weekday, then by period.
        classList = []
        for i in range(1, 8):
            for j in range(1, 13):
                index = j * 8 + i
                if index >= len(contents):
                    # The table has fewer cells than the 8-column layout
                    # needs; skip the slot instead of raising IndexError.
                    continue
                content = '星期' + str(i) + ':第' + str(j) + '节:' + contents[index]
                print(content)
                classList.append(content)
        return classList

          

                        

    def room(url,cookie):
        """Prompt for a week, weekday and time slot, then run the classroom query.

        The answers are turned into the form fields of a ``classroomQuery``
        POST.  The rows of the ``table_gray`` table in the response are
        printed.

        Args:
            url: Address the query form is posted to.
            cookie: Session id sent as ``JSESSIONID`` in the Cookie header.

        Returns:
            A flat list with the stripped text of every ``<td>`` cell in the
            result table.  Empty when the table is missing.

        Raises:
            ValueError: if the week or weekday typed by the user is not an
                integer.
        """
        # Build the form first.
        week = input('请输入周数:')
        day_num = input('请输入星期几:')
        class_num = input('请输入:\n'
                          '1:第一二节\n'
                          '2:第三四节\n'
                          '3:整个上午\n'
                          '4:第六七节\n'
                          '5:第八九节\n'
                          '6:整个下午\n'
                          '7:晚自习')
        # The week is sent as 2**(week-1).
        week = pow(2, int(week) - 1)

        # Menu choice -> 13-character 0/1 string sent as ``day_time_text``.
        # NOTE(review): looks like one flag per period, lowest period on the
        # right — confirm against the site.
        slot_codes = {
            '1': '0000000000011',
            '2': '0000000001100',
            '3': '0000000001111',
            '4': '0000001100000',
            '5': '0000110000000',
            '6': '0000111100000',
            '7': '0110000000000',
        }
        # Unknown choices keep the original behaviour: an empty string is sent.
        day_time = slot_codes.get(class_num, '')

        param = {
            'setAction': 'classroomQuery',
            'PageAction': 'Query',
            'day_time_text': day_time,
            'school_area_code': '1',
            'building': '',
            'week_no': str(week),
            'day_no': str(int(day_num)),
            'B1': '查询',
        }
        cookie_copy = 'JSESSIONID=' + cookie
        print(param)

        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://tiedao.vatuu.com',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,'
                      'image/apng,*/*;q=0.8',
            'Referer': 'http://tiedao.vatuu.com/vatuu/CourseAction',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cookie': cookie_copy,
        }
        # Same 5 second timeout as get_html, so a dead server cannot hang the script.
        r = requests.post(url, data=param, headers=headers, timeout=5)
        soup = BeautifulSoup(html_utf(r.content), 'lxml')
        table = soup.find('table', attrs={'class': 'table_gray'})
        if table is None:
            # Without this guard the next line fails with AttributeError.
            print('未找到查询结果表格(class=table_gray),可能是请求失败或尚未登录')
            return []

        contents = []
        for tr in table.find_all('tr'):
            ths = tr.find_all('th')
            if ths:
                for th in ths:
                    # ``.string`` is None when the cell holds nested tags.
                    print((th.string or '').strip(), ' | ', end='')
                print('')
            else:
                for td in tr.find_all('td'):
                    # Join every text node inside the cell, then trim it.
                    strs = ''.join(td.find_all(text=True)).strip()
                    print(strs, ' || ', end='')
                    contents.append(strs)
            print('+++++++++++++++++++++++++++++')
        return contents

          

                        

                               

          

    if __name__ == '__main__':
        # Script entry: open the login page, print the timetable, then run
        # the classroom query repeatedly.
        login = Login()
        try:
            # login.login()
            time.sleep(2)
            course_url = ('http://tiedao.vatuu.com/vatuu/CourseAction'
                          '?setAction=userCourseScheduleTable'
                          '&viewType=studentQueryCourseList'
                          '&selectTableType=ThisTerm&queryType=student')
            room_url = 'http://tiedao.vatuu.com/vatuu/CourseAction'
            # score(login.url,login.cookie)
            print(login.cookie)
            # NOTE(review): hard-coded session id, used instead of login.cookie
            # while login.login() is disabled.  It stops working once the
            # session expires and should not be committed to source control.
            cookie = '8CD73FB791382490DA6F32187893B80E'
            course(course_url, cookie)
            # Each call prompts on stdin; 99 queries per run, as in the original.
            for _ in range(1, 100):
                room(room_url, cookie)
        finally:
            # Always shut the headless browser down, even on errors or Ctrl-C.
            login.driver.quit()

    下一步准备遍历这些数据,展示空教室信息。

  • 相关阅读:
    Oracle 字符集的查看和修改
    Hibernate查询方法与缓存的关系
    Oracle Sql语句整理
    Android动画效果
    Acrobat9键盘快捷键
    Head区的设置
    JAR,WAR,EAR区别
    ASCII码表完整版
    HTML的meta标签详解
    .NET中TextBox控件设置ReadOnly=true后台取不到值三种解决方法
  • 原文地址:https://www.cnblogs.com/shumouren/p/13090460.html
Copyright © 2011-2022 走看看