  • Assignment 5

    Task ①

    Scraping phone listings

    Requirements:
    Become proficient with Selenium: locating HTML elements, scraping Ajax-rendered pages, and waiting for HTML elements (see the explicit-wait sketch below).
    Use the Selenium framework to scrape product information and images for one category of goods on JD (京东商城).
    Candidate site: http://www.jd.com/
    Keyword: the student's own choice
    Output: the MySQL output is as follows:
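
    The assignment stresses waiting for HTML elements, while the spider below paces itself with fixed time.sleep calls. A minimal explicit-wait sketch using Selenium's standard WebDriverWait (assumes an already-open driver; J_goodsList is the product-list container id that the spider's XPath also targets):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Wait up to 10 seconds for the Ajax-rendered product list to appear,
    # instead of sleeping for a fixed interval.
    wait = WebDriverWait(driver, 10)
    goods_list = wait.until(EC.presence_of_element_located((By.ID, "J_goodsList")))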

    Code:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import urllib.request
    import threading
    import sqlite3
    import os
    import datetime
    from selenium.webdriver.common.keys import Keys
    import time
    
    class MySpider:
        headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        imagePath = "download"
    
        def startUp(self, url, key):
            # Initializing the Chrome browser
            chrome_options = Options()
            # chrome_options.add_argument('--headless')
            # chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(options=chrome_options)
    
            # Initializing variables
            self.threads = []
            self.No = 0
            self.imgNo = 0
            # Initializing database
            try:
                self.con = sqlite3.connect("phones.db")
                self.cursor = self.con.cursor()
                try:
                    # drop the table if it already exists
                    self.cursor.execute("drop table phones")
                except:
                    pass
                try:
                    # create a fresh table
                    sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), mNote varchar(1024), mFile varchar(256))"
                    self.cursor.execute(sql)
                except:
                    pass
    
            except Exception as err:
                print(err)
            # Initializing the images folder
            try:
                if not os.path.exists(MySpider.imagePath):
                    os.mkdir(MySpider.imagePath)
                images = os.listdir(MySpider.imagePath)
                for img in images:
                    s = os.path.join(MySpider.imagePath, img)  # remove files left over from earlier runs
                    os.remove(s)
            except Exception as err:
                print(err)
            self.driver.get(url)
            keyInput = self.driver.find_element_by_id("key")
            keyInput.send_keys(key)    # type the search keyword
            keyInput.send_keys(Keys.ENTER)   # press Enter to start the search
    
        def closeUp(self):       # close the database connection and the browser driver
            try:
                self.con.commit()
                self.con.close()
                self.driver.close()
            except Exception as err:
                print(err)
    
        def insertDB(self, mNo, mMark, mPrice, mNote, mFile):  # insert one record
            try:
                sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
                self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
            except Exception as err:
                print(err) 
    
        def showDB(self):    # print the stored records
            try:
                con = sqlite3.connect("phones.db")
                cursor = con.cursor()
                print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
                cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")
                rows = cursor.fetchall()
                for row in rows:
                    print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))
                con.close()
            except Exception as err:
                print(err)
    
        def download(self, src1, src2, mFile):
            data = None
            if src1:
                try:
                    req = urllib.request.Request(src1, headers=MySpider.headers)
                    resp = urllib.request.urlopen(req, timeout=10)
                    data = resp.read()
                except:
                    pass
            if not data and src2:
                try:
                    req = urllib.request.Request(src2, headers=MySpider.headers)
                    resp = urllib.request.urlopen(req, timeout=10)
                    data = resp.read()
                except:
                    pass
            if data:
                print("download begin", mFile)
                fobj = open(os.path.join(MySpider.imagePath, mFile), "wb")
                fobj.write(data)
                fobj.close()
                print("download finish", mFile)  
    
        def processSpider(self):
            try:
                time.sleep(1)
                print(self.driver.current_url)
                lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
                for li in lis:
                    # We find that the image is either in src or in data-lazy-img attribute
                    try:
                        src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                    except:
                        src1 = ""
    
                    try:
                        src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                    except:
                        src2 = ""
                    try:
                        price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                    except:
                        price = "0"
    
                    try:
                        note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                        mark = note.split(" ")[0]
                        mark = mark.replace("爱心东东
    ", "")
                        mark = mark.replace(",", "")
                        note = note.replace("爱心东东
    ", "")
                        note = note.replace(",", "")
    
                    except:
                        note = ""
                        mark = ""
                    self.No = self.No + 1
                    no = str(self.No)
                    while len(no) < 6:
                        no = "0" + no
                    print(no, mark, price)
                    if src1:
                        src1 = urllib.request.urljoin(self.driver.current_url, src1)
                        p = src1.rfind(".")
                        mFile = no + src1[p:]
                    elif src2:
                        src2 = urllib.request.urljoin(self.driver.current_url, src2)
                        p = src2.rfind(".")
                        mFile = no + src2[p:]
                    if src1 or src2:
                        T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                        T.setDaemon(False)
                        T.start()
                        self.threads.append(T)
                    else:
                        mFile = ""
                    self.insertDB(no, mark, price, note, mFile)
                try:
                    self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
                except:
                    nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                    time.sleep(10)
                    nextPage.click()
                    self.processSpider()
            except Exception as err:
                print(err)       
    
        def executeSpider(self, url, key):
            starttime = datetime.datetime.now()
            print("Spider starting......")
            self.startUp(url, key)
            print("Spider processing......")
            self.processSpider()
            print("Spider closing......")
            self.closeUp()
            for t in self.threads:
                t.join()
            print("Spider completed......")
            endtime = datetime.datetime.now()
            elapsed = (endtime - starttime).seconds
            print("Total ", elapsed, " seconds elapsed")
    
    url = "http://www.jd.com"
    spider = MySpider()
    while True:
        print("1.爬取")
        print("2.显示")
        print("3.退出")
        s = input("请选择(1,2,3):")
        if s == "1":
            spider.executeSpider(url, "手机")
            continue
        elif s == "2":
            spider.showDB()
            continue
        elif s == "3":
            break
    

    Partial view of the run results:

    Reflections

    This was mainly a reproduction of the code from class, which deepened my understanding.

    Experiment ②

    Scraping stock information

    Requirements:
    Become proficient with Selenium: locating HTML elements, scraping Ajax-rendered pages, and waiting for HTML elements.
    Use the Selenium framework with MySQL storage to scrape stock data from the three boards "沪深A股", "上证A股", and "深证A股".
    Candidate site: 东方财富网 (Eastmoney): http://quote.eastmoney.com/center/gridlist.html#hs_a_board

    Output: stored and printed via MySQL; the table headers should be English names designed by the student, e.g. id for the serial number, bStockNo for the stock code, and so on (a possible table definition is sketched below):
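
    The schema of the stocks table isn't shown in the post. A minimal sketch of a possible definition, matching the eight values inserted below; the b-prefixed column names follow the assignment's example and are my own choice:

    import pymysql

    # Hypothetical schema for the stocks table used by the spider below.
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                           password='root', db='spider', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("""
        create table if not exists stocks (
            id varchar(8),
            bStockNo varchar(16),
            bStockName varchar(64),
            bLatestPrice varchar(16),
            bChangeRate varchar(16),
            bChangeAmount varchar(16),
            bTrading varchar(32),
            bTransaction varchar(32)
        ) default charset=utf8
    """)
    cursor.close()
    conn.close()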

    Code:

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from time import sleep
    import pymysql
    
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=option)
    
    driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
    driver.maximize_window()
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider', charset='utf8')
    cursor = conn.cursor()
    modules = ["hs_a_board", "sh_a_board", "sz_a_board"]   # the three boards to iterate over
    for module in modules:
        sleep(1)
        driver.find_element_by_xpath('//*[@id="nav_{}"]'.format(module)).click()   #板块点击
        sleep(2)
        driver.execute_script("var q=document.documentElement.scrollTop=10000")   #下拉底部
        while True:
            trs = driver.find_elements_by_xpath('//*[@id="table_wrapper-table"]/tbody/tr')
            for tr in trs:
                sid = tr.find_element_by_xpath("./td[1]").text            # serial number (avoids shadowing the id builtin)
                no = tr.find_element_by_xpath("./td[2]/a").text           # stock code
                name = tr.find_element_by_xpath("./td[3]/a").text
                latest_price = tr.find_element_by_xpath("./td[5]/span").text
                change_rate = tr.find_element_by_xpath("./td[6]/span").text   # avoids shadowing the range builtin
                amount = tr.find_element_by_xpath("./td[7]/span").text
                trading = tr.find_element_by_xpath("./td[8]").text
                transaction = tr.find_element_by_xpath("./td[9]").text
                print(sid, no, name, latest_price, change_rate, amount, trading, transaction)
                try:
                    # parameterized insert; string interpolation would break on quotes in the data
                    cursor.execute('insert into stocks values(%s,%s,%s,%s,%s,%s,%s,%s)',
                                   (sid, no, name, latest_price, change_rate, amount, trading, transaction))
                    conn.commit()
                except:
                    conn.rollback()
            try:
                # find_element_by_class_name cannot take a compound class name; on the
                # last page the "next" button carries the disabled class, so stop paging
                driver.find_element_by_xpath('//a[@class="next paginate_button disabled"]')
                break
            except:
                driver.find_element_by_xpath('//*[@id="main-table_paginate"]/a[@class="next paginate_button"]').click()   # next page
                sleep(2)
        driver.execute_script("var q=document.documentElement.scrollTop=0")    #返回顶部
    driver.quit()
    cursor.close()
    conn.close()
    

    Partial view of the run results:

    Reflections:

    This deepened my understanding of Selenium.

    Experiment ③

    Simulating login to the MOOC site and scraping course information

    Requirements:

    Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-rendered pages, and waiting for HTML elements.
    Use Selenium plus MySQL to scrape course information from the Chinese MOOC site (course id, course name, school, lead teacher, team members, enrollment count, schedule, and course description).
    Candidate site: 中国mooc网: https://www.icourse163.org

    Output: MySQL storage and output format (a possible table definition is sketched below):
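
    Likewise, the schema of the mooc table isn't shown. A minimal sketch of a possible definition, assuming the columns mirror the insert order used below; the c-prefixed names are my own choice:

    import pymysql

    # Hypothetical schema for the mooc table used by the spider below.
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                           password='root', db='spider', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("""
        create table if not exists mooc (
            id int,
            cCourse varchar(128),
            cCollege varchar(64),
            cTeacher varchar(64),
            cTeam varchar(256),
            cCount varchar(64),
            cProcess varchar(128),
            cBrief text
        ) default charset=utf8
    """)
    cursor.close()
    conn.close()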

    Code:

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    from time import sleep
    import pymysql
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=option)
    
    driver.get("https://www.icourse163.org/")
    driver.maximize_window()
    sleep(2)
    
    driver.find_element_by_xpath('//div[@class="unlogin"]//a[@class="f-f0 navLoginBtn"]').click()   #登录或注册
    sleep(2)
    driver.find_element_by_class_name('ux-login-set-scan-code_ft_back').click()              #其他登录方式
    sleep(2)
    driver.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']//li[@class='']").click()
    sleep(2)
    driver.switch_to.frame(driver.find_element_by_xpath("//div[@class='ux-login-set-container']//iframe"))
    driver.find_element_by_xpath('//input[@id="phoneipt"]').send_keys("******")        #输入账号
    sleep(2)
    driver.find_element_by_xpath('//input[@placeholder="请输入密码"]').send_keys("******")   #输入密码
    sleep(2)
    driver.find_element_by_xpath('//div[@class="f-cb loginbox"]//a[@id="submitBtn"]').click()  #点击登录
    sleep(3)
    driver.find_element_by_xpath(
        '//div[@class="u-baseinputui"]/input[@class="j-textarea inputtxt"]').send_keys("python")    #输入要找的课程
    sleep(2)
    driver.find_element_by_xpath('//div[@class="u-search-icon"]/span[@class="u-icon-search2 j-searchBtn"]').click()   #点击搜索
    sleep(2)
    conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='root', db='spider', charset='utf8')
    cursor = conn.cursor()
    cid = 0   # running record id (avoids shadowing the id builtin)
    while True:
        sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')
        for i in range(len(divs)):
            try:
                # re-locate the course cards on every iteration: the page may have
                # re-rendered, and a stale reference cannot be clicked
                div = driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')[i]
                div.click()
                sleep(3)
                current_window = driver.window_handles[-1]   # switch to the newly opened tab
                driver.switch_to.window(current_window)
                sleep(2)
                cid += 1
                course = driver.find_element_by_xpath('//span[@class="course-title f-ib f-vam"]').text
                process = driver.find_element_by_xpath(
                    '//div[@class="course-enroll-info_course-info_term-info_term-time"]/span[2]').text
                college = driver.find_element_by_xpath('//*[@id="j-teacher"]/div/a/img').get_attribute("alt")
                count = driver.find_element_by_xpath(
                    '//span[@class="course-enroll-info_course-enroll_price-enroll_enroll-count"]').text
                brief = driver.find_element_by_xpath('//*[@id="j-rectxt2"]').text
                teacher = driver.find_element_by_xpath('//div[@class="cnt f-fl"][1]/h3').text
                team = ""
                teas = driver.find_elements_by_xpath('//div[@class="um-list-slider_con"]/div')
                if len(teas) > 1:
                    for tea in teas:
                        team = team + tea.find_element_by_xpath('.//div[@class="cnt f-fl"]/h3').text   # concatenate team members
                else:
                    team = teacher
                print(course, college, teacher, team, process, brief)
                try:
                    # parameterized insert instead of string interpolation
                    cursor.execute('insert into mooc values(%s,%s,%s,%s,%s,%s,%s,%s)',
                                   (cid, course, college, teacher, team, count, process, brief))
                    conn.commit()
                except:
                    conn.rollback()
                driver.close()   # close the course tab
                sleep(2)
                previous_window = driver.window_handles[0]   # switch back to the results tab
                sleep(2)
                driver.switch_to.window(previous_window)
                sleep(2)
            except Exception as e:
                print(e)
        try:
            # page once per results page; on the last page the "next" button
            # turns into th-bk-disable-gh, so stop
            driver.find_element_by_xpath(
                '//li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-disable-gh"]')
            break
        except:
            driver.find_element_by_xpath(
                '//li[@class="ux-pager_btn ux-pager_btn__next"]/a[@class="th-bk-main-gh"]').click()
            sleep(3)
    driver.quit()
    cursor.close()
    conn.close()
    

    Partial view of the run results:

    Reflections:

    I ran into this error: Message: stale element reference: element is not attached to the page document.
    It came from code of this shape:

    divs = driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')
    for div in divs:
        div.click()
        ...

    After the first div is clicked the page refreshes, so clicking the second one raises the error above.
    It can be changed to something like:

    divs = driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div')
    for i in range(len(divs)):
        driver.find_element_by_xpath('//a[@class="class name"][{}]'.format(i + 1)).click()

    That is, elements must be re-located after the page refreshes before they can be operated on.
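
    A short sketch of that re-location pattern using the spider's actual list XPath (assumes the driver is already on the search-results page):

    course_xpath = '//div[@class="m-course-list"]/div/div'
    total = len(driver.find_elements_by_xpath(course_xpath))
    for i in range(total):
        # re-query the cards on every iteration so we never click a
        # reference captured before the page re-rendered
        cards = driver.find_elements_by_xpath(course_xpath)
        if i >= len(cards):
            break
        cards[i].click()
        # ... scrape the detail page, then switch back before the next click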
