  • Data Collection and Fusion: Assignment 6

    Assignment 1

    (1) Requirements:

        * Scrape the Douban Movie Top 250 data with the requests and BeautifulSoup libraries.
        * Download each movie's poster image with multiple threads, naming each image after its movie.
        * Get familiar with regular expressions (a short sketch follows the flow diagram below).
    
    graph TD
        A[The browser requests the page and gets its source] -->|select the li elements| B(via select)
        B --> C{Extract the fields inside each li}
        C -->|normal case| D[Data extracted]
        C -->|special-case check| F[Data extracted]
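
    Since the third requirement is regular-expression practice, here is a minimal sketch of the kind of patterns the spider below relies on, run against a made-up sample of the flattened text in a list item's div.bd p block (the sample string is illustrative, not copied from the page):

    import re

    # illustrative sample of one movie's credits line after whitespace is collapsed
    sample = '导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /... 1994 / 美国 / 犯罪 剧情'

    show_time = re.search(r'\d{4}', sample).group()   # first run of four digits -> '1994'
    country = sample.split('/')[-2].strip()           # '美国'
    movie_type = sample.split('/')[-1].strip()        # '犯罪 剧情'
    print(show_time, country, movie_type)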
    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/11/27
    import requests
    from bs4 import BeautifulSoup, UnicodeDammit   # UnicodeDammit guesses the page encoding below
    import re,os
    import threading
    import pymysql
    import urllib.request
    
    
    
    class MySpider:
    
        def startUp(self,url):
            headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
            }
            self.open = False
            try:
                self.con = pymysql.connect(host='localhost',port=3306,user='root',passwd='031804114.hao',database='movies',charset='utf8')
                self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                self.open = True
                try:
                    self.cursor.execute("drop table if exists movies")
                except Exception as err:
                    # print(err)
                    pass
                try:
                    # `rank` is backquoted because it is a reserved word in MySQL 8.0+
                    sql = """create table movies(
                        `rank` varchar(32),
                        movie_name varchar(32),
                        direct varchar(64),
                        main_act varchar(128),
                        show_time varchar(64),
                        country varchar(128),
                        movie_type varchar(64),
                        score varchar(32),
                        count varchar(32),
                        quote varchar(128),
                        path varchar(64)
                    )character set =utf8;
                    """
                    self.cursor.execute(sql)
                    print("table created")
                except Exception as err:
                    print(err)
                    print("failed to create table")
            except Exception as err:
                print(err)
            self.no = 0
            self.Threads = []
            for i in range(10):   # the Top 250 spans 10 pages of 25 movies each
                url = 'https://movie.douban.com/top250?start=' + str(i*25) + '&filter='
                req = urllib.request.Request(url, headers=headers)
                data = urllib.request.urlopen(req)
                data = data.read()
                dammit = UnicodeDammit(data, ["utf-8", "gbk"])
                data = dammit.unicode_markup
                soup = BeautifulSoup(data, "lxml")
                # print(soup)
                lis = soup.select("ol[class='grid_view'] li")
                for li in lis:
                    rank = li.select("div[class='item'] div em")[0].text
                    movie_name = li.select("div[class='info'] div a span[class='title']")[0].text
                    print(movie_name)
    
                    dir_act = li.select("div[class='info'] div[class='bd'] p")[0].text
                    dir_act = ' '.join(dir_act.split())
                    try:
                        direct = re.search(':.*:',dir_act).group()[1:-3]
                    except:
                        # hard-coded fallback for the one entry whose credits line has only one colon
                        direct = "奥利维·那卡什 Olivier Nakache / 艾力克·托兰达 Eric Toledano "
                    # print(direct)
                    # print(dir_act)
    
                    s = dir_act.split(':')
                    # print(s)
                    try:
                        main_act = re.search(r'(\D)*',s[2]).group()   # leading non-digits = the actors' names
                    except:
                        main_act = "..."
                    pattern = re.compile(r'\d+',re.S)                 # first run of digits = the year
                    show_time = pattern.search(dir_act).group()
                    # print(show_time)
                    countryAndmovie_type = dir_act.split('/')
                    country = countryAndmovie_type[-2]
                    movie_type = countryAndmovie_type[-1]
    
                    score = li.select("div[class='info'] div[class='star'] span")[1].text
                    # print(score)
                    count = re.match(r'\d+',li.select("div[class='info'] div[class='star'] span")[3].text).group()
                    # print(score,count,quote)
                    img_name = li.select("div[class='item'] div a img")[0]["alt"]
                    try:
                        quote = li.select("div[class='info'] p[class='quote'] span")[0].text
                    except:
                        quote = ""
                    # print(img_name)
                    img_src = li.select("div[class='item'] div a img[src]")[0]["src"]
                    path = 'movie_img/' + img_name + '.jpg'
                    # print(img_name,img_src,path)
                    print(rank, movie_name, direct, main_act, show_time, country, movie_type, score, count, quote, path)
                    try:
                        self.insertdb(rank,movie_name,direct,main_act,show_time,country,movie_type,score,count,quote,path)
                        self.no += 1
                    except Exception as err:
                        # print(err)
                        print("insert failed")
                    # download each poster on its own thread, named after the movie
                    T = threading.Thread(target=self.download,args=(img_name,img_src))
                    T.daemon = False   # non-daemon, so the downloads can finish after startUp returns
                    T.start()
                    self.Threads.append(T)
        def download(self,img_name,img_src):
            dir_path = 'movie_img'
            if not os.path.exists(dir_path):
                os.mkdir(dir_path)
            file_path = dir_path + '/' + img_name + '.jpg'
            with open(file_path,'wb') as fp:
                data = urllib.request.urlopen(img_src)
                data = data.read()
                fp.write(data)
    
        def insertdb(self,rank,movie_name,direct,main_act,show_time,country,movie_type,score,count,quote,path):
            if self.open:
                self.cursor.execute("insert into movies(`rank`,movie_name,direct,main_act,show_time,country,movie_type,score,count,quote,path)values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (rank,movie_name,direct,main_act,show_time,country,movie_type,score,count,quote,path))
            else:
                print("database not connected")
        def closeUp(self):
            if self.open:
                self.con.commit()
                self.con.close()
                self.open = False
                print("scraped", self.no, "records in total")
    
    
    url = 'https://movie.douban.com/top250'
    myspider = MySpider()
    myspider.startUp(url)
    myspider.closeUp()
    for t in myspider.Threads:
        t.join()
    print("End")
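
    One ordering note: closeUp() is called before the download threads are joined, which is safe here because every database write happens synchronously inside startUp(). If the worker threads also touched the database, the joins would have to come first:

    for t in myspider.Threads:
        t.join()            # wait for all poster downloads to finish
    myspider.closeUp()      # only then commit and close the connection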
    
    
    
    

    (2) Reflections

    The data available on the list page alone is slightly incomplete, so extracting every field from the current page's source takes some special-case checks. The same fields could be pulled from each movie's detail page instead, which would be more work, and some of the branching conditions are tricky to get right.
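
    For the record, here is a minimal sketch of that special-case branching, recovered from a commented-out draft of the parser (type is renamed movie_type to avoid shadowing the builtin; the sample d is illustrative):

    import re
    # d stands for the whitespace-stripped text of one li's div.bd p block, e.g.:
    d = '导演:弗兰克·德拉邦特FrankDarabont主演:蒂姆·罗宾斯TimRobbins...1994/美国/犯罪剧情'
    parts = d.split(":")
    if len(parts) == 2:                     # only one colon: the 主演 field is missing
        spl = parts[1]
        director = spl.split("...")[0][:-1]
        personator = "..."
        other = spl.split("...")[1]
        time = other.split("/")[0]
        country = other.split("/")[1]
        movie_type = other.split("/")[2]
    else:                                   # normal 导演/主演 layout with two colons
        others = parts[2]
        time = re.findall(r"\d{4}", others)[0]
        country = others.split("/")[-2]
        movie_type = others.split("/")[-1]
        personator = others.split("...")[0]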

    Assignment 2

    (1) Requirements:

         Master the serialized output of Item and Pipeline data in Scrapy; follow the Scrapy + XPath + MySQL storage route to scrape the 软科 (ShanghaiRanking) university rankings.
         Scrape the ranking list, follow each university's detail link, download and store the school logo, and collect the official website url, school profile, and other fields.
    
    graph TD
        A[Send the request] -->|fetch and parse the source| B(Partial fields extracted)
        B --> C(Each university's detail url)
        C -->|join into a full url| D[Fetch the detail page and extract the remaining fields]
        C -->|wait for the remaining fields| F[Store them in MySQL together with the earlier fields]

    spider

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/12/1
    import scrapy
    from bs4 import UnicodeDammit
    import requests
    from bs4 import BeautifulSoup
    from ..items import RuankeItem
    import os
    
    class spider_r(scrapy.Spider):
        name = 'spider_rk'
        def start_requests(self):
            url = 'https://www.shanghairanking.cn/rankings/bcur/2020'
            print(url)
            yield scrapy.Request(url=url,callback=self.parse)
        def parse(self, response):
            try:
                dammit = UnicodeDammit(response.body,["utf-8",'gbk'])
                data = dammit.unicode_markup
                selector = scrapy.Selector(text=data)
                trs = selector.xpath("//div[@class='rk-table-box']/table/tbody/tr")
                
                for tr in trs:
                    no= tr.xpath("./td[position()=1]/text()").extract_first().strip()
                    name = tr.xpath("./td[position()=2]/a/text()").extract_first()
                    next = tr.xpath("./td[position()=2]/a/@href").extract_first()
                    city = tr.xpath("./td[position()=3]/text()").extract_first().strip()
                    # print(no,name,city)
                    start_url = 'https://www.shanghairanking.cn/'+ next
                    # print(start_url)
                    html = requests.get(url=start_url)
                    dammit = UnicodeDammit(html.content,['utf-8','gbk'])
                    newdata = dammit.unicode_markup
                    soup = BeautifulSoup(newdata,'lxml')
                    url = File = brief = ''   # defaults, so the item fields below are always defined
                    try:
                        url = soup.select("div[class='univ-website'] a")[0].text
                        mFileq = soup.select("td[class='univ-logo'] img")[0]["src"]
                        File = str(no) + '.jpg'
                        logodata = requests.get(url=mFileq).content
                        # local folder for the logos (backslashes restored; the blog renderer ate them)
                        path = r'D:\anaconda\example\ruankelogo'
                        if not os.path.exists(path):
                            os.mkdir(path)
                        file_path = path + '/' + File
                        with open(file_path,'wb') as fp:
                            fp.write(logodata)
                        brief = soup.select("div[class='univ-introduce'] p")[0].text
                    except Exception as err:
                        print(err)
                    item = RuankeItem()
                    item["sNo"] = no
                    item["schoolName"] = name
                    item["city"] = city
                    item["officalUrl"] = url
                    item["info"] = brief
                    item["mFile"] = File
                    yield item
    
            except Exception as err:
                print(err)
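
    Assuming the standard Scrapy project layout (only the spider, items, and pipeline files are shown here), the spider is run from the project root with:

    scrapy crawl spider_rk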
    
    

    item

    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class RuankeItem(scrapy.Item):
        sNo = scrapy.Field()
        schoolName = scrapy.Field()
        city = scrapy.Field()
        officalUrl = scrapy.Field()
        info = scrapy.Field()
        mFile = scrapy.Field()
    
    

    pipelines

    from itemadapter import ItemAdapter
    import pymysql
    class RuankePipeline:
        def open_spider(self, spider):
            print("connecting to the database")
            try:
                self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='031804114.hao', db='mydb',
                                           charset='utf8')
                self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
    
                try:
                    self.cursor.execute("drop table if exists ruanke")
                    sql = """create table ruanke(
                        sNo varchar(32) primary key,
                        schoolName varchar(32),
                        city varchar(32),
                        officalUrl varchar(64),
                        info text,
                        mFile varchar(32)
                    )character set = utf8
                    """
                    self.cursor.execute(sql)
                except Exception as err:
                    print(err)
                print("failed to create table")
                self.open = True
                self.count = 0   # rows successfully inserted
            except Exception as err:
                print(err)
                self.open = False
            print("database connection failed")
    
        def process_item(self, item, spider):
            print(item['sNo'], item['schoolName'], item['city'], item['officalUrl'], item['info'], item['mFile'])
            if self.open:
                try:
                    self.cursor.execute(
                        "insert into ruanke(sNo,schoolName,city,officalUrl,info,mFile) values(%s,%s,%s,%s,%s,%s)", 
                        (item['sNo'], item['schoolName'], item['city'], item['officalUrl'], item['info'], item['mFile']))
                    self.count += 1
                except:
                print("insert failed")
            else:
            print("database not connected")
            return item
    
        def close_spider(self, spider):
            if self.open:
                self.con.commit()
                self.con.close()
                self.open = False
                print('closed')
            print("scraped", self.count, "records in total")
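
    The post omits settings.py, and Scrapy only routes items through the pipeline if it is enabled there. Assuming the project is named ruanke (as the RuankeItem import path suggests), the relevant setting would be:

    # settings.py -- assumed project name: ruanke
    ITEM_PIPELINES = {
        'ruanke.pipelines.RuankePipeline': 300,
    }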
    
    
    
    



    (2) Reflections

    After fetching the page source, selector.xpath pulls out the trs; looping over each tr yields part of the fields. The detail href in each row is then joined into the university's full url, a GET request is sent, and BeautifulSoup parses the remaining data out of the detail page. Overall it went smoothly and was another round of practice with the Scrapy framework, though Selenium still feels more convenient for scraping sites like this.
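
    One robustness note on the url joining described above: the spider concatenates 'https://www.shanghairanking.cn/' + next, which produces a double slash whenever the href already starts with '/'. Scrapy's response.urljoin resolves hrefs against the page url correctly in both cases; a minimal sketch:

    # inside parse(self, response):
    next_href = tr.xpath("./td[position()=2]/a/@href").extract_first()
    start_url = response.urljoin(next_href)   # handles relative and root-relative hrefs alike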

    Assignment 3

    (1) Requirements:

         Master locating HTML elements with Selenium, scraping Ajax-loaded page data, waiting for HTML elements to load (see the sketch after this list), and navigating between pages.
         Use the Selenium framework + MySQL storage to simulate logging in to 中国大学MOOC (icourse163.org), scrape the courses under your own account, and save them to MySQL.
         The mock-login step must be recorded as a gif.
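
    The script below waits with bare lambdas, which works but hides intent. Selenium's expected_conditions helpers express the same wait more clearly; a minimal sketch (the XPath is shortened from the script's and is illustrative):

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By

    wait = WebDriverWait(browser, 10)
    # blocks until the element is present in the DOM; raises TimeoutException after 10 s
    login_link = wait.until(EC.presence_of_element_located(
        (By.XPATH, "//div[@class='u-navLogin-loginBox']//a")))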
    

    #!/usr/bin/env python
    # _*_ coding:utf-8 _*_
    # author: xm time:2020/11/25
    from selenium import webdriver
    import time
    import os
    import selenium.webdriver.support.ui as ui
    from selenium.webdriver.common.by import By
    import pymysql
    from selenium.webdriver.common.action_chains import ActionChains
    from webdriver_manager.chrome import ChromeDriverManager
    # instantiate a browser object; webdriver_manager downloads a matching chromedriver
    browser = webdriver.Chrome(ChromeDriverManager().install())
    # helper class that stores one course record into MySQL
    class down_mysql:
        def __init__(self, i,course,college,teacher,team,count,process,brief):
            self.id = i
            self.course = course
            self.college = college
            self.teacher = teacher
            self.team = team
            self.count = count
            self.process = process
            self.brief = brief
            self.connect = pymysql.connect(
                host='localhost',
                db = 'gupiao',
                port = 3306,
                user = 'root',
                passwd = '031804114.hao',
                charset = 'utf8',
                use_unicode =False
            )
            self.cursor = self.connect.cursor()
    
        # save the record to MySQL
        def save_mysql(self):
            sql = "insert into selenium_mooc(id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief) values (%s,%s,%s,%s,%s,%s,%s,%s)"
    
            try:
                self.cursor.execute(sql, (self.id, self.course, self.college,self.teacher,self.team,
                                          self.count,self.process,self.brief))
    
                self.connect.commit()
                print('insert succeeded')
            except:
                print('insert failed')
    
    # helper: build a down_mysql object and save the record
    def mysql(i,course,college,teacher,team,count,process,brief):
        down = down_mysql(i,course,college,teacher,team,count,process,brief)
        down.save_mysql()
    
    def main():
    
        start_url = "https://www.icourse163.org/"
        wait = ui.WebDriverWait(browser,10)
        browser.get(start_url)
        wait.until(lambda browser: browser.find_element_by_xpath("//div[@id='g-container']/div[@id='j-indexNav-bar']/div/div/div/div/div[@class='web-nav-right-part']/div[@class='u-navLogin-loginBox']/div/div/div/a"))
    
        print("you did it")
        browser.find_element_by_xpath("//div[@id='g-container']/div[@id='j-indexNav-bar']/div/div/div/div/div[@class='web-nav-right-part']/div[@class='u-navLogin-loginBox']/div/div/div/a").click()
        time.sleep(2)
    
        browser.find_element_by_xpath("//div[@class='mooc-login-set']/div/div[@class='ux-login-set-scan-code_ft']/span").click()
        time.sleep(2)
        browser.find_element_by_xpath("//ul[@class='ux-tabs-underline_hd']/li[position()=2]").click()
        time.sleep(2)
        wait.until(lambda browser: browser.find_element_by_xpath("//div[@class='ux-login-urs-phone-wrap f-pr']/div[@class='ux-login-set-container']/iframe"))
    
        iframe=browser.find_element_by_xpath("//div[@class='ux-login-urs-phone-wrap f-pr']/div[@class='ux-login-set-container']/iframe")
        browser.switch_to.frame(iframe)   # the login form lives inside an iframe
        print("switched into the login iframe")
        browser.find_element_by_id("phoneipt").clear()
        browser.find_element_by_id("phoneipt").send_keys("13055720097")
        time.sleep(3)
        browser.find_element_by_xpath("//div[@class='u-input box']/input[@class='j-inputtext dlemail']").clear()
        browser.find_element_by_xpath("//div[@class='u-input box']/input[@class='j-inputtext dlemail']").send_keys("*******")
        time.sleep(3)
        browser.find_element_by_xpath("//div[@class='f-cb loginbox']/a").click()
        browser.switch_to.default_content()
        time.sleep(6)
        # browser.maximize_window()
        # browser.find_element_by_xpath("//div[@class='u-navLogin-container']/div[@class='e-hover-source u-navLogin-course']/a").click()
        a = wait.until(lambda browser: browser.find_element_by_xpath("//div[@class='web-nav-right-part']/div[@class='ga-click u-navLogin-myCourse']/div[@class='u-navLogin-myCourse-t']/div[@class='ga-click u-navLogin-myCourse u-navLogin-center-container']/a"))
        xiangqing = a.get_attribute("href")
        print(xiangqing)
        browser.find_element_by_xpath("//div[@class='web-nav-right-part']/div[@class='ga-click u-navLogin-myCourse']/div[@class='u-navLogin-myCourse-t']/div[@class='ga-click u-navLogin-myCourse u-navLogin-center-container']/a").click()
        time.sleep(3)
        # personal center (my courses)
        browser.forward()
        for page in range(3):   # the account has three pages of courses
            wait.until(lambda b2:browser.find_element_by_xpath("//div[@class='course-panel-wrapper']/div[@class='course-panel-body-wrapper']"))
            one_page = browser.find_element_by_xpath("//div[@class='course-panel-wrapper']/div[@class='course-panel-body-wrapper']")
            # print("找到第一页了")
            i = 0
            for one in one_page.find_elements_by_xpath("./div[@class='course-card-wrapper']"):
                i+=1
                sandian = one.find_element_by_xpath("./div[@class='menu-btn']")
                action = ActionChains(browser)
                action.move_to_element(sandian).perform()
                time.sleep(1)
                one.find_element_by_xpath("./div[@class='menu']/div[position()=1]/a").click()
                time.sleep(1)
                # the click opens the course page in a new window
                windows = browser.window_handles
                browser.switch_to.window(windows[1])   # switch to the new window
                time.sleep(2)
                wait.until(lambda browser2: browser2.find_element_by_xpath(
                    "//div[@class='f-fl course-title-wrapper']/span[position()=1]"))
                name = browser.find_element_by_xpath("//div[@class='f-fl course-title-wrapper']/span[position()=1]")
                course = name.text
                Process = browser.find_element_by_xpath(
                    "//div[@class='course-enroll-info_course-info_term-info_term-time']/span[position()=2]")
                process = Process.text
                people = browser.find_element_by_xpath("//div[@class='course-enroll-info_course-enroll_price-enroll']/span")
                count = people.text.split()[1]
                jieshao = browser.find_element_by_xpath("//div[@class='course-heading-intro']/div[position()=1]")
                brief = jieshao.text
                college = browser.find_element_by_xpath("//div[@class='m-teachers']/a").get_attribute("data-label")
                wait.until(lambda browser2: browser2.find_element_by_xpath(
                    "//div[@class='um-list-slider f-pr']/div[@class='um-list-slider_con']"))
                teams = browser.find_element_by_xpath("//div[@class='um-list-slider f-pr']/div[@class='um-list-slider_con']")
                T = teams.find_element_by_xpath("./div[position()=1]/div/div[@class='cnt f-fl']/h3")
                teacher = T.text
                team = ''
                for each in teams.find_elements_by_xpath("./div"):
                    team_part = each.find_element_by_xpath("./div/div[@class='cnt f-fl']/h3")
                    part = team_part.text
                    team += part
                print(course,team,count,process,brief)
                mysql(i, course, college, teacher, team, count, process, brief)
                browser.close()   # close the course page
                time.sleep(1)
                browser.switch_to.window(windows[0])   # switch back to the course list
                time.sleep(1)
            # click through to the next page
            wait.until(lambda browser:browser.find_element_by_xpath("//li[@class='ux-pager_btn ux-pager_btn__next']/a"))
            browser.find_element_by_xpath("//li[@class='ux-pager_btn ux-pager_btn__next']/a").click()
    
    
    if __name__ == '__main__':
    
        main()
    
    
    
    


    (2) Reflections

    Locate the login button and click it; click "other login methods", then the phone-number tab; locate the input boxes and send_keys the account and password; click login; locate the personal center and click it. After forward(), hover the mouse over the top-right corner of each course card, click through to the course introduction, switch to the new window, grab the wanted fields, close it, switch back to the original window, then locate the next-page button and click to turn the page. Done.
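
    The switch/close/switch-back dance is the easiest part to get wrong, so here it is stripped of the scraping. A minimal sketch, where link stands for an already-located element that opens a new window:

    main_window = browser.current_window_handle
    link.click()                                    # opens the course page in a second window
    new_window = [h for h in browser.window_handles if h != main_window][0]
    browser.switch_to.window(new_window)            # find_element calls now target the new page
    title = browser.title                           # ...scrape whatever is needed...
    browser.close()                                 # closes only the new window
    browser.switch_to.window(main_window)           # back to the course list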
