zoukankan      html  css  js  c++  java
  • [Python爬虫] 之三十:Selenium +phantomjs 利用 pyquery抓取栏目

      

      一、介绍

        本例子用Selenium +phantomjs爬取栏目(http://tv.cctv.com/lm/)的信息

       

      二、网站信息

        

        

        

        

      

      三、数据抓取

        首先抓取所有要抓取的网页链接,共39页,保存到数据库里面

        

        def getUrls(self):
            """Build the listing-page URLs and hand them to the database layer.

            The first entry is the plain landing page; entries for pages 2..39
            are the ``javascript:`` snippets that switch the page in place. The
            whole list is stored via ``self.db.SaveCCTVColumnUrls`` with the
            sub-URL flag ``'0'``. Nothing is returned.
            """
            page_script = "javascript:window.scroll(0,145);DataInteraction({0});showPageTitle_fenyei2('ELMT1413526954890942',{0});"
            pages = [page_script.format(page) for page in range(2, 40)]
            self.db.SaveCCTVColumnUrls(['http://tv.cctv.com/lm/'] + pages, '0')
    View Code

        针对上面的网站信息,来进行抓取

        1、首先抓取信息列表

          

          抓取代码:Elements = doc("div[id='text_box_0']").find('dl').find('dd')

        2、栏目名称,链接

          

          column1Element = element.find('div[class="text"]').find('h3').find('a')

          columnName = column1Element.text().encode('utf8').replace(',', ',').replace('\n', '')

          columnUrl = column1Element.attr('href')

      四、实现代码

        

    # coding=utf-8
    import os
    import re
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from datetime import datetime,timedelta
    import selenium.webdriver.support.ui as ui
    import time
    from pyquery import PyQuery as pq
    import columnData
    import mongoDB
    class cctvColumnInfo:
        """Crawler for the CCTV column directory at http://tv.cctv.com/lm/.

        Pages are rendered with a headless PhantomJS browser driven through
        Selenium, parsed with pyquery, and persisted through
        ``mongoDB.mongoDbBase``. The public methods are separate stages:

        * ``getUrls``          -- store the listing-page URLs.
        * ``CatchData``        -- scrape every column on each listing page.
        * ``getBroadCast``     -- visit each column page for its schedule.
        * ``exportColumnInfo`` -- dump the stored columns as CSV lines.

        NOTE: scraped text is encoded to UTF-8 byte strings and then processed
        with plain ``str`` literals, which is Python 2 behaviour. Only syntax
        valid in both Python 2 and 3 is used here, but the scraping stages
        still expect Python 2 string semantics.
        """

        # First listing page; the remaining pages are reached by running the
        # site's own pagination JavaScript on top of this page.
        FIRST_PAGE_URL = 'http://tv.cctv.com/lm/'
        PAGE_SCRIPT = "javascript:window.scroll(0,145);DataInteraction({0});showPageTitle_fenyei2('ELMT1413526954890942',{0});"
        LAST_PAGE = 39

        # One CSV line per column, in this field order: column name,
        # first-broadcast time, replay time, broadcast channel, host, column
        # URL, column name with time, its URL, past-videos URL, official-site
        # URL, column image URL.
        CSV_ROW = '\n{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'

        # Ordered clean-ups applied to the channel text on export. The order
        # matters: 'CCTV13' has to be rewritten before 'CCTV1'.
        CHANNEL_FIXES = (
            ("播出频道:", ""),
            ("独播频道:", ""),
            ("首播频道:", ""),
            (")", ""),
            ("(", ""),
            ("CCTV-8电视剧", "CCTV-8 电视剧"),
            ("CCTV-1综合频道", "CCTV-1 综合频道"),
            ("CCTV-1高清频道", "CCTV-1 高清频道"),
            ("CCTV13", "CCTV-13"),
            ("CCTV1", "CCTV-1"),
            ("CCTV-少儿", "CCTV-14 少儿"),
            ("CCTV6", "CCTV-6"),
            ("CCTV-12社会与法", "CCTV-12 社会与法"),
        )

        def __init__(self):
            """Start PhantomJS (image loading disabled) and open the database wrapper."""
            self.driver = webdriver.PhantomJS(service_args=['--load-images=false'])
            self.driver.set_page_load_timeout(10)
            self.driver.maximize_window()
            self.db = mongoDB.mongoDbBase()

        # ------------------------------------------------------------------
        # small private helpers
        # ------------------------------------------------------------------
        def _outputPath(self, fileName):
            """Return the path of *fileName* inside ./cctvColumn, creating the folder if needed."""
            folder = os.path.join(os.getcwd(), 'cctvColumn')
            if not os.path.isdir(folder):
                os.makedirs(folder)
            return os.path.join(folder, fileName)

        def _cleanText(self, text):
            """UTF-8 encode scraped text and drop commas and newlines (they would break the CSV)."""
            return text.encode('utf8').replace(',', '').replace('\n', '')

        def _pageSource(self):
            """Return the DOM of the current page as the browser sees it after scripts ran."""
            return self.driver.execute_script("return document.documentElement.outerHTML")

        def _splitOnMarker(self, text, marker):
            """Split *text* on *marker*; return (head, marker + second piece).

            *marker* must occur in *text*. Like the ``split`` it wraps, anything
            after a second occurrence of *marker* is discarded.
            """
            pieces = text.split(marker)
            return pieces[0], marker + pieces[1]

        def _utf8(self, value):
            """UTF-8 encode *value*, passing empty/None values through untouched."""
            return value.encode('utf8') if value else value

        def _normalizeChannel(self, channel):
            """Apply CHANNEL_FIXES, in order, to a channel string."""
            for old, new in self.CHANNEL_FIXES:
                channel = channel.replace(old, new)
            return channel

        # ------------------------------------------------------------------
        # file output
        # ------------------------------------------------------------------
        def WriteUrl(self, url):
            """Append a newline followed by *url* to cctvColumn/cctvColumn_url.txt under the cwd."""
            with open(self._outputPath('cctvColumn_url.txt'), 'a') as f:
                f.write('\n' + url)

        def WriteLog(self, message, date):
            """Append *message* to cctvColumn/cctvColumn-<date>.txt under the cwd."""
            with open(self._outputPath('cctvColumn-' + date + '.txt'), 'a') as f:
                f.write(message)

        # ------------------------------------------------------------------
        # stage 1: listing-page URLs
        # ------------------------------------------------------------------
        def getUrls(self):
            """Store the landing page plus the pagination scripts for pages 2..39.

            The list is passed to ``self.db.SaveCCTVColumnUrls`` with the
            sub-URL flag ``'0'``. Nothing is returned.
            """
            urls = [self.FIRST_PAGE_URL]
            for index in range(2, self.LAST_PAGE + 1):
                urls.append(self.PAGE_SCRIPT.format(index))
            self.db.SaveCCTVColumnUrls(urls, '0')

        # ------------------------------------------------------------------
        # stage 2: scrape the listing pages
        # ------------------------------------------------------------------
        def getColumnInfo(self, colInfo):
            """Split a column summary into (first-broadcast time, host, channel).

            *colInfo* is expected to look like
            ``<time>主持人<host>播出频道<channel>``. The host and channel parts
            are returned with their marker prefixed. A part whose marker is
            missing is returned as ``''`` instead of raising ``IndexError``.
            """
            parts = colInfo.split('主持人')
            firstBroadcastTime = parts[0]
            if len(parts) < 2:
                return firstBroadcastTime, '', ''
            hostParts = parts[1].split('播出频道')
            columnHost = '主持人' + hostParts[0]
            broadcastChannel = '播出频道' + hostParts[1] if len(hostParts) > 1 else ''
            return firstBroadcastTime, columnHost, broadcastChannel

        def _parseColumn(self, element):
            """Extract one column from its ``<dd>`` element.

            Returns ``(colobj, row)``: a populated ``columnData`` object and a
            tuple of the same values in CSV_ROW order.
            """
            colobj = columnData.columnData()
            textBox = element.find('div[class="text"]')

            # Column title and link.
            titleLink = textBox.find('h3').find('a')
            columnName = self._cleanText(titleLink.text())
            columnUrl = titleLink.attr('href')
            colobj.setColumnName(columnName)
            colobj.setColumnUrl(columnUrl)

            # Title variant that carries the air time, with its own link.
            timeLink = textBox.find('p').find('a')
            columnTimeName = self._cleanText(timeLink.text())
            columnTimeUrl = timeLink.attr('href')
            colobj.setColumnTimeName(columnTimeName)
            colobj.setColumnTimeUrl(columnTimeUrl)

            # First link is "past videos"; any later link is taken as the
            # official site (the last one wins). Setters are only called for
            # links that are actually present.
            pastVideoUrl = ''
            officialWebsiteUrl = ''
            for position, link in enumerate(textBox.find('span').find('a').items()):
                if position == 0:
                    pastVideoUrl = link.attr('href')
                    colobj.setPastVideoUrl(pastVideoUrl)
                else:
                    officialWebsiteUrl = link.attr('href')
                    colobj.setOfficialWebsiteUrl(officialWebsiteUrl)

            # The image wrapper is class "img" on some entries and "image" on others.
            colImgUrl = element.find('div[class="img"]').find('a').find('img').attr('src')
            if colImgUrl is None:
                colImgUrl = element.find('div[class="image"]').find('a').find('img').attr('src')
            colobj.setColImgUrl(colImgUrl)

            # Summary block: when several blocks exist the last one wins.
            firstBroadcastTime = ''
            columnHost = ''
            firstBroadcastChannel = ''
            for colInfo in element.find('div[class="lr"]').find('div').items():
                firstBroadcastTime, columnHost, firstBroadcastChannel = self.getColumnInfo(
                    self._cleanText(colInfo.text()))
            colobj.setColumnHost(columnHost)
            colobj.setFirstBroadcastChannel(firstBroadcastChannel)
            colobj.setFirstBroadcastTime(firstBroadcastTime)

            # The replay time is not available on the listing page, hence ''.
            row = (columnName, firstBroadcastTime, '', firstBroadcastChannel, columnHost,
                   columnUrl, columnTimeName, columnTimeUrl, pastVideoUrl,
                   officialWebsiteUrl, colImgUrl)
            return colobj, row

        def _crawlListPage(self, url, itemIndex):
            """Scrape every column on one listing page; return the updated running index."""
            if url == self.FIRST_PAGE_URL:
                self.driver.get(url)
            else:
                # The other "URLs" are pagination scripts run inside the loaded page.
                self.driver.execute_script(url)
            time.sleep(2)
            doc = pq(self._pageSource())
            entries = doc("div[id='text_box_0']").find('dl').find('dd')

            rows = []
            names = [url.encode('utf8')]
            print(url)
            for element in entries.items():
                itemIndex += 1
                colobj, row = self._parseColumn(element)
                columnName = row[0]
                columnUrl = row[5]
                names.append(columnName)
                print(columnName)
                rows.append(self.CSV_ROW.format(*row))
                self.db.SaveCCTVColumnData(colobj, itemIndex)
                self.db.SaveCCTVColumnUrl(columnUrl, '1', columnName)

            self.WriteLog(''.join(rows), time.strftime('%Y-%m-%d'))
            self.WriteUrl('\n'.join(names))
            self.db.SetCCTVColumnUrlCrawlState(url)
            return itemIndex

        def CatchData(self):
            """Scrape all pending listing pages, then shut the browser down.

            A page that times out is reported and skipped. The browser is
            closed even when an unexpected exception aborts the crawl, so no
            PhantomJS process is left behind.
            """
            itemIndex = 0
            try:
                for u in self.db.GetCCTVColumnUrls():
                    url = u['url']
                    try:
                        itemIndex = self._crawlListPage(url, itemIndex)
                    except TimeoutException:
                        print('timeout url:  ' + url)
            finally:
                self.driver.close()
                self.driver.quit()

        # ------------------------------------------------------------------
        # stage 3: per-column schedule pages
        # ------------------------------------------------------------------
        def getBroadCast(self):
            """Visit each pending column page and store its broadcast schedule.

            The schedule is read from the first three ``p.p_1`` paragraphs
            (first-broadcast time, replay time, channel). Pages without such
            paragraphs fall back to the ``div.head_msg`` table, whose text is
            parsed by ``getBroadInfo``. The record is only updated when a
            channel was found; the URL is marked as crawled either way.
            """
            for u in self.db.GetSubCCTVColumnUrls():
                url = u['url']
                columnName = u['columnName']
                try:
                    self.driver.get(url)
                    time.sleep(2)
                    doc = pq(self._pageSource())

                    paragraphs = list(doc("p[class='p_1']").items())[:3]
                    schedule = [self._cleanText(p.text()) for p in paragraphs]
                    if schedule:
                        # Pad so that pages with fewer than three paragraphs still unpack.
                        schedule += [''] * (3 - len(schedule))
                        firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel = schedule
                    else:
                        firstBroadcastTime = ''
                        ReplayBroadcastTime = ''
                        firstBroadcastChannel = ''
                        tableRows = doc("div[class='head_msg']").find('table').find('tbody').find('tr')
                        messsage = ''.join(self._cleanText(row.text()) for row in tableRows.items())
                        if messsage:
                            firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel = self.getBroadInfo(
                                columnName.encode('utf8'), messsage)
                    self.db.SetCCTVColumnUrlCrawlState(url)

                    if firstBroadcastChannel:
                        colobj = columnData.columnData()
                        colobj.setColumnName(columnName)
                        colobj.setFirstBroadcastTime(firstBroadcastTime)
                        colobj.setFirstBroadcastChannel(firstBroadcastChannel)
                        colobj.setReplayBroadcastTime(ReplayBroadcastTime)
                        self.db.UpdateCCTVColumnData(colobj)
                        print('\n')
                        print(url)
                        print(columnName)
                        print(firstBroadcastTime)
                        print(firstBroadcastChannel)
                        print(ReplayBroadcastTime)
                except TimeoutException:
                    print('TimeoutException:' + url)

        def _editionTimes(self, schedule, ownFirst, ownReplay, otherFirst):
            """Pick one edition's times out of a combined two-edition schedule.

            *schedule* lists first/replay times of this edition followed by the
            other edition's. Returns ``(first, replay)`` with the standard
            prefixes, or ``None`` when the expected markers are absent.
            """
            if otherFirst not in schedule:
                return None
            ownPart = schedule.split(otherFirst)[0]
            if ownReplay not in ownPart:
                return None
            pieces = ownPart.split(ownReplay)
            return ('首播时间: ' + pieces[0].replace(ownFirst, ''),
                    '重播时间: ' + pieces[1].replace(': ', ''))

        def getBroadInfo(self, columnName, column):
            """Parse a free-text schedule into (first time, replay time, channel).

            *column* is the concatenated text of the header table, e.g.
            ``'首播频道: CCTV-14首播时间: 周三17:15'``. *columnName* is only used
            to choose between the two editions of a column whose table lists
            both. Parts that cannot be located are returned as ``''``.
            """
            firstBroadcastTime = ''
            ReplayBroadcastTime = ''
            firstBroadcastChannel = ''

            # Strip navigation text and anything after '>>' or the WeChat blurb.
            column = column.replace('栏目大全', '')
            column = column.partition('>>')[0]
            # Normalise channel spelling; 'CCTV13' must go before 'CCTV1'.
            column = column.replace('CCTV13', 'CCTV-13')
            column = column.replace('CCTV6', 'CCTV-6')
            column = column.replace('CCTV1', 'CCTV-1')
            column = column.partition('官方微信')[0]

            if '首播时间' in column:
                if '重播时间' in column:
                    cols = column.split('重播时间')
                    firstBroadcastTime = cols[0]
                    for marker in ('独播频道', '首播频道', '播出频道'):
                        if marker in cols[1]:
                            replay, firstBroadcastChannel = self._splitOnMarker(cols[1], marker)
                            ReplayBroadcastTime = '重播时间' + replay
                            break
                elif '独播频道' in column:
                    firstBroadcastTime, firstBroadcastChannel = self._splitOnMarker(column, '独播频道')
                elif '播出频道' in column:
                    firstBroadcastTime, firstBroadcastChannel = self._splitOnMarker(column, '播出频道')
                elif '首播频道' in column:
                    if column.startswith('首播频道'):
                        # Channel comes first here, so split on the time marker instead.
                        firstBroadcastChannel, firstBroadcastTime = self._splitOnMarker(column, '首播时间')
                    else:
                        firstBroadcastTime, firstBroadcastChannel = self._splitOnMarker(column, '首播频道')
            elif '首播(' in column and '重播(' in column:
                if '独播频道' in column:
                    firstBroadcastTime, firstBroadcastChannel = self._splitOnMarker(column, '独播频道')
                    # The table lists both editions, e.g.
                    # '首播(生活): ...重播(生活): ...首播(文史): ...重播(文史): ...';
                    # keep only the times of the edition named in columnName.
                    times = None
                    if '(生活版)' in columnName:
                        times = self._editionTimes(firstBroadcastTime, '首播(生活): ', '重播(生活)', '首播(文史)')
                    elif '(文史版)' in columnName:
                        times = self._editionTimes(firstBroadcastTime, '首播(文史): ', '重播(文史)', '首播(生活)')
                    if times is not None:
                        firstBroadcastTime, ReplayBroadcastTime = times
                elif '播出频道' in column:
                    firstBroadcastTime, firstBroadcastChannel = self._splitOnMarker(column, '播出频道')
                elif '首播频道' in column:
                    firstBroadcastTime, firstBroadcastChannel = self._splitOnMarker(column, '首播频道')
            return firstBroadcastTime, ReplayBroadcastTime, firstBroadcastChannel

        # ------------------------------------------------------------------
        # stage 4: export
        # ------------------------------------------------------------------
        def exportColumnInfo(self):
            """Append every stored column to today's log file as one CSV line.

            Label prefixes are stripped from the time fields and the channel
            text is normalised with CHANNEL_FIXES. URL fields that are empty or
            ``None`` are written as they are instead of raising on ``encode``.
            """
            for col in self.db.GetCCTVColumnData():
                columnName = col['columnName'].encode('utf8')
                firstBroadcastTime = col['firstBroadcastTime'].encode('utf8').replace('首播时间: ', '')
                firstBroadcastChannel = self._normalizeChannel(col['firstBroadcastChannel'].encode('utf8'))
                replayBroadcastTime = col['replayBroadcastTime'].encode('utf8').replace('重播时间:', '')
                columnHost = col['columnHost'].encode('utf8')
                columnTimeName = col['columnTimeName'].encode('utf8')

                columnUrl = self._utf8(col['columnUrl'])
                columnTimeUrl = self._utf8(col['columnTimeUrl'])
                officialWebsiteUrl = self._utf8(col['officialWebsiteUrl'])
                pastVideoUrl = self._utf8(col['pastVideoUrl'])
                colImgUrl = self._utf8(col['colImgUrl'])

                message = self.CSV_ROW.format(columnName, firstBroadcastTime,
                                              replayBroadcastTime,
                                              firstBroadcastChannel, columnHost,
                                              columnUrl, columnTimeName,
                                              columnTimeUrl, pastVideoUrl,
                                              officialWebsiteUrl, colImgUrl)
                self.WriteLog(message, time.strftime('%Y-%m-%d'))
    
    # Script entry point: build the crawler and run one stage of the pipeline.
    # Only the export stage is active here; the earlier stages are left
    # commented out and are presumably enabled one at a time, in the order
    # listed -- TODO confirm with the author.
    obj = cctvColumnInfo()
    # obj.getUrls()
    # obj.CatchData()
    # obj.getBroadCast()
    obj.exportColumnInfo()
    View Code
    # coding=utf-8
    import os
    from pymongo import MongoClient
    from pymongo import ASCENDING, DESCENDING
    import codecs
    import time
    import columnData
    import datetime
    import re
    
    class mongoDbBase:
        """MongoDB access layer for the CCTV column crawler.

        Two collections are used:

        * ``column_data`` -- one document per column, keyed by ``columnName``.
        * ``columnurls``  -- crawl queue, keyed by ``url``. ``iscrawl`` is
          ``'0'`` until the URL has been processed; ``suburl`` is ``'0'`` for
          listing pages and ``'1'`` for individual column pages.
        """

        def __init__(self, connstr='mongodb://ott:ott@127.0.0.1:27017/', mongodbName='OTT'):
            """Connect to the local MongoDB server and authenticate.

            Args:
                connstr: accepted for backward compatibility but not used; the
                    connection always goes to 127.0.0.1:27017 with the
                    ``ott``/``ott`` credentials.
                mongodbName: name of the database to open (previously ignored
                    in favour of a hard-coded ``OTT``, which is still the
                    default).
            """
            client = MongoClient('127.0.0.1', 27017)
            self.db = client[mongodbName]
            self.db.authenticate('ott', 'ott')

        def _isMissing(self, collection, query):
            """Return True when no document in *collection* matches *query*."""
            return collection.find(query).count() == 0

        def SaveCCTVColumnData(self, columnData, index):
            """Insert a column document unless one with the same name already exists.

            ``replayBroadcastTime`` is stored empty here; it is filled in later
            by ``UpdateCCTVColumnData``.
            """
            if not self._isMissing(self.db.column_data, {'columnName': columnData.getColumnName()}):
                return
            self.db.column_data.insert({
                'columnName': columnData.getColumnName(),
                'firstBroadcastTime': columnData.getFirstBroadcastTime(),
                'replayBroadcastTime': '',
                'firstBroadcastChannel': columnData.getFirstBroadcastChannel(),
                'columnHost': columnData.getColumnHost(),
                'columnUrl': columnData.getColumnUrl(),
                'columnTimeName': columnData.getColumnTimeName(),
                'columnTimeUrl': columnData.getColumnTimeUrl(),
                'officialWebsiteUrl': columnData.getOfficialWebsiteUrl(),
                'pastVideoUrl': columnData.getPastVideoUrl(),
                'colImgUrl': columnData.getColImgUrl(),
                'index': index,
            })

        def GetCCTVColumnData(self):
            """Return a cursor over all column documents, without ``_id``."""
            return self.db.column_data.find({}, {'_id': 0})

        def UpdateCCTVColumnData(self, columnData):
            """Overwrite the schedule fields of the document matching the column name."""
            changes = {'$set': {'replayBroadcastTime': columnData.getReplayBroadcastTime(),
                                'firstBroadcastTime': columnData.getFirstBroadcastTime(),
                                'firstBroadcastChannel': columnData.getFirstBroadcastChannel()}}
            self.db.column_data.update({"columnName": columnData.getColumnName()}, changes)

        def SaveCCTVColumnUrl(self, url, suburl, columnName):
            """Queue *url* as not yet crawled unless it is already stored."""
            if self._isMissing(self.db.columnurls, {'url': url}):
                self.db.columnurls.insert(
                    {'url': url, 'iscrawl': '0', 'suburl': suburl, 'columnName': columnName})

        def SaveCCTVColumnUrls(self, urlList, suburl):
            """Queue every URL of *urlList* that is not stored yet.

            ``index`` numbers only the URLs inserted by this call, starting at
            0; URLs that already exist are skipped without consuming a number.
            """
            index = 0
            for url in urlList:
                if not self._isMissing(self.db.columnurls, {'url': url}):
                    continue
                self.db.columnurls.insert(
                    {'url': url, 'iscrawl': '0', 'suburl': suburl, 'index': index})
                index += 1

        def GetCCTVColumnUrls(self):
            """Return a cursor over listing-page URLs that still need crawling."""
            return self.db.columnurls.find({'iscrawl': '0', 'suburl': '0'}, {'_id': 0, 'url': 1})

        def GetSubCCTVColumnUrls(self):
            """Return a cursor over column-page URLs (with column name) that still need crawling."""
            return self.db.columnurls.find({'iscrawl': '0', 'suburl': '1'},
                                           {'_id': 0, 'url': 1, 'columnName': 1})

        def SetCCTVColumnUrlCrawlState(self, url):
            """Mark *url* as crawled."""
            self.db.columnurls.update({'url': url}, {'$set': {'iscrawl': '1'}})
    
        
    
    
    # d = mongoDbBase()
    
    # urls = []
    # urls.append('abc')
    # # d.SaveUrls(urls)
    # d.SetUrlCrawlState(urls)
    View Code
        def download(self, url, name):
            """Download *url* and save the response body to the file *name*.

            The request is limited to 5 seconds. Any request failure
            (connection error, timeout, or an HTTP error status) is reported
            with a printed message and nothing is written, so an error page is
            never saved as if it were the image. Errors raised while writing
            the file are not caught.
            """
            try:
                pic = requests.get(url, timeout=5)
                # Treat 4xx/5xx answers as failures instead of saving their body.
                pic.raise_for_status()
            except requests.exceptions.RequestException:
                print('当前图片无法下载')
                return
            with open(name, 'wb') as f:
                f.write(pic.content)
  • 相关阅读:
    tableviewCell折叠状态1
    iOS中--NSArray调用方法详解 (李洪强)
    NSNumber的使用
    Fedora13下编译busybox-1.15.0出现can not find lcrypt错误
    【独立开发人员er Cocos2d-x实战 013】Cocos2dx 网络编程实战之星座运势
    JAVA序列化的作用
    我买网B轮融资成功,五周年豪掷千万回馈会员
    一步步教你搭建TinyOS2.1.2开发环境
    POJ2947 DAZE [Gauss]
    慢慢理解RESTful架构
  • 原文地址:https://www.cnblogs.com/shaosks/p/8759388.html
Copyright © 2011-2022 走看看