zoukankan html css js c++ java

[Python爬虫] 之三十一：Selenium +phantomjs 利用 pyquery抓取消费主张信息

　　一、介绍

　　　　本例子用Selenium +phantomjs爬取央视栏目（http://search.cctv.com/search.php?qtext=消费主张&type=video）的信息（标题，时间）

　　二、网站信息

　　python 代码

# coding=utf-8
import os
import re
from selenium import webdriver
from datetime import datetime,timedelta
import time
from pyquery import PyQuery as pq
import re
import mongoDB
import datetime

class consumer:

    def __init__(self):
        #通过配置文件获取IEDriverServer.exe路径
        # IEDriverServer ='C:Program FilesInternet ExplorerIEDriverServer.exe'
        # self.driver = webdriver.Ie(IEDriverServer)
        # self.driver.maximize_window()
        self.driver = webdriver.PhantomJS(service_args=['--load-images=false'])
        # self.driver = driver = webdriver.Chrome()
        self.driver.set_page_load_timeout(10)
        self.driver.maximize_window()
        self.db = mongoDB.mongoDbBase()


    def WriteLog(self, message,date):
        fileName = os.path.join(os.getcwd(), 'consumer/' + date  +   '.txt')
        with open(fileName, 'a') as f:
            f.write(message)
    # http://search.cctv.com/search.php?qtext=消费主张&type=video
    def CatchData(self,url='http://search.cctv.com/search.php?qtext=%E6%B6%88%E8%B4%B9%E4%B8%BB%E5%BC%A0&type=video'):
        error = ''
        try:
            self.driver.get(url)
            time.sleep(1)
            selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
            doc = pq(selenium_html)

            filename = datetime.datetime.now().strftime('%Y-%m-%d')
            message = '{0},{1}'.format( '标题', '时间')
            filename = datetime.datetime.now().strftime('%Y-%m-%d')
            self.WriteLog(message, filename)
            pages = doc("div[class='page']").find("a")
            # 2018-06-05 00:12:21
            pattern = re.compile("d{4}-d{2}-d{2}sd{2}:d{2}:d{2}")
            for index in range(1,6):
                url = "get_data('{0}', '消费主张', 'relevance', 'video', '-1', '1', '', '20', '1')".format(index)

                self.driver.execute_script(url)
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)

                Elements = doc("div[class='jvedio']").find("a")
                for sub in Elements.items():
                    title = sub.attr('title').encode('utf8')
                    ts = pattern.findall(title)
                    strtime = ''
                    if ts and len(ts) == 1:
                        strtime = ts[0]
                    if strtime:
                        index = title.index(strtime)
                        title = title[0:index]
                    title = '
{0},{1}'.format(title,strtime)
                    self.WriteLog(title, filename)

        except Exception, e1:
            error = e1.message
            
    # def CatchData(self,url='http://search.cctv.com/search.php?qtext=%E6%B6%88%E8%B4%B9%E4%B8%BB%E5%BC%A0&type=video'):
    #     error = ''
    #     try:
    #         self.driver.get(url)
    #         time.sleep(1)
    #         selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
    #         doc = pq(selenium_html)
    #
    #         filename = datetime.datetime.now().strftime('%Y-%m-%d')
    #
    #         pages = doc("div[class='page']").find("a")
    #
    #         for element in pages.items():
    #             url = element.attr('onclick').encode('utf8')
    #             # get_data('1','消费主张','relevance','video','-1','1','','20','1')
    #             # get_data('2', '消费主张', 'relevance', 'video', '-1', '1', '', '20', '1')
    #             print url
    #             self.driver.execute_script(url)
    #             selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
    #             doc = pq(selenium_html)
    #
    #             Elements = doc("div[class='jvedio']").find("a")
    #             for sub in Elements.items():
    #                 title = sub.attr('title').encode('utf8')
    #                 print title
    #                 title = '
{0}'.format(title)
    #                 self.WriteLog(title, filename)
    #     except Exception, e1:
    #         error = e1.message




obj = consumer()

obj.CatchData()
# obj.CatchContent('')
# obj.export('')

View Code

查看全文

相关阅读:
《Flutter实战入门》下拉刷新组件的使用方法
 百度HTTPS认证失败解决方法
 unity踩过的音频坑
 如何解决flutter中gradle慢的问题
 如何在ubuntu里面关掉后台的meteor
ruby生成随机成绩
 Gemfile分平台加载gem
sublime text2在windows中以命令行启动
 右键添加 CMD 命令提示符
 修复sublime text系统右键菜单

原文地址：https://www.cnblogs.com/shaosks/p/9138834.html