1、查看安装的chrome浏览器版本
2、查看版本对应的驱动
https://sites.google.com/a/chromium.org/chromedriver/downloads
下载后拷贝到/usr/local/bin/目录下
结果在运行程序的时候又报错:selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH
后来在网上查了一下,把代码修改如下:
# Flags the author found necessary to get Chrome running on the server.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
# `options=` replaces the deprecated `chrome_options=` keyword, which newer
# Selenium releases no longer accept.
self.driver = webdriver.Chrome(options=chrome_options)
就正常运行了
3、完整代码
# coding=utf-8
"""Scrape video titles and timestamps from the CCTV search results page.

The script drives a headless Chrome browser, asks the search page for result
indexes 1..5 through the page's own ``get_data(...)`` JavaScript function and
appends one ``title,timestamp`` record per result link to
``./consumer1/<YYYY-MM-DD>.txt``.
"""
import datetime
import os
import re
import time
from datetime import timedelta

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Matches timestamps embedded in link titles, e.g. "2018-06-05 00:12:21".
TIMESTAMP_RE = re.compile(r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}")


class consumer:
    """Headless-Chrome scraper for the CCTV video search page."""

    def __init__(self):
        """Start a headless Chrome session and keep it in ``self.driver``."""
        chrome_options = Options()
        # Flags the author found necessary to get Chrome running on the server.
        for flag in ('--headless', '--no-sandbox',
                     '--disable-dev-shm-usage', '--disable-gpu'):
            chrome_options.add_argument(flag)
        # `options=` replaces the deprecated `chrome_options=` keyword, which
        # newer Selenium releases no longer accept.
        self.driver = webdriver.Chrome(options=chrome_options)

    def close(self):
        """Quit the browser so no Chrome/chromedriver process is left behind."""
        self.driver.quit()

    def WriteLog(self, message, date):
        """Append ``message`` verbatim to ``./consumer1/<date>.txt``.

        Args:
            message: Text to append; the caller supplies any line terminator.
            date: File name stem (the script passes ``YYYY-MM-DD``).
        """
        log_dir = os.path.join(os.getcwd(), 'consumer1')
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(log_dir, exist_ok=True)
        file_name = os.path.join(log_dir, date + '.txt')
        # Explicit UTF-8: the records contain Chinese text and the platform
        # default encoding may not be able to represent it.
        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(message)

    # http://search.cctv.com/search.php?qtext=消费主张&type=video
    def CatchData(self, url='http://search.cctv.com/search.php?qtext=%E6%B6%88%E8%B4%B9%E4%B8%BB%E5%BC%A0&type=video'):
        """Load ``url`` and log the title and timestamp of every result link.

        A header record is written first, then one record per ``<a>`` found
        under ``div[class='jvedio']`` for each index 1..5 passed to the page's
        ``get_data`` function (presumably the result page number).

        Failures are reported on stdout instead of being raised, so a broken
        page does not abort the caller.

        Args:
            url: Search results page to open before paging through results.
        """
        try:
            self.driver.get(url)
            filename = datetime.datetime.now().strftime('%Y-%m-%d')
            # Each record ends with a newline so records do not run together.
            self.WriteLog('{0},{1}\n'.format('标题', '时间'), filename)

            for index in range(1, 6):
                script = "get_data('{0}', '消费主张', 'relevance', 'video', '-1', '1', '', '20', '1')".format(index)
                self.driver.execute_script(script)
                # NOTE(review): the DOM is read right after get_data() is
                # called; if the page updates asynchronously an explicit wait
                # may be needed here - confirm against the live site.
                selenium_html = self.driver.execute_script("return document.documentElement.outerHTML")
                doc = pq(selenium_html)
                print(index)
                try:
                    for sub in doc("div[class='jvedio']").find("a").items():
                        title = sub.attr('title')
                        print(title)
                        if not title:
                            # An anchor without a title has nothing to log and
                            # must not abort the rest of the page.
                            continue
                        matches = TIMESTAMP_RE.findall(title)
                        # Only trust the timestamp when it is unambiguous.
                        strtime = matches[0] if len(matches) == 1 else ''
                        if strtime:
                            cut = title.index(strtime)
                            title = str(title[0:cut]).replace("•", "")
                        self.WriteLog('{0},{1}\n'.format(title, strtime), filename)
                except Exception as e:
                    print("OS error: {0}".format(e))
        except Exception as e1:
            # Report the failure instead of silently discarding it.
            print("CatchData failed: {0}".format(e1))


if __name__ == '__main__':
    obj = consumer()
    try:
        obj.CatchData()
    finally:
        # Always release the browser, even if scraping failed.
        obj.close()