zoukankan      html  css  js  c++  java
  • 关于这个该死的报错:TypeError

    在利用Selenium爬取页面信息的时候突然报错,第一条信息爬取的时候还好好的,第二条就不行了。

    请参考网上的爬取代码:

     1 # coding=utf-8
     2 """
     3 Created on 2015-12-10 @author: Eastmount
     4 利用Selenium爬取百度百科5A级景区的内容介绍的代码
     5 """
     6 
     7 import time
     8 import re
     9 import os
    10 import sys
    11 import codecs
    12 import shutil
    13 from selenium import webdriver
    14 from selenium.webdriver.common.keys import Keys
    15 import selenium.webdriver.support.ui as ui
    16 from selenium.webdriver.common.action_chains import ActionChains
    17 
    18 # Open PhantomJS
    19 # driver = webdriver.PhantomJS(executable_path="D:phantomjs-2.1.1-windowssbinphantomjs.exe")
    20 driver = webdriver.PhantomJS(executable_path="D:phantomjs-1.9.8-windowsphantomjs.exe")
    21 # driver = webdriver.Firefox()
    22 wait = ui.WebDriverWait(driver, 10)  # 显示等待时间(实例,最大等待时间)
    23 
    24 
    25 # Get the Content of 5A tourist spots
    26 def getInfobox(entityName, fileName):
    27     try:
    28         # create paths and txt files
    29         print(u'文件名称: ', fileName)
    30         info = codecs.open(fileName, 'w', 'utf-8')
    31 
    32         # locate input  notice: 1.visit url by unicode 2.write files
    33         # Error: Message: Element not found in the cache
    34         #       Perhaps the page has changed since it was looked up
    35         # 解决方法: 使用Selenium和Phantomjs
    36 
    37         print(u'实体名称: ', entityName.rstrip('
    '))
    38         driver.get("http://baike.baidu.com/")
    39         elem_inp = driver.find_element_by_xpath("//form[@id='searchForm']/input")
    40         # elem_inp = driver.find_elements_by_xpath("//div[@class='lemma-summary']/div")
    41         elem_inp.send_keys(entityName)  
    42         elem_inp.send_keys(Keys.RETURN)
    43         info.write(entityName.rstrip('
    ') + '
    ')  # codecs不支持'
    '换行
    44 
    45         # load content 摘要
    46         elem_value = driver.find_elements_by_xpath("//div[@class='lemma-summary']/div")
    47         for value in elem_value:
    48             print(value.text)
    49             info.writelines(value.text + '
    ')
    50 
    51         # 爬取文本信息
    52         # 爬取所有段落<div class='para'>的内容 class='para-title'为标题 [省略]
    53 
    54         time.sleep(2)
    55     # except Exception as e:  # 'utf8' codec can't decode byte
    56     #     print("Error: ", e)
    57     finally:
    58         print('
    ')
    59         info.close()
    60 
    61 
    62 # Main function
    63 def main():
    64     # By function get information
    65     path = "BaiduSpider\"
    66     if os.path.isdir(path):
    67         shutil.rmtree(path, True)
    68     os.makedirs(path)
    69     source = open("Tourist_spots_5A.txt", 'r')
    70     num = 1
    71     for entityName in source:
    72         # entityName = unicode(entityName, "utf-8")
    73         if u'故宫' in entityName:  # else add a '?'
    74             entityName = '北京故宫'
    75         # else: Name = entityName.rstrip('
    ')
    76         name = "%04d" % num
    77         fileName = path + str(name) + ".txt"
    78         getInfobox(entityName, fileName)
    79         num = num + 1
    80     print('End Read Files!')
    81     source.close()
    82     driver.close()
    83 
    84 
    85 if __name__ == '__main__':
    86     main()

     执行报错信息为:

    Traceback (most recent call last):
      File "D:/pycharm/untitled_DB/wordcloud/selenium爬取百度百科/Selenium_baidu.py", line 85, in <module>
        main()
      File "D:/pycharm/untitled_DB/wordcloud/selenium爬取百度百科/Selenium_baidu.py", line 77, in main
        getInfobox(entityName, fileName)
      File "D:/pycharm/untitled_DB/wordcloud/selenium爬取百度百科/Selenium_baidu.py", line 41, in getInfobox
        elem_inp.send_keys(Keys.RETURN)
      File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 479, in send_keys
        'value': keys_to_typing(value)})
      File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 628, in _execute
        return self._parent.execute(command, params)
      File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 312, in execute
        self.error_handler.check_response(response)
      File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 208, in check_response
        raise exception_class(value)
    selenium.common.exceptions.WebDriverException: Message: TypeError - 'undefined' is not a function (evaluating '_getTagName(currWindow).toLowerCase()')

    找了一天都没找到原因,真的很烦。一开始以为原因是第 71 行代码把实体名称写死了,然而即使不加这个判断也会出现同样的报错,比较郁闷。后来查了半天资料,在 Stack Overflow 的评论中找到了思路:很有可能是读取文件时,读到的内容格式有问题。于是查看了一下读到的字符串,果不其然,末尾多了一个换行符 "\n"。修改代码如下:

    if u'故宫' in entityName:  # else add a '?'
        entityName = '北京故宫'
    else:
        entityName = entityName.rstrip('\n')
    name = "%04d" % num
    fileName = path + str(name) + ".txt"
    getInfobox(entityName, fileName)
    num = num + 1

    再执行,OK。请忽略渣渣排版。

  • 相关阅读:
    数据类型转换(日期格式转换)
    TextArea控件实时计算总字数,总行数,和每行显示的最大字数
    Java--->判断IP和端口是否可连接
    JavaFX校验IP和端口的合法性
    JavaFX与NetBeans开发工具的一些总结
    Web Service深度剖析
    Spring中AOP和IOC深入理解
    Spring aspect 两种方式实现五种增强
    Struts2错题总结
    Hibernate检索方式和Criteria查询的讲解
  • 原文地址:https://www.cnblogs.com/xiapu5150/p/8528774.html
Copyright © 2011-2022 走看看