  • [Python Crawler] Part 8: Scraping Weibo data with Selenium + PhantomJS

      Basic idea: while logged in, open the Weibo home page, enter the query conditions in the advanced-search dialog, and click the search link. If the results span multiple pages (20 records per page), read the page count and then loop over the pages, scraping each one in turn.

      A problem showed up in practice. With the IE driver, after running the advanced search only the first record could be scraped; the rest came back empty, for reasons that are still unclear. Switching to PhantomJS made the scraping work, but PhantomJS in turn could never locate the advanced-search link, so the advanced search could not be performed at all, while the IE driver handled it without trouble. The two are therefore combined: the IE driver sets the advanced-search conditions and clicks the "advanced search" link, which yields the URL of the first results page; the page count is read from that page, and PhantomJS then scrapes the actual data.
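      The hand-off between the two drivers can be summarized with the condensed sketch below. This is not the full implementation: the XPath and URLs are the ones used in the full code in section 5, and pageCount stands in for the page count obtained in step 2.

    # Condensed sketch of the IE-driver / PhantomJS hand-off described above.
    from selenium import webdriver

    urldriver = webdriver.Ie('IEDriverServer.exe')   # IE driver: the advanced search works here
    urldriver.get('http://s.weibo.com/weibo/1?topnav=1&wvr=6&b=1')
    # ... fill in the advanced-search dialog and click "search" (see adv_Setting in section 5) ...
    result_url = urldriver.current_url.replace('Refer=g', 'page=')  # URL template of the result pages
    urldriver.quit()

    pageCount = 2                                    # e.g. the page count read in step 2
    driver = webdriver.PhantomJS()                   # PhantomJS: does the actual scraping
    for page in range(1, pageCount + 1):
        driver.get(result_url + str(page))
        posts = driver.find_elements_by_xpath("//div[@class='WB_cardwrap S_bg2 clearfix']")
        print '%d posts on page %d' % (len(posts), page)
    driver.quit()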

      1. Set the advanced search conditions
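      One detail worth noting in this step: the start/end date inputs in the dialog are read-only, so the value has to be injected with JavaScript after removing the readonly attribute. A condensed sketch of what adv_Setting (section 5) does for the start date, assuming the dialog is already open and urldriver is the IE driver:

    # 'stime' is the name of the start-date input in Weibo's advanced-search form (used in adv_Setting below).
    js = ('var obj = document.getElementsByName("stime")[0];'
          'obj.removeAttribute("readonly");'
          'obj.value="2017-03-20";')
    urldriver.execute_script(js)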

      

      2. Get the number of pages to crawl

      

      

      

    # elements = self.urldriver.find_elements_by_xpath("//div[@class='layer_menu_list W_scroll']/ul/li")
    elements = self.urldriver.find_elements_by_xpath(pageCountLable)
    # Number of pages to crawl
    self.pageCount = len(elements)
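      The page count equals the number of li items in the page-selector dropdown, and the first results page's URL ends with Refer=g; replacing that suffix with page= produces a URL template that only needs the page index appended. A minimal sketch of how this is used, mirroring Set_CurrentUrl and CatchData in section 5:

    # e.g. http://s.weibo.com/weibo/...&timescope=custom:2017-03-20:2017-03-28&Refer=g
    self.current_url = self.urldriver.current_url.replace('Refer=g', 'page=')
    for pageIndex in range(1, self.pageCount + 1):
        self.driver.get(self.current_url + str(pageIndex))   # ...&page=1, ...&page=2, ...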

    3. Get the Weibo post records

      

    Elements = self.driver.find_elements_by_xpath("//div[@class='WB_cardwrap S_bg2 clearfix']")

    4. Get the URL of each post:

       

      hrefElements = self.driver.find_elements_by_xpath("//a[@class='W_textb']")
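      The post cards and the permalink anchors are collected in the same document order, so the i-th href can simply be paired with the i-th post, which is what the CatchData loop in section 5 does. A minimal sketch:

    urlList = [e.get_attribute('href') for e in hrefElements]
    for index, element in enumerate(Elements):
        print element.text.encode('utf8')       # post text
        print 'Weibo link: ' + urlList[index]   # original URL of the post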
    
    
    
    
    

     5. Full code

      

    # coding=utf-8
    import os
    import re
    from selenium import webdriver
    import selenium.webdriver.support.ui as ui
    from selenium.webdriver.common.keys import Keys
    import time
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.support.select import Select
    import IniFile
    import LogFile


    class weibo:

        def __init__(self):
            # Read the path of IEDriverServer.exe from the config file
            configfile = os.path.join(os.getcwd(), 'config.conf')
            self.cf = IniFile.ConfigFile(configfile)
            IEDriverServer = self.cf.GetValue("section", "IEDriverServer")
            # Delay after scraping each page, in seconds; defaults to 5
            self.pageDelay = 5
            pageInteralDelay = self.cf.GetValue("section", "pageInteralDelay")
            if pageInteralDelay:
                self.pageDelay = int(pageInteralDelay)

            os.environ["webdriver.ie.driver"] = IEDriverServer
            self.urldriver = webdriver.Ie(IEDriverServer)
            # self.driver = webdriver.PhantomJS()
            self.wait = ui.WebDriverWait(self.urldriver, 20)
            self.urldriver.maximize_window()


        def scroll_top(self):
            '''
            Scroll to the top of the page
            :return:
            '''
            if self.urldriver.name == "chrome":
                js = "var q=document.body.scrollTop=0"
            else:
                js = "var q=document.documentElement.scrollTop=0"
            return self.urldriver.execute_script(js)

        def scroll_foot(self):
            '''
            Scroll to the bottom of the page
            :return:
            '''
            if self.urldriver.name == "chrome":
                js = "var q=document.body.scrollTop=10000"
            else:
                js = "var q=document.documentElement.scrollTop=10000"
            return self.urldriver.execute_script(js)


        def logon(self):
            '''
            Log in
            :return:
            '''
            isneedLogon = False
            try:
                gn_login = self.driver.find_element_by_xpath("//div[@class='gn_login']")
                isneedLogon = True
            except Exception, e:
                if e.msg.find('Unable') > -1:  # the login box is missing, so we are already logged in
                    print 'logon'
            if isneedLogon:
                userNameInput = self.driver.find_element_by_xpath("//input[@name='username']")
                userNameInput.send_keys('手机号')   # phone number of the account
                passwordInput = self.driver.find_element_by_xpath("//input[@name='password']")
                passwordInput.send_keys('XXXXX')    # password
                # Click the login button after entering username and password
                logon_elements = self.driver.find_element_by_xpath("//a[@class='W_btn_a btn_32px']")
                logon_elements.click()

        def Set_CurrentUrl(self):
            firstUrl = self.cf.GetValue("section", "firstUrl")
            if len(firstUrl) > 0:
                self.urldriver.get(firstUrl)
                self.urldriver.implicitly_wait(5)
                self.adv_Setting()
                time.sleep(5)
                # self.urldriver.implicitly_wait(4)
                # Scroll to the bottom so the page-count selector is rendered
                self.scroll_foot()
                # URL of the result pages to crawl
                print self.urldriver.current_url
                if self.urldriver.current_url == firstUrl:
                    time.sleep(5)

                self.current_url = self.urldriver.current_url.replace('Refer=g', 'page=')

                pageCountLable = self.cf.GetValue("section", "pageCountLable")
                try:
                    # elements = self.urldriver.find_elements_by_xpath("//div[@class='layer_menu_list W_scroll']/ul/li")
                    elements = self.urldriver.find_elements_by_xpath(pageCountLable)
                    # Number of pages to crawl
                    self.pageCount = len(elements)
                    # print self.pageCount
                except Exception, e:
                    print e.message

                self.urldriver.close()
                self.urldriver.quit()
                self.driver = webdriver.PhantomJS()
                self.wait = ui.WebDriverWait(self.driver, 20)
                self.driver.maximize_window()
            else:
                print 'please set first url'

        def CatchData(self):
            '''
            Scrape the data
            :return:
            '''
            start = time.clock()
            # # Print the page title
            # print self.driver.title
            htmls = self.cf.GetValue("section", "htmlLable").split(';')
            htmlLables = []
            for h in htmls:
                if len(h) > 0:
                    htmlLables.append(h)
            logfile = os.path.join(os.getcwd(), r'log.txt')
            log = LogFile.LogFile(logfile)

            pageIndex = 1
            pageCount = self.pageCount
            pageCount = 2  # note: overrides the detected page count with 2
            recordCount = 0
            weiboOriginalUrlLabel = self.cf.GetValue("section", "weiboOriginalUrlLabel")
            while pageCount > 0:
                url = self.current_url + str(pageIndex)
                self.driver.get(url)
                # Implicit wait of up to 5 seconds
                self.driver.implicitly_wait(5)
                pageCount = pageCount - 1
                for className in htmlLables:
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(className))
                    Elements = self.driver.find_elements_by_xpath(className)

                    # Find the original URL of each post
                    urlList = []
                    # self.wait.until(lambda driver: self.driver.find_elements_by_xpath("//a[@class='W_textb']"))
                    # hrefElements = self.driver.find_elements_by_xpath("//a[@class='W_textb']")
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(weiboOriginalUrlLabel))
                    hrefElements = self.driver.find_elements_by_xpath(weiboOriginalUrlLabel)
                    for hrefe in hrefElements:
                        urlList.append(hrefe.get_attribute('href').encode('utf8'))

                    self.driver.implicitly_wait(2)
                    index = 0
                    strMessage = ' '
                    strsplit = ' ------------------------------------------------------------------------------------ '
                    index = 0
                    for element in Elements:
                        print ' '
                        txt = element.text.encode('utf8')
                        # The first character of each post text is 'c'; strip it
                        txt = txt[1:]
                        print ' '
                        print txt
                        print 'Weibo link: ' + urlList[index]
                        print strsplit

                        strMessage = txt + " "
                        strMessage += 'Weibo link: ' + urlList[index] + " "
                        strMessage += strsplit
                        strMessage = unicode(strMessage, 'utf8')
                        log.WriteLog(strMessage)
                        # self.printTopic(txt)
                        recordCount = recordCount + 1
                        index = index + 1

                pageIndex = pageIndex + 1
                self.driver.implicitly_wait(10)

            self.driver.close()
            self.driver.quit()
            end = time.clock()

            print ' '
            print "Scraped %d pages in total" % self.pageCount
            print "Scraped %d Weibo records in total" % recordCount
            print "Total time: %f seconds" % (end - start)

        def adv_Setting(self):
            '''
            Fill in the advanced-search dialog:
            1. Click the "advanced search" link in the parent window to open the dialog
            2. Type the keyword into the keyword input box
            3. Choose the type (defaults to "all")
            4. Choose what the posts must contain (defaults to "all")
            5. Set the start date, start hour, end date and end hour
            6. Set the location
            :return: True if the settings were applied; False if an error occurred
            '''

            try:
                # Wait a few seconds first, otherwise the advanced-search link may not be found yet
                # time.sleep(3)
                # 1. Open the advanced-search dialog
                # self.driver.switch_to_default_content()searchInp_form

                self.wait.until(lambda driver: self.urldriver.find_element_by_xpath("//a[@class='adv_settiong']"))
                adv_elements = self.urldriver.find_element_by_xpath("//a[@class='adv_settiong']")
                adv_elements.click()

                # 2. Type the search keyword into the keyword input box
                time.sleep(5)
                keyword = self.cf.GetValue("adv_setting", "keywords")
                keyword = keyword.replace(' ', '')
                if len(keyword) > 0:
                    js = "var obj = document.getElementsByName('keyword')[0];obj.value='" + keyword + "';"
                    self.urldriver.execute_script(js)

                # 3. Choose the type, default is "all"
                # all: radio01; hot: radio02; original: radio03; people I follow: radio04; verified users: radio05; media: radio07
                type_select = self.cf.GetValue("adv_setting", "type_select")
                type_select = type_select.replace(' ', '')
                if len(type_select) > 0:
                    type_elements = self.urldriver.find_element_by_id(type_select)
                    # type_elements = self.driver.find_element_by_id("radio03")
                    type_elements.click()

                # 4. Choose what posts must contain, default is "all"
                # all: radio_sub1; with pictures: radio_sub2; with video: radio_sub3; with music: radio_sub4; with short links: radio_sub5
                contain_select = self.cf.GetValue("adv_setting", "contain_select")
                contain_select = contain_select.replace(' ', '')
                if len(contain_select) > 0:
                    contain_elements = self.urldriver.find_element_by_id(contain_select)
                    # contain_elements = self.driver.find_element_by_id("radio_sub2")
                    contain_elements.click()

                # 5. Start date
                starttime = self.cf.GetValue("adv_setting", "stime")
                starttime = starttime.replace(' ', '')
                # If no start date is set, the start hour is not set either
                if len(starttime) > 0:
                    js = 'var obj = document.getElementsByName("stime")[0];obj.removeAttribute("readonly");obj.value="' + starttime + '";'
                    self.urldriver.execute_script(js)

                    # Start hour, 0 to 23
                    startHour = self.cf.GetValue("adv_setting", "startHour")
                    startHour = startHour.replace(' ', '')
                    if len(startHour) > 0:
                        self.urldriver.find_element_by_xpath("//select[@name='startHour']/option[@value='0']").click()
                        # startHour_element = self.driver.find_element_by_xpath("//select[@name='startHour']")
                        # # startHour_element.find_element_by_xpath("//option[@value='" + startHour + "']").click()
                        # Select(startHour_element).select_by_visible_text(startHour)

                # End date
                endtime = self.cf.GetValue("adv_setting", "etime")
                endtime = endtime.replace(' ', '')
                if len(endtime) > 0:
                    js = 'var obj = document.getElementsByName("etime")[0];obj.removeAttribute("readonly");obj.value="' + endtime + '";'
                    self.urldriver.execute_script(js)

                    # End hour, 0 to 23
                    endHour = self.cf.GetValue("adv_setting", "endHour")
                    endHour = endHour.replace(' ', '')
                    if len(endHour) > 0:
                        self.urldriver.find_element_by_xpath("//select[@name='endHour']/option[@value='23']").click()
                        # endHour_element = self.driver.find_element_by_xpath("//select[@name='endHour']")
                        # endHour_element.find_element_by_xpath("//option[@value='0']").click()
                        # Select(endHour_element).select_by_visible_text(endHour)

                # 6. Choose the province
                # self.driver.find_element_by_xpath("//select[@name='prov']/option[@value='11']").click()
                province = self.cf.GetValue("adv_setting", "province")
                province = province.replace(' ', '')
                if len(province) > 0:
                    prov_element = self.urldriver.find_element_by_xpath("//select[@name='prov']")
                    province = unicode(province, "utf8")
                    Select(prov_element).select_by_visible_text(province)

                city = self.cf.GetValue("adv_setting", "city")
                city = city.replace(' ', '')
                if len(city) > 0:
                    # Choose the city
                    city_element = self.urldriver.find_element_by_xpath("//select[@name='city']")
                    city = unicode(city, "utf8")
                    Select(city_element).select_by_visible_text(city)

                # Click the "search Weibo" link
                ss_elements = self.urldriver.find_element_by_xpath("//a[@class='W_btn_cb']")
                ss_elements.click()
                # time.sleep(20)
                return True
            except Exception, e:
                return False


    # # Test the scraper
    obj = weibo()
    obj.Set_CurrentUrl()
    obj.CatchData()


    Contents of the config file (config.conf):

    [section]
    # Path of the IE driver
    iedriverserver = C:\Program Files\Internet Explorer\IEDriverServer.exe

    pageinteraldelay = 5

    # XPath of the Weibo post cards to scrape; separate multiple labels with semicolons
    htmlLable = //div[@class='WB_cardwrap S_bg2 clearfix']

    # XPath used to get the number of pages to crawl
    pageCountLable = //div[@class='layer_menu_list W_scroll']/ul/li
    # URL of the start page
    firstUrl = http://s.weibo.com/weibo/1?topnav=1&wvr=6&b=1

    #current_url = http://s.weibo.com/weibo/%25E8%25B6%25B3%25E7%2590%2583&region=custom:11:8&scope=ori&suball=1&timescope=custom:2017-03-28:2017-03-28&Refer=g
    #current_url = http://s.weibo.com/weibo/%25E8%25B6%25B3%25E7%2590%2583&region=custom:11:1000&scope=ori&suball=1&timescope=custom:2017-03-20:2017-03-28&page=1

    # XPath of the link holding each post's original URL
    weiboOriginalUrlLabel = //a[@class='W_textb']

    # Settings for the Weibo advanced search
    [adv_setting]

    # Keyword to type into the search box (足球 = football)
    keywords = 足球

    # Type selection
    type_select = radio03

    # "Contains" selection
    contain_select = radio_sub1

    # Start date
    stime = 2017-03-20
    # Start hour, may be left empty
    starthour =

    # End date
    etime = 2017-03-28
    # End hour, may be left empty
    endhour =

    # Province (北京 = Beijing)
    province = 北京
    # City/district (海淀区 = Haidian)
    city = 海淀区
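
      The IniFile and LogFile modules imported at the top of the script are the author's own helpers and are not included in the post. A minimal, hypothetical sketch of what they would need to provide, based only on how they are called above:

    # IniFile.py -- hypothetical minimal version of the helper used above
    import ConfigParser

    class ConfigFile:
        def __init__(self, path):
            # RawConfigParser avoids %-interpolation, so URL-encoded values are safe
            self.parser = ConfigParser.RawConfigParser()
            self.parser.read(path)

        def GetValue(self, section, key):
            try:
                return self.parser.get(section, key).strip()
            except Exception:
                return ''

    # LogFile.py -- hypothetical minimal version of the helper used above
    import codecs

    class LogFile:
        def __init__(self, path):
            self.path = path

        def WriteLog(self, message):
            # Append one record per call, UTF-8 encoded
            with codecs.open(self.path, 'a', 'utf-8') as f:
                f.write(message + '\n')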
