zoukankan      html  css  js  c++  java
  • [Python爬虫] 之十三:Selenium +phantomjs抓取活动树会议活动数据

     抓取活动树网站中会议活动数据(http://www.huodongshu.com/html/index.html)

     具体的思路与[Python爬虫] 之十一中抓取活动行网站的类似,都是用多线程来抓取。但是活动树网站中,每个关键字搜索结果页的 url 是固定的:比如搜索“数字”共有470个结果,每页10条记录,第二页的 url 和第一页的 url 完全一样。

     因此针对每个关键字用一个线程进行搜索。

     

        具体代码如下:

        

    # coding=utf-8
    import os
    import re
    from selenium import webdriver
    import selenium.webdriver.support.ui as ui
    from selenium.webdriver.common.keys import Keys
    import time
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.support.select import Select
    import IniFile
    from selenium.webdriver.common.keys import Keys
    from threading import Thread
    import thread
    import LogFile
    import urllib
    import mongoDbBase
    #抓取数据线程类
    # Worker thread: crawls every result page for one search keyword.
    class ScrapyData_Thread(Thread):
        '''
        Scrapes meeting/event records from huodongshu.com for a single search
        keyword and persists the matching records through the supplied db engine.

        One PhantomJS browser per thread: the site keeps a fixed url per keyword
        search, so pagination is done by clicking the "next" element in-page.
        '''

        def __init__(self, webSearchUrl, pageCountLable, htmlLable,
                     originalUrlLabel, nextUrlLabel, keyword, db):
            '''
            Constructor.
            :param webSearchUrl: search page url (keyword already appended)
            :param pageCountLable: xpath of the element holding the total result count
            :param htmlLable: xpath matching one result record
            :param originalUrlLabel: xpath of the link element of each record
            :param nextUrlLabel: xpath of the "next page" link
            :param keyword: the single keyword this thread searches for
            :param db: database engine used to save matched records
            '''
            Thread.__init__(self)

            self.webSearchUrl = webSearchUrl
            self.pageCountLable = pageCountLable
            self.htmlLable = htmlLable
            self.originalUrlLabel = originalUrlLabel
            self.nextUrlLabel = nextUrlLabel
            self.keyword = keyword
            self.db = db

            # IEDriverServer = self.cf.GetValue("section", "IEDriverServer")
            # os.environ["webdriver.ie.driver"] = IEDriverServer
            # self.urldriver = webdriver.Ie(IEDriverServer)

            # Headless browser; one 20 s explicit wait reused for all lookups.
            self.driver = webdriver.PhantomJS()
            self.wait = ui.WebDriverWait(self.driver, 20)
            self.driver.maximize_window()

        def compareDate(self, dateLeft, dateRight):
            '''
            Compare two dates given as "YYYY-MM-DD" strings.
            :param dateLeft: date, format 2017-03-04
            :param dateRight: date, format 2017-03-04
            :return: 1 if left > right, 0 if equal, -1 if left < right
            '''
            dls = dateLeft.split('-')
            drs = dateRight.split('-')
            # A left value with extra '-'-separated components sorts as "greater".
            if len(dls) > len(drs):
                return 1
            if int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) == int(drs[2]):
                return 0

            if int(dls[0]) > int(drs[0]):
                return 1
            elif int(dls[0]) == int(drs[0]) and int(dls[1]) > int(drs[1]):
                return 1
            elif int(dls[0]) == int(drs[0]) and int(dls[1]) == int(drs[1]) and int(dls[2]) > int(drs[2]):
                return 1
            return -1

        def date_isValid(self, strDateText):
            '''
            Check whether an event date text is still current, i.e. the event
            runs today or later.
            :param strDateText: three known formats: '2017.04.27 ~ 04.28';
                '2017.04.20 08:30 ~ 12:30'; '2015.12.29 ~ 2016.01.03'
            :return: True if the event is current; False otherwise
            '''
            # NOTE: the published article had the backslashes stripped from
            # these patterns ('d{4}-...'), which could never match a date.
            datePattern = re.compile(r'\d{4}-\d{2}-\d{2}')
            date = strDateText.replace('.', '-')
            strDate = re.findall(datePattern, date)
            currentDate = time.strftime('%Y-%m-%d')

            if len(strDate) == 2:
                # Full start and end date: valid while the end date is in the future.
                if self.compareDate(strDate[1], currentDate) > 0:
                    return True
            elif len(strDate) == 1:
                # Single full date (possibly with times): valid from today on.
                if self.compareDate(strDate[0], currentDate) >= 0:
                    return True
            else:
                # e.g. 2015-06-04 ~ 06-07 : the end date has no year, so splice
                # the start year onto the end month-day before comparing.
                datePattern = re.compile(r'\d{4}-\d{2}-\d{2}\s~\s\d{2}-\d{2}')
                strDate = re.findall(datePattern, date)
                if len(strDate) > 0:
                    if self.compareDate(strDate[0][0:5] + strDate[0][13:], currentDate) >= 0:
                        return True
                    else:
                        return False
            return False

        def run(self):
            '''Crawl every result page for this keyword and save the matches.'''
            print('')
            print('关键字:%s ' % self.keyword)
            self.driver.get(self.webSearchUrl)
            time.sleep(5)
            # Total number of records for this keyword.
            pageCount_elements = self.driver.find_elements_by_xpath(self.pageCountLable)
            if len(pageCount_elements) > 0:
                strCount = pageCount_elements[0].text.encode('utf8')
                # 10 records per page; round the page count up.
                pageCount = int(strCount) / 10
                if int(strCount) % 10 > 0:
                    pageCount = pageCount + 1

                page_Count = pageCount
                pageIndex = 0
                kword = self.keyword
                recordCount = 0
                while pageCount > 0:
                    pageCount = pageCount - 1
                    if pageIndex > 0:
                        # The url never changes, so paginate by clicking "next".
                        next_element = self.driver.find_elements_by_xpath(self.nextUrlLabel)
                        if len(next_element) > 0:
                            next_element[0].click()
                            time.sleep(3)

                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.htmlLable))
                    Elements = self.driver.find_elements_by_xpath(self.htmlLable)

                    # Collect the original url of every record on this page.
                    urlList = []
                    self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.originalUrlLabel))
                    hrefElements = self.driver.find_elements_by_xpath(self.originalUrlLabel)
                    for hrefe in hrefElements:
                        urlList.append(hrefe.get_attribute('href').encode('utf8'))

                    index = 0
                    # Number of useful records on this page.
                    usefulCount = 0
                    meetingList = []
                    for element in Elements:
                        txt = element.text.encode('utf8')
                        # First line is the title, second line the date text.
                        txts = txt.split('\n')
                        # Keep a record only when its date is still current and
                        # the keyword appears in its title.
                        if self.date_isValid(txts[1]) and txts[0].find(kword) > -1:
                            dictM = {'title': txts[0], 'date': txts[1],
                                     'url': urlList[index], 'keyword': kword, 'info': txt}
                            meetingList.append(dictM)
                            usefulCount = usefulCount + 1
                            recordCount = recordCount + 1
                        index = index + 1

                    pageIndex = pageIndex + 1
                    if usefulCount == 0:
                        # A page with no matches: later pages will not match either.
                        break
                    else:
                        self.db.SaveMeetings(meetingList)  # persist to the database

                print("共浏览了: %d 页数据" % page_Count)
                print("共抓取了: %d 个符合条件的活动记录" % recordCount)

            self.driver.close()
            self.driver.quit()

    if __name__ == '__main__':

        # Read every xpath / url setting from the config file.
        configfile = os.path.join(os.getcwd(), 'activity.conf')
        cf = IniFile.ConfigFile(configfile)
        webSearchUrl = cf.GetValue("section", "webSearchUrl")
        pageCountLable = cf.GetValue("section", "pageCountLable")
        htmlLable = cf.GetValue("section", "htmlLable")
        originalUrlLabel = cf.GetValue("section", "originalUrlLabel")
        nextUrlLabel = cf.GetValue("section", "nextUrlLabel")

        # Keywords are separated by ';' -- one crawler thread per keyword.
        keywords = cf.GetValue("section", "keywords")
        keywordlist = keywords.split(';')
        start = time.clock()
        db = mongoDbBase.mongoDbBase()
        threads = []
        for keyword in keywordlist:
            if len(keyword) > 0:
                url = webSearchUrl + urllib.quote(keyword)
                t = ScrapyData_Thread(url, pageCountLable, htmlLable,
                                      originalUrlLabel, nextUrlLabel, keyword, db)
                t.setDaemon(True)
                t.start()
                threads.append(t)
        # Join AFTER starting all threads; joining inside the start loop would
        # run the keywords one at a time and defeat the multithreaded design.
        for t in threads:
            t.join()

        end = time.clock()
        print("整个过程用时间: %f 秒" % (end - start))

      
    配置文件内容:

    [section]
    #IE驱动的路径
    iedriverserver = C:\Program Files\Internet Explorer\IEDriverServer.exe

    #要搜索的标签,如果有多个,中间用分号隔开
    htmlLable = //div[@id ='eventList']/div[@class ='list']


    #要获取搜索结果总记录数的标签
    pageCountLable = //span[@id='eventNumber']

    #给定网址的搜索首页Url
    webSearchUrl = http://www.huodongshu.com/html/find_search.html?search_keyword=


    #查找对应的原始url
    originalUrlLabel = //div[@class='listR']/h2/a

    #下一页链接对应的标签
    nextUrlLabel = //dt[@class='next']/a

    #文本输入框要搜索的关键字
    keywords = 互联网电视;智能电视;数字;影音;家庭娱乐;节目;视听;版权;数据


  • 相关阅读:
    微信小程序--form表单消息推送
    微信小程序学习笔记五(持续更新)---小程序上传文件
    微信小程序学习笔记四(持续更新)---征服scroll-view下拉刷新
    微信小程序学习笔记三(持续更新)---小程序组件通信
    微信小程序学习笔记二(持续更新)---小程序网络请求封装
    linux下安装微信开发者工具(fedora27)
    初学小程序学习笔记(持续更新)
    bootstrap使用popover插件实现点击按钮显示二维码图片
    gulp-jshint 编译出错Error:Cannot find modul 'jshint/src/cli' 解决办法
    vscode 下的 typescript 自动编译方法
  • 原文地址:https://www.cnblogs.com/shaosks/p/6705060.html
Copyright © 2011-2022 走看看