  • [Python Crawler] Part 14: Scraping 媒介360 data with Selenium + PhantomJS

      The complete code is as follows:

      

    # coding=utf-8
    import os
    import re
    from selenium import webdriver
    import selenium.webdriver.support.ui as ui
    from selenium.webdriver.common.keys import Keys
    import time
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.support.select import Select
    import IniFile
    from threading import Thread
    import thread
    import LogFile
    import urllib
    import mongoDB


    # Worker thread that scrapes the search results
    class ScrapyData_Thread(Thread):

        def __init__(self, webSearchUrl, inputTXTIDLabel, searchlinkIDLabel, htmlLable, htmltime, keyword, invalid_day, db):
            '''
            Constructor
            :param webSearchUrl: URL of the search page
            :param inputTXTIDLabel: id of the search input box
            :param searchlinkIDLabel: id of the search link
            :param htmlLable: XPath of the result elements to scrape
            :param htmltime: XPath of the publish-time elements
            :param keyword: keyword to search for (in the config, multiple keywords are separated by semicolons ';')
            :param invalid_day: how recent a published article must be, in days; the default is within 3 days
            :param db: database engine used to save the data
            '''
            Thread.__init__(self)

            self.webSearchUrl = webSearchUrl
            self.inputTXTIDLabel = inputTXTIDLabel
            self.searchlinkIDLabel = searchlinkIDLabel
            self.htmlLable = htmlLable
            self.htmltime = htmltime
            self.keyword = keyword
            self.invalid_day = invalid_day
            self.db = db

            self.driver = webdriver.PhantomJS()
            self.wait = ui.WebDriverWait(self.driver, 20)
            self.driver.maximize_window()

        def Comapre_to_days(self, leftdate, rightdate):
            '''
            Compare two date strings and return how many days the left date is ahead of the right date
            :param leftdate: format: 2017-04-15
            :param rightdate: format: 2017-04-15
            :return: number of days
            '''
            l_time = time.mktime(time.strptime(leftdate, '%Y-%m-%d'))
            r_time = time.mktime(time.strptime(rightdate, '%Y-%m-%d'))
            result = int(l_time - r_time) / 86400
            return result

        def run(self):
            print 'Keyword: %s' % self.keyword

            self.driver.get(self.webSearchUrl)
            time.sleep(1)

            # js = "var obj = document.getElementById('ctl00_ContentPlaceHolder1_txtSearch');obj.value='" + self.keyword + "';"
            # self.driver.execute_script(js)
            # Type the keyword into the search box, then click the search link
            ss_elements = self.driver.find_element_by_id(self.inputTXTIDLabel)
            ss_elements.send_keys(unicode(self.keyword, 'utf8'))

            search_elements = self.driver.find_element_by_id(self.searchlinkIDLabel)
            search_elements.click()
            time.sleep(4)

            self.wait.until(lambda driver: self.driver.find_elements_by_xpath(self.htmlLable))
            Elements = self.driver.find_elements_by_xpath(self.htmlLable)

            timeElements = self.driver.find_elements_by_xpath(self.htmltime)
            urlList = []
            for hrefe in Elements:
                urlList.append(hrefe.get_attribute('href').encode('utf8'))

            index = 0
            strMessage = ' '
            strsplit = ' ------------------------------------------------------------------------------------ '
            # number of useful records on the page
            recordCount = 0
            meetingList = []
            kword = self.keyword
            currentDate = time.strftime('%Y-%m-%d')

            for element in Elements:

                strDate = timeElements[index].text.encode('utf8')
                # A record qualifies only if its date is within invalid_day days and the
                # keyword appears in the title.
                if self.Comapre_to_days(currentDate, strDate) < self.invalid_day:
                    txt = element.text.encode('utf8')
                    if txt.find(kword) > -1:
                        dictM = {'title': txt, 'date': strDate,
                                 'url': urlList[index], 'keyword': kword, 'info': txt}
                        meetingList.append(dictM)

                        # print ' '
                        # print 'Title: %s' % txt
                        # print 'Date: %s' % strDate
                        # print 'Article link: ' + urlList[index]
                        # print strsplit

                        # log.WriteLog(strMessage)
                        recordCount = recordCount + 1
                else:
                    # The results are sorted by date in descending order, so once a record
                    # falls outside the window, all remaining records are older; stop here.
                    break
                index = index + 1

            self.db.SaveMeetings(meetingList)
            print "Scraped %d matching records in total" % recordCount

            self.driver.close()
            self.driver.quit()


    if __name__ == '__main__':

        configfile = os.path.join(os.getcwd(), 'chinaMedia.conf')
        cf = IniFile.ConfigFile(configfile)
        webSearchUrl = cf.GetValue("section", "webSearchUrl")
        inputTXTIDLabel = cf.GetValue("section", "inputTXTIDLabel")
        searchlinkIDLabel = cf.GetValue("section", "searchlinkIDLabel")
        htmlLable = cf.GetValue("section", "htmlLable")
        htmltime = cf.GetValue("section", "htmltime")

        invalid_day = int(cf.GetValue("section", "invalid_day"))

        keywords = cf.GetValue("section", "keywords")
        keywordlist = keywords.split(';')
        start = time.clock()
        db = mongoDB.mongoDbBase()
        for keyword in keywordlist:
            if len(keyword) > 0:
                t = ScrapyData_Thread(webSearchUrl, inputTXTIDLabel, searchlinkIDLabel, htmlLable, htmltime, keyword, invalid_day, db)
                t.setDaemon(True)
                t.start()
                t.join()

        end = time.clock()
        print "Total elapsed time: %f seconds" % (end - start)

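      The script also relies on two helper modules that are not shown in the post: IniFile (the ConfigFile/GetValue wrapper) and mongoDB, whose mongoDbBase.SaveMeetings() persists the scraped records. Below is a minimal sketch of what such a mongoDB module might look like, assuming pymongo and a MongoDB server on localhost; the database and collection names are made up for illustration:

    # mongoDB.py -- illustrative sketch only; the author's actual module is not shown in the post.
    # Assumes pymongo and a MongoDB instance on localhost:27017.
    from pymongo import MongoClient

    class mongoDbBase(object):
        def __init__(self, host='localhost', port=27017):
            self.client = MongoClient(host, port)
            # placeholder database / collection names
            self.collection = self.client['media_news']['meetings']

        def SaveMeetings(self, meetingList):
            # meetingList is the list of dicts built in ScrapyData_Thread.run();
            # skip the database round trip when nothing matched.
            if meetingList:
                self.collection.insert_many(meetingList)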
  • Original post: https://www.cnblogs.com/shaosks/p/6723452.html