zoukankan      html  css  js  c++  java
  • selenium+PhantomJS

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    """
        Workflow:
            1. Search for the keyword: drive the browser with selenium to search for the keyword and obtain the product list.
            2. Analyse the page count and paginate: read the number of result pages, then simulate page turns to obtain the product lists of the following pages.
            3. Extract the product details: parse the page source with PyQuery to obtain the product list.
            4. Store in MongoDB: save the product list information to the MongoDB database.
    """
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import re
    from pyquery import PyQuery as pq
    import pymongo
    
    # --- Settings ---------------------------------------------------------
    LOCATION = 'localhost'    # host handed to pymongo.MongoClient below
    MONGO_DB = "taobao"       # database name
    MONGO_TABLE = 'taobao'    # collection the scraped products are written to
    KEYWORD = "零食"           # search term typed into the Taobao search box

    # --- MongoDB handle ---------------------------------------------------
    client = pymongo.MongoClient(LOCATION)
    db = client[MONGO_DB]

    # --- Browser ----------------------------------------------------------
    # PhantomJS is started with image loading switched off.
    # NOTE(review): PhantomJS support was deprecated in later Selenium
    # releases — confirm the installed Selenium version still provides it.
    driver = webdriver.PhantomJS(service_args=['--load-images=false'])
    driver.set_window_size(1366, 768)
    # Shared explicit wait (20 second timeout) used by the functions below.
    wait = WebDriverWait(driver, 20)
    
    
    def search(keyword):
        """Search Taobao for *keyword* and scrape the first page of results.

        Loads the Taobao home page, types the keyword into the ``#q`` input,
        clicks the ``.btn-search`` button, waits for the ``.total`` element
        and then calls ``get_product()`` for the page that was loaded.

        A ``TimeoutException`` from any of the waits restarts the whole
        sequence.  The retry is a loop rather than a recursive call so the
        value obtained on a successful retry is actually returned to the
        caller (the recursive version dropped it and returned ``None``) and
        repeated timeouts cannot exhaust the call stack.

        Args:
            keyword: Text to type into the search box.

        Returns:
            The text of the ``.total`` element; ``main`` extracts the page
            count from it.
        """
        while True:
            print("正在搜索关键字:%s" % keyword)
            try:
                driver.get('http://www.taobao.com')
                element = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
                )
                submit = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search'))
                )
                element.clear()
                element.send_keys(keyword)
                submit.click()
                total = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.total'))
                )
                get_product()
                return total.text
            except TimeoutException:
                # The page did not become ready in time: start over.
                continue
    
    
    def next_page(page_number):
        """Jump to result page *page_number* and scrape its products.

        Types the page number into the ``input.input:nth-child(2)`` element,
        clicks ``span.btn:nth-child(4)``, waits until ``span.num`` contains
        the page number as text, then calls ``get_product()``.

        A ``TimeoutException`` restarts the sequence.  The retry is a loop so
        that the outcome of the retry is what gets returned (the recursive
        version discarded it and always reported ``False`` after a timeout).
        Any other exception is printed and ends the attempt.

        Args:
            page_number: Page to jump to.

        Returns:
            ``True`` once the page was reached and ``get_product()`` ran,
            ``False`` if a non-timeout exception occurred.
        """
        while True:
            print("正在翻页:%s" % page_number)
            try:
                # ``wait`` is the module-level WebDriverWait(driver, 20).
                element = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, 'input.input:nth-child(2)')))
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, 'span.btn:nth-child(4)')))
                element.clear()
                element.send_keys(page_number)
                submit.click()
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, 'span.num'), str(page_number)))
                get_product()
                return True
            except TimeoutException:
                # The page did not respond in time: try the same page again.
                continue
            except Exception as e:
                print(e)
                return False
    
    
    def get_product():
        """Parse the products on the current page and store each one.

        Waits for at least one ``#mainsrp-itemlist .items .item`` element,
        parses ``driver.page_source`` with PyQuery, builds a dict per item
        (``img``, ``title``, ``sales``, ``shop``, ``location``, ``price``)
        and passes it to ``save_to_mongo``.  Any exception is printed and
        otherwise ignored, so this function never raises.
        """
        print("正在获取产品信息...")
        item_selector = '#mainsrp-itemlist .items .item'
        # (dict key, CSS selector) for the fields read as element text.
        text_fields = (
            ('title', ".title a"),
            ("sales", ".deal-cnt"),
            ("shop", ".shopname"),
            ("location", ".location"),
            ("price", ".price strong"),
        )
        try:
            wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
            )
            page = pq(driver.page_source)
            for node in page(item_selector).items():
                # The image is the only field read from an attribute.
                record = {"img": node.find(".pic-box div a img").attr('src')}
                for key, selector in text_fields:
                    record[key] = node.find(selector).text()
                save_to_mongo(record)
        except Exception as err:
            print(err)
    
    
    def save_to_mongo(result):
        """Insert one product document into the ``MONGO_TABLE`` collection.

        Uses ``Collection.insert_one``; the older ``Collection.insert`` is
        deprecated since PyMongo 3.0 and no longer exists in PyMongo 4.

        Args:
            result: Product dict to store.

        Any exception is caught and reported with a printed message, so a
        failed insert does not interrupt the scrape.
        """
        try:
            outcome = db[MONGO_TABLE].insert_one(result)
            if outcome.inserted_id is not None:
                print("产品信息成功保存到mongodb", result)
        except Exception as e:
            print("保存失败!", e)
    
    
    def main():
        """Run the scrape: search for ``KEYWORD``, then visit every result page.

        The page count is the first run of digits in the text returned by
        ``search``.  The pattern is ``\\d+``; the previous pattern ``d+``
        matched literal "d" characters, so no number was ever found.

        Returns:
            The value returned by the last ``next_page`` call, or ``''`` when
            there is only one page of results.

        Raises:
            ValueError: If no page count can be found in the search result
                text.
        """
        total_text = search(KEYWORD)
        # ``or ""`` keeps the failure mode a clear ValueError when no text
        # was returned at all.
        match = re.search(r"\d+", total_text or "")
        if match is None:
            raise ValueError(
                "could not find a page count in %r" % (total_text,))
        page_count = int(match.group(0))

        flag = ''
        # Page 1 was already scraped by search(); continue from page 2.
        for i in range(2, page_count + 1):
            flag = next_page(i)
        return flag
    
    
    if __name__ == "__main__":
        try:
            main()
        finally:
            # Release the browser process and the MongoDB connection even
            # when the scrape fails, so neither is left running.
            driver.quit()
            client.close()
    

      

  • 相关阅读:
    中美贸易战再次开启,世界两极化进程正在加快形成!..... Copyright: 1688澳洲新闻网 Read more at: https://www.1688.com.au/world/international/2018/06/17/369368/
    缠师的博客中关于舒伯特中的回帖,细思极恐
    南怀瑾,脱了国学大师的外衣,只剩下江湖和名利
    你可能修了一个假的“不净观”
    陈大惠老师:什么叫道德?
    C#-正则,常用几种数据解析-端午快乐
    html5+go+websocket简单实例代码
    Task三个列子的分享
    golang-web框架revel一个表单提交的总结
    百度流行音乐-资源数据整合
  • 原文地址:https://www.cnblogs.com/nixingguo/p/7266507.html
Copyright © 2011-2022 走看看