zoukankan      html  css  js  c++  java
  • selenium实战演练

    利用 Selenium 和 PyQuery 爬取当当网图书信息,并将数据同时存入文本文件和 MongoDB 数据库中。

    配置文件:

    # Search keyword typed into the Dangdang search box by search().
    key="python"
    # Host passed to pymongo.MongoClient.
    MONGO_URL='localhost'
    # Name of the MongoDB database selected on the client.
    MONGO_DB='dangdang'
    # Name of the collection the scraped records are inserted into.
    MONGO_TABLE='book'

    dangdang.py

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from config import *
    from selenium.common.exceptions import TimeoutException
    from pyquery import PyQuery as pq
    import json
    import pymongo
    
    # Shared browser session used by every function below.
    # Alternative noted by the author: driver=webdriver.PhantomJS() for a headless browser.
    driver = webdriver.Chrome()
    # MongoDB client connected to the host named in the config module.
    client=pymongo.MongoClient(MONGO_URL)
    # Handle to the database that receives the scraped records.
    db=client[MONGO_DB]
    
    def search(retries=3):
        """
        Open the Dangdang book home page, submit the configured keyword and
        read the pagination entry that the caller parses as the page count.

        :param retries: maximum number of attempts when the page elements do
            not show up within the wait timeout (values below 1 count as 1).
        :return: text of the third-from-last pagination item; an empty string
            when the selector matches nothing.
        :raises TimeoutException: when every attempt timed out. The original
            retried by recursing without limit, which could only end in a
            RecursionError.
        """
        attempts = max(1, retries)
        for attempt in range(1, attempts + 1):
            try:
                driver.get("http://book.dangdang.com")
                wait = WebDriverWait(driver, 10)
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#key_S"))
                )
                # Wait until the button can actually be clicked, not merely
                # until it exists in the DOM.
                submit = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#form_search_new > input.button"))
                )
                search_box.send_keys(key)
                submit.click()
                doc = pq(driver.page_source)
                return doc('div.paging > ul > li:nth-last-child(3)').text()
            except TimeoutException:
                # Re-raise on the last attempt instead of retrying forever.
                if attempt == attempts:
                    raise
    
    def get_one_page():
        """
        Yield one record per list item of the result page currently loaded
        in the browser.

        The page source is read when iteration starts, not when the function
        is called.

        :return: generator of dicts with the keys ``title``, ``src``, ``img``,
            ``price`` and ``discount``.
        """
        page = pq(driver.page_source)
        for entry in page('#search_nature_rg ul li').items():
            anchors = entry.find('a')
            title = anchors.attr('title')
            link = anchors.attr('href')
            # The image URL is taken from the ``data-original`` attribute.
            image = entry.find('a img').attr('data-original')
            price = entry.find('.search_pre_price').text()
            discount = entry.find('.search_discount').text()
            record = {
                'title': title,
                'src': link,
                'img': image,
                'price': price,
                'discount': discount,
            }
            yield record
    
    def save_to_file(result):
        """
        Append one record to ``dangdang.txt`` as a single JSON line.

        :param result: JSON-serialisable dict describing one book.
        :return: None
        """
        # ensure_ascii=False keeps Chinese text readable in the file.
        line = json.dumps(result, ensure_ascii=False) + '\n'
        # The with-block closes the file; no explicit close() is needed.
        with open('dangdang.txt', 'a', encoding='utf-8') as f:
            f.write(line)
    
    def save_to_mongo(result):
        """
        Insert one record into the configured MongoDB collection.

        Storage stays best-effort: a failure is reported on stdout together
        with the error and is not re-raised.

        :param result: dict describing one book; the driver adds an ``_id``
            key to it on insertion.
        :return: None
        """
        try:
            # Collection.insert() is deprecated and was removed in PyMongo 4;
            # insert_one() is the supported single-document call.
            db[MONGO_TABLE].insert_one(result)
        except Exception as exc:
            # Include the error so a failed insert can be diagnosed.
            print('存储失败', exc)
        else:
            print('存储成功',result)
    
    def next_page(num, retries=3):
        """
        Jump to a given result page by typing its number into the page box
        and clicking the go button.

        :param num: page number to load.
        :param retries: maximum number of attempts when the pagination
            controls do not show up within the wait timeout (values below 1
            count as 1).
        :return: None
        :raises TimeoutException: when every attempt timed out. The original
            retried by recursing without limit.
        """
        attempts = max(1, retries)
        for attempt in range(1, attempts + 1):
            try:
                wait = WebDriverWait(driver, 10)
                page_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#t__cp"))
                )
                # Wait until the button can actually be clicked, not merely
                # until it exists in the DOM.
                go_button = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#click_get_page"))
                )
                page_box.clear()
                page_box.send_keys(str(num))
                go_button.click()
                return
            except TimeoutException:
                # Re-raise on the last attempt instead of retrying forever.
                if attempt == attempts:
                    raise
    
    
    def main():
        """
        Run the whole scrape: search, walk every result page and store each
        record in the text file and in MongoDB.

        The first result page is scraped straight after the search; the
        original started at page 2 and skipped it. The browser is closed when
        the run ends, including when it ends with an error.

        :return: None
        """
        try:
            total_text = search()
            # No numeric pagination entry is treated as a single page rather
            # than crashing in int().
            total_num = int(total_text) if total_text and total_text.isdigit() else 1
            for page in range(1, total_num + 1):
                # Page 1 is already displayed after the search.
                if page > 1:
                    next_page(page)
                for result in get_one_page():
                    save_to_file(result)
                    save_to_mongo(result)
        finally:
            # Release the browser process started at import time.
            driver.quit()

    if __name__ == "__main__":
        main()

    存入到文件中的数据:

    {"img": null, "src": "http://product.dangdang.com/25228733.html", "price": "¥109.00", "title": " Python数据科学手册", "discount": "定价: (9.6折)"}
    {"img": "http://img3m2.ddimg.cn/63/7/25173882-1_b_3.jpg", "src": "http://product.dangdang.com/25173882.html", "price": "¥69.00", "title": " Python游戏编程快速上手 第4版", "discount": "定价: (9.6折)"}
    {"img": "http://img3m7.ddimg.cn/75/5/25232007-1_b_3.jpg", "src": "http://product.dangdang.com/25232007.html", "price": "¥79.00", "title": " Python机器学习基础教程", "discount": "定价: (9.6折)"}
    {"img": "http://img3m5.ddimg.cn/8/8/25307675-1_b_2.jpg", "src": "http://product.dangdang.com/25307675.html", "price": "¥59.00", "title": " 深度学习入门 基于Python的理论与实现", "discount": "定价: (9.6折)"}
    {"img": "http://img3m6.ddimg.cn/10/10/25098886-1_b_2.jpg", "src": "http://product.dangdang.com/25098886.html", "price": "¥79.00", "title": " Python高性能编程", "discount": "定价: (9.6折)"}
    {"img": "http://img3m7.ddimg.cn/32/32/27857147-1_b_1.jpg", "src": "http://product.dangdang.com/27857147.html", "price": "¥69.00", "title": " Python深度学习实战:基于TensorFlow和Keras的聊天机器人以及人脸、物体和语音识别", "discount": "定价: (9.6折)"}
    {"img": "http://img3m3.ddimg.cn/85/16/25060153-1_b_2.jpg", "src": "http://product.dangdang.com/25060153.html", "price": "¥69.00", "title": " Python极客项目编程", "discount": "定价: (9.6折)"}
    {"img": "http://img3m7.ddimg.cn/96/32/26475567-1_b_2.jpg", "src": "http://product.dangdang.com/26475567.html", "price": "¥69.00", "title": " 小小的Python编程故事", "discount": "定价: (9.6折)"}
    {"img": "http://img3m6.ddimg.cn/32/4/25339676-1_b_2.jpg", "src": "http://product.dangdang.com/25339676.html", "price": "¥59.00", "title": " Python数据可视化之matplotlib实践", "discount": "定价: (9.6折)"}
    {"img": "http://img3m1.ddimg.cn/9/0/25107201-1_b_3.jpg", "src": "http://product.dangdang.com/25107201.html", "price": "¥69.00", "title": " Python与机器学习实战:决策树、集成学习、支持向量机与神经网络算法详解及编程实现", "discount": "定价: (9.6折)"}
    {"img": "http://img3m7.ddimg.cn/35/32/25240877-1_b_2.jpg", "src": "http://product.dangdang.com/25240877.html", "price": "¥69.00", "title": " Python数据分析从入门到精通", "discount": "定价: (9.6折)"}
    {"img": "http://img3m2.ddimg.cn/15/23/27866832-1_b_3.jpg", "src": "http://product.dangdang.com/27866832.html", "price": "¥69.00", "title": " Python自动化开发实战", "discount": "定价: (8.8折)"}
    {"img": "http://img3m6.ddimg.cn/78/36/26485746-1_b_2.jpg", "src": "http://product.dangdang.com/26485746.html", "price": "¥79.00", "title": " Python量化交易实战", "discount": "定价: (8.8折)"}
    {"img": "http://img3m5.ddimg.cn/82/2/25218055-1_b_3.jpg", "src": "http://product.dangdang.com/25218055.html", "price": "¥89.00", "title": " Python 3学习笔记(上卷)", "discount": "定价: (9.6折)"}
    {"img": "http://img3m4.ddimg.cn/31/7/25219984-1_b_1.jpg", "src": "http://product.dangdang.com/25219984.html", "price": "¥69.00", "title": " Python贝叶斯分析", "discount": "定价: (9.6折)"}
    {"img": "http://img3m5.ddimg.cn/43/0/23617285-1_b_1.jpg", "src": "http://product.dangdang.com/23617285.html", "price": "¥38.00", "title": " Python袖珍指南(第五版)", "discount": "定价: (6.9折)"}
    {"img": "http://img3m9.ddimg.cn/76/20/24104299-1_b_12.jpg", "src": "http://product.dangdang.com/24104299.html", "price": "¥49.50", "title": " 零基础入门学习Python", "discount": "定价: (8.81折)"}
    {"img": "http://img3m7.ddimg.cn/8/12/26913257-1_b_2.jpg", "src": "http://product.dangdang.com/26913257.html", "price": "¥139.00", "title": " Python经典实例", "discount": "定价: (9.6折)"}
    {"img": "http://img3m8.ddimg.cn/57/19/25113288-1_b_2.jpg", "src": "http://product.dangdang.com/25113288.html", "price": "¥39.00", "title": " Python设计模式 第2版", "discount": "定价: (9.59折)"}
    {"img": "http://img3m8.ddimg.cn/32/34/25253348-1_b_3.jpg", "src": "http://product.dangdang.com/25253348.html", "price": "¥49.00", "title": " Python 3爬虫、数据清洗与可视化实战", "discount": "定价: (9.6折)"}
    {"img": "http://img3m8.ddimg.cn/12/34/27859998-1_b_3.jpg", "src": "http://product.dangdang.com/27859998.html", "price": "¥88.00", "title": " Python程序设计", "discount": "定价: (9.61折)"}
    {"img": "http://img3m9.ddimg.cn/15/19/24220959-1_b_3.jpg", "src": "http://product.dangdang.com/24220959.html", "price": "¥59.00", "title": " 跟老齐学Python:轻松入门", "discount": "定价: (9.6折)"}
    {"img": "http://img3m0.ddimg.cn/39/27/25216230-1_b_6.jpg", "src": "http://product.dangdang.com/25216230.html", "price": "¥128.00", "title": " Head First Python(第二版)", "discount": "定价: (6.9折)"}

    存入到MongoDB中:

  • 相关阅读:
    Python基础:内置函数
    Python基础:获取平台相关信息
    Python生成器-博文读后感
    IP地址、域名、域名解析系统相关
    Flask:初次使用Flask-SQLAlchemy读取SQLite3
    Flask:初次使用Blueprints
    Flask:使用Eclipse+PyDev插件编辑基于package的项目
    Flask:操作SQLite3(0.1)
    SQLite数据库初步
    Flask:redirect()函数
  • 原文地址:https://www.cnblogs.com/shenjianping/p/10929886.html
Copyright © 2011-2022 走看看