zoukankan      html  css  js  c++  java
  • python selenium 爬取淘宝

    # -*- coding:utf-8 -*-
    # author : yesehngbao 
    # time:2018/3/29
    
    
    import re
    import pymongo
    
    
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    
    
    # from selenium.webdriver.common.utils import Keys
    
    # MongoDB connection settings used by save_mongo().
    MONGO_HOST = 'localhost'
    MONGO_PORT = 27017
    # Database and collection that receive the scraped items.
    MONGO_DB = 'test'
    MONGO_COLL = 'selenum_tao'
    
    
    
    # Shared Chrome WebDriver used by every scraping function below.
    # NOTE: it is created at module import time, not inside main().
    webdir = webdriver.Chrome()
    
    
    def get_page_num():
        """Search Taobao and return the total number of result pages.

        Opens the Taobao home page in the shared ``webdir`` browser, types the
        hard-coded keyword into the search box, submits it, and reads the
        pager's "total" label.

        Returns:
            str: the first run of digits found in the pager label. Callers
            convert it with ``int``.

        Raises:
            ValueError: if the pager label contains no digits.
            Any exception raised by the Selenium waits (for example when an
            element does not show up within 10 seconds) propagates unchanged.
        """
        webdir.get('http://www.taobao.com')
        # Named search_box rather than ``input`` so the builtin is not shadowed.
        search_box = WebDriverWait(webdir, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        search_button = WebDriverWait(webdir, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
        search_box.clear()
        search_box.send_keys('衬衫')
        search_button.click()
        total_text = WebDriverWait(webdir, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))).text
        # The pattern must be r'\d+': a bare 'd+' matches the letter "d", not
        # digits, and would never find the page count.
        match = re.search(r'\d+', total_text)
        if match is None:
            raise ValueError('no page count found in pager text: %r' % total_text)
        return match.group()
    
    
    def gain_page(page, max_attempts=3):
        """Jump the search results shown in ``webdir`` to page ``page``.

        Waits for the current result items, types the page number into the
        pager's input box, submits it, and waits until the pager's active item
        shows that number.

        Args:
            page: page number to navigate to.
            max_attempts: how many times to try the navigation before giving
                up. Values below 1 are treated as 1.

        Raises:
            Exception: the error from the final attempt is re-raised once all
                attempts have failed.
        """
        attempts = max(1, max_attempts)
        for attempt in range(1, attempts + 1):
            try:
                WebDriverWait(webdir, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '.items .item .pic a img')))
                page_box = WebDriverWait(webdir, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.J_Input')))
                submit = WebDriverWait(webdir, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.J_Submit')))
                page_box.clear()
                page_box.send_keys(page)
                submit.click()
                WebDriverWait(webdir, 10).until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR,
                         '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                        str(page)))
                return
            except Exception:
                # Retry a bounded number of times instead of recursing without
                # limit; surface the failure once the budget is used up.
                if attempt == attempts:
                    raise
    
    
    def get_page_html(page):
        """Return the browser's current page source, or None if ``page`` is falsy."""
        if not page:
            return None
        return webdir.page_source
    
    
    def analysis_page(html):
        """Yield one dict per product found in a search-result page.

        Args:
            html: page source as a string. A falsy value yields nothing.

        Yields:
            dict: ``{'img': <value of the image's data-src attribute>,
            'money': <text of the price element>}``.

        Matched divs that lack either the image attribute or the price text
        are skipped, so one incomplete entry does not abort the whole page.
        """
        if not html:
            return
        doc = etree.HTML(html)
        if doc is None:
            # Defensive: nothing to query if parsing produced no tree.
            return
        item_divs = doc.xpath('.//div[@class="items"]//div[contains(@class,"item")]')
        for div in item_divs:
            img_srcs = div.xpath('.//div[@class="pic"]/a/img/@data-src')
            prices = div.xpath('.//div[contains(@class, "price")]/strong/text()')
            # contains(@class, "item") is a substring match and can select divs
            # that are not complete product cards; indexing [0] on an empty
            # result would raise IndexError.
            if not img_srcs or not prices:
                continue
            yield {
                'img': img_srcs[0],
                'money': prices[0],
            }
    
    
    def save_mongo(content):
        """Insert scraped items into the configured MongoDB collection.

        Args:
            content: a single document (dict) or any iterable of documents,
                including a generator. An empty iterable is a no-op.

        The client is opened for this call and always closed afterwards, so
        repeated calls do not accumulate open connections.
        """
        # Materialise first: generators can only be consumed once, and
        # insert_many rejects an empty batch.
        docs = [content] if isinstance(content, dict) else list(content)
        if not docs:
            return
        mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        try:
            coll = mongo_client[MONGO_DB][MONGO_COLL]
            # insert_many replaces the deprecated Collection.insert, which was
            # removed in PyMongo 4.
            coll.insert_many(docs)
        finally:
            mongo_client.close()
    
    
    def main():
        """Scrape every result page for the search and store the items in MongoDB.

        Reads the total page count, then for each page navigates to it, grabs
        the page source, parses the items and saves them. The shared browser
        is quit when the run ends, whether it succeeded or failed.
        """
        try:
            page_num = get_page_num()
            for page in range(1, int(page_num) + 1):
                gain_page(page)
                html = get_page_html(page)
                content = analysis_page(html)
                save_mongo(content)
        finally:
            # Always shut the browser down so no Chrome process is left behind.
            webdir.quit()
    
    
    # Run the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    Lua调用C++时打印堆栈信息
    Node.js批量去除BOM文件
    cocos2d-x中CCLabelAtlas的小图片拼接
    node.js使用mysql模块的坑
    关于chrome插件编写的小结
    【吐槽】如风达快递
    bat调用TexturePacker更新SpriteSheet
    使用node-webkit实现打包工具的小结
    使用devenv.exe自动编译项目
    svn导出文件进行比较
  • 原文地址:https://www.cnblogs.com/yijian001/p/8848159.html
Copyright © 2011-2022 走看看