zoukankan      html  css  js  c++  java
  • python selenium 爬取淘宝

    # -*- coding:utf-8 -*-
    # author : yesehngbao 
    # time:2018/3/29
    
    
    import re
    import pymongo
    
    
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    
    
    # from selenium.webdriver.common.utils import Keys
    
    # MongoDB connection settings read by save_mongo().
    MONGO_HOST = 'localhost'
    MONGO_PORT = 27017
    # Database and collection that scraped items are written to.
    MONGO_DB = 'test'
    MONGO_COLL = 'selenum_tao'



    # Single Chrome WebDriver session, created at import time and shared by
    # get_page_num(), gain_page() and get_page_html().
    webdir = webdriver.Chrome()
    
    
    def get_page_num():
        """Search Taobao for the hard-coded keyword and return the page count.

        Loads the Taobao home page in the shared ``webdir`` browser, types the
        keyword into the search box (``#q``), clicks the search button and
        reads the pager's "total" label.

        Returns:
            str: The first run of digits found in the pager's total label.
            The caller is expected to convert it with ``int``.

        Raises:
            ValueError: If the total label contains no digits.
            selenium.common.exceptions.TimeoutException: If one of the
                awaited elements does not show up within 10 seconds.
        """
        webdir.get('http://www.taobao.com')
        search_box = WebDriverWait(webdir, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        search_button = WebDriverWait(webdir, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
        search_box.clear()
        search_box.send_keys('衬衫')
        search_button.click()
        total_text = WebDriverWait(webdir, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))).text
        # The pattern must be r'\d+' (digits); a bare 'd+' only matches the
        # letter "d" and therefore never finds the page count.
        match = re.search(r'\d+', total_text)
        if match is None:
            raise ValueError(
                'no page count found in pager text: {!r}'.format(total_text))
        return match.group(0)
    
    
    def gain_page(page, max_retries=3):
        """Jump the search-result listing to ``page`` using the pager input.

        Waits for the current result items to be present, types the page
        number into the pager's input box, submits it, and then waits until
        the pager's active item shows that page number.

        Args:
            page: Page number to navigate to.
            max_retries: How many additional attempts to make after a failed
                one. Values below zero are treated as zero.

        Raises:
            Exception: Whatever the last attempt raised, once all attempts
                have failed.
        """
        attempts = max(1, max_retries + 1)
        for attempt in range(1, attempts + 1):
            try:
                WebDriverWait(webdir, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '.items .item .pic a img')))
                page_input = WebDriverWait(webdir, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.J_Input')))
                submit = WebDriverWait(webdir, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '.J_Submit')))
                page_input.clear()
                page_input.send_keys(str(page))
                submit.click()
                WebDriverWait(webdir, 10).until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR,
                         '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                        str(page)))
                return
            except Exception:
                # Retry a bounded number of times instead of recursing
                # forever; surface the real error once attempts run out.
                if attempt == attempts:
                    raise
    
    
    def get_page_html(page):
        """Return the shared browser's current page source.

        Args:
            page: Page indicator; only its truthiness is checked.

        Returns:
            The value of ``webdir.page_source`` when ``page`` is truthy,
            otherwise ``None``.
        """
        if not page:
            return None
        return webdir.page_source
    
    
    def analysis_page(html):
        """Yield one record per product found in a search-result page.

        Parses ``html`` with lxml and, for every ``div`` under the ``items``
        container whose class contains "item", extracts the image's
        ``data-src`` attribute and the price text.

        Args:
            html: Page source as a string. ``None`` or an empty string yields
                nothing (``get_page_html`` may return ``None``).

        Yields:
            dict: ``{'img': <data-src value>, 'money': <price text>}``.
        """
        if not html:
            return
        doc = etree.HTML(html)
        if doc is None:
            return
        div_list = doc.xpath('.//div[@class="items"]//div[contains(@class,"item")]')
        for div in div_list:
            imgs = div.xpath('.//div[@class="pic"]/a/img/@data-src')
            prices = div.xpath('.//div[contains(@class, "price")]/strong/text()')
            # The class filter above is loose and can match divs that are not
            # complete product cards; skip those instead of failing the whole
            # page with an IndexError.
            if not imgs or not prices:
                continue
            yield {
                'img': imgs[0],
                'money': prices[0],
            }
    
    
    def save_mongo(content):
        """Insert scraped records into the configured MongoDB collection.

        Args:
            content: Either a single document (``dict``) or an iterable of
                documents, such as the generator returned by
                ``analysis_page``.

        Nothing is written, and no connection is opened, when ``content``
        produces no documents.
        """
        # Materialise the iterable so an empty batch can be detected up front.
        docs = [content] if isinstance(content, dict) else list(content)
        if not docs:
            return
        mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        try:
            coll = mongo_client[MONGO_DB][MONGO_COLL]
            # Collection.insert() is deprecated and absent from PyMongo 4;
            # insert_many() is the supported bulk API.
            coll.insert_many(docs)
        finally:
            # Release the connection opened for this call.
            mongo_client.close()
    
    
    def main():
        """Scrape every result page for the search and store it in MongoDB.

        Determines the total page count, then for each page: navigates to it,
        grabs the page source, parses the product records and saves them.
        The shared browser session is always shut down at the end, even if
        scraping fails part-way.
        """
        try:
            page_num = get_page_num()
            for page in range(1, int(page_num) + 1):
                gain_page(page)
                html = get_page_html(page)
                content = analysis_page(html)
                save_mongo(content)
        finally:
            # Close the browser and its driver process so they are not left
            # running after the script ends or crashes.
            webdir.quit()
    
    
    # Run the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    python matplotlib 绘图
    python set add 导致问题 TypeError: unhashable type: 'list'
    python 子类继承父类的__init__方法
    python 内存监控模块之memory_profiler
    git log 常用命令
    wireshark使用教程
    python os.path模块
    Linux crontab 定时任务
    linux环境变量LD_LIBRARY_PATH
    Linux的ldconfig和ldd用法
  • 原文地址:https://www.cnblogs.com/yijian001/p/8848159.html
Copyright © 2011-2022 走看看