zoukankan      html  css  js  c++  java
  • python selenium 爬取淘宝

    # -*- coding:utf-8 -*-
    # author : yesehngbao 
    # time:2018/3/29
    
    
    import re
    import pymongo
    
    
    from lxml import etree
    from selenium import webdriver
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    
    
    # from selenium.webdriver.common.keys import Keys
    
    # MongoDB connection settings read by save_mongo().
    MONGO_HOST = 'localhost'
    MONGO_PORT = 27017
    MONGO_DB = 'test'  # database name
    MONGO_COLL = 'selenum_tao'  # collection name
    
    
    
    # Single Chrome WebDriver instance shared by every function below. It is
    # constructed as soon as this module is loaded, not when main() runs.
    webdir = webdriver.Chrome()
    
    
    def get_page_num():
        """Search Taobao for '衬衫' and return the total number of result pages.

        Opens the Taobao home page in the shared ``webdir`` driver, types the
        keyword into the search box, submits it, and reads the pager's
        "total" label on the result page.

        Returns:
            str: the first run of digits found in the pager's total text.

        Raises:
            ValueError: if the pager text contains no digits.
            Exception: whatever ``WebDriverWait.until`` raises when an element
                does not show up within 10 seconds.
        """
        webdir.get('http://www.taobao.com')
        wait = WebDriverWait(webdir, 10)
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        search_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
        search_box.clear()
        search_box.send_keys('衬衫')
        search_button.click()
        total_text = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
            )
        ).text
        # The pattern must be r'\d+': a bare 'd+' matches the letter "d", so the
        # page count was never extracted from the label.
        numbers = re.findall(r'\d+', total_text)
        if not numbers:
            raise ValueError('no page count found in pager text: %r' % total_text)
        return numbers[0]
    
    
    def gain_page(page, max_retries=3):
        """Jump the current result list to ``page`` via the pager's input box.

        Waits for the product images of the current page, types ``page`` into
        the pager input, clicks the submit button, and then waits until the
        pager's active item shows ``page``.

        Args:
            page: page number to jump to.
            max_retries: how many additional attempts to make after the first
                one fails. The previous implementation retried by calling
                itself with no limit, so a persistent failure ended in
                unbounded recursion instead of a useful error.

        Raises:
            Exception: the error from the last attempt, once every retry has
                been used up.
        """
        for attempt in range(max_retries + 1):
            try:
                wait = WebDriverWait(webdir, 10)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.items .item .pic a img')))
                page_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.J_Input')))
                submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.J_Submit')))
                page_box.clear()
                page_box.send_keys(str(page))
                submit.click()
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                        str(page),
                    )
                )
                return
            except Exception:
                # Retry the whole sequence, but surface the error once the
                # retry budget is exhausted rather than looping forever.
                if attempt >= max_retries:
                    raise
    
    
    def get_page_html(page):
        """Return the shared driver's current page source.

        Returns ``None`` without touching the driver when ``page`` is falsy.
        """
        return webdir.page_source if page else None
    
    
    def analysis_page(html):
        """Yield one ``{'img': ..., 'money': ...}`` dict per product in ``html``.

        Args:
            html: HTML text of a search-result page. A falsy value (for
                example the ``None`` that get_page_html() returns for a falsy
                page) yields nothing.

        Yields:
            dict: ``'img'`` is the ``data-src`` attribute of the product
            picture and ``'money'`` is the text of the price ``<strong>``.
        """
        if not html:
            return
        doc = etree.HTML(html)
        if doc is None:
            return
        # contains(@class, "item") is a substring match, so it can also select
        # divs that are not product cards. Such nodes have no picture or price
        # and are skipped instead of raising IndexError mid-iteration.
        for node in doc.xpath('.//div[@class="items"]//div[contains(@class,"item")]'):
            images = node.xpath('.//div[@class="pic"]/a/img/@data-src')
            prices = node.xpath('.//div[contains(@class, "price")]/strong/text()')
            if not images or not prices:
                continue
            yield {
                'img': images[0],
                'money': prices[0],
            }
    
    
    def save_mongo(content):
        """Insert scraped documents into the configured MongoDB collection.

        Args:
            content: a single document (dict) or any iterable of documents,
                such as the generator returned by analysis_page().

        The client is closed after the write so repeated calls do not
        accumulate open connections. Nothing is written, and no connection is
        opened, when ``content`` is empty.
        """
        # Materialise first: insert_many needs a non-empty list, and a
        # generator can only be consumed once.
        documents = [content] if isinstance(content, dict) else list(content)
        if not documents:
            return
        mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        try:
            # Collection.insert is deprecated and was removed in PyMongo 4.
            mongo_client[MONGO_DB][MONGO_COLL].insert_many(documents)
        finally:
            mongo_client.close()
    
    
    def main():
        """Scrape every result page for the search keyword and store the items.

        Reads the total page count, then for each page jumps to it, grabs the
        page source, parses the products, and saves them to MongoDB. The shared
        browser is always quit at the end, including when a step raises, so no
        Chrome session is left running.
        """
        try:
            page_num = get_page_num()
            for page in range(1, int(page_num) + 1):
                gain_page(page)
                html = get_page_html(page)
                content = analysis_page(html)
                save_mongo(content)
        finally:
            webdir.quit()
    
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    hangfire+bootstrap ace 模板实现后台任务管理平台
    jquery.tagthis和jquery.autocomplete一起实现标签
    jquery.autocomplete自动补齐和自定义格式
    C#控制台程序的参数解析类库 CommandLine简单使用说明
    排序算法之折半插入排序的思想以及Java实现
    排序算法之直接插入排序的思想以及Java实现
    Call to a member function display() on a non-object问题的解决
    jQuery设置元素的readonly和disabled属性
    JAVA之Math类常用数学运算记录
    Syntax error, parameterized types are only available if source level is 1.5 解决方案
  • 原文地址:https://www.cnblogs.com/yijian001/p/8848159.html
Copyright © 2011-2022 走看看