zoukankan      html  css  js  c++  java
  • 基于selenium爬取京东

    爬取iphone

    注意:对当前网页做任意操作后,browser对象都会随之发生变化

    import time
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    #
    
    if __name__ == '__main__':
        # Crawl JD.com search results for "iphone", sorted by sales, and print
        # one dict (price / name / comment / shop) per product on every page.
        #
        # By is imported here so this script block stays self-contained. The
        # locator form find_element(By.X, ...) is used because the
        # find_element_by_* helpers were removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        browser = webdriver.Chrome()
        # try/finally guarantees the browser process is closed even when an
        # element lookup or click raises part-way through the crawl.
        try:
            browser.get('https://www.jd.com')
            # Search for "iphone" through the home page search box.
            _input = browser.find_element(By.ID, 'key')
            _input.send_keys('iphone')
            _input.send_keys(Keys.ENTER)
            time.sleep(5)
            # Sort the results by sales volume.
            sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
            sales.click()

            has_next = True
            while has_next:
                # Read the current page number from the "skip to page" input.
                time.sleep(5)
                page_input = browser.find_element(
                    By.XPATH,
                    '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
                )
                cur_page = page_input.get_attribute('value')
                print('-------------------------   当前页码 {}  -------------------------'.format(cur_page))

                # The product list is loaded lazily as the page scrolls, so
                # scroll down to the "next page" button to load everything.
                next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
                y = next_page.location['y']
                browser.execute_script('window.scrollTo(0, {})'.format(y))
                time.sleep(3)
                # Collect every product item on the current page.
                p_list = browser.find_elements(By.CLASS_NAME, 'gl-item')
                for p in p_list:
                    production = {}
                    # The SKU is embedded in the class/id of the price and
                    # comment elements, so it is needed to locate them.
                    sku = p.get_attribute('data-sku')
                    production['price'] = p.find_element(By.CSS_SELECTOR, 'strong.J_{}'.format(sku)).text
                    production['name'] = p.find_element(By.CSS_SELECTOR, 'div.p-name>a>em').text
                    production['comment'] = p.find_element(By.ID, 'J_comment_{}'.format(sku)).text
                    production['shop'] = p.find_element(By.CSS_SELECTOR, 'div.p-shop>span>a').get_attribute('title')
                    print(production)

                # Re-locate the "next page" button after the page has changed.
                cur_next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
                # A "disabled" class on the button marks the last page.
                if 'disabled' in cur_next_page.get_attribute('class'):
                    has_next = False
                else:
                    cur_next_page.click()
        finally:
            browser.quit()

     优化

    import time
    import sys
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    #
    
    if __name__ == '__main__':
        # Crawl JD.com search results for a keyword, sorted by sales, and print
        # one dict (price / name / comment / shop) per product on every page.
        # The keyword defaults to 'iphone' and is overridden by the first
        # command-line argument when one is given.
        #
        # By is imported here so this script block stays self-contained. The
        # locator form find_element(By.X, ...) is used because the
        # find_element_by_* helpers were removed in Selenium 4.3.
        from selenium.webdriver.common.by import By

        keyword = 'iphone'
        if len(sys.argv) > 1:
            keyword = sys.argv[1]

        browser = webdriver.Chrome()
        # try/finally guarantees the browser process is closed even when an
        # element lookup or click raises part-way through the crawl.
        try:
            browser.get('https://www.jd.com')
            # Type the keyword into the home page search box and submit it.
            _input = browser.find_element(By.ID, 'key')
            _input.send_keys(keyword)
            _input.send_keys(Keys.ENTER)
            time.sleep(5)
            # Sort the results by sales volume.
            sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
            sales.click()

            has_next = True
            while has_next:
                # Read the current page number from the "skip to page" input.
                time.sleep(5)
                page_input = browser.find_element(
                    By.XPATH,
                    '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
                )
                cur_page = page_input.get_attribute('value')
                print('-------------------------   当前页码 {}  -------------------------'.format(cur_page))

                # The product list is loaded lazily as the page scrolls, so
                # scroll down to the "next page" button to load everything.
                next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
                y = next_page.location['y']
                browser.execute_script('window.scrollTo(0, {})'.format(y))
                time.sleep(3)
                # Collect every product item on the current page.
                p_list = browser.find_elements(By.CLASS_NAME, 'gl-item')
                for p in p_list:
                    production = {}
                    # The SKU is embedded in the class/id of the price and
                    # comment elements, so it is needed to locate them.
                    sku = p.get_attribute('data-sku')
                    production['price'] = p.find_element(By.CSS_SELECTOR, 'strong.J_{}'.format(sku)).text
                    production['name'] = p.find_element(By.CSS_SELECTOR, 'div.p-name>a>em').text
                    production['comment'] = p.find_element(By.ID, 'J_comment_{}'.format(sku)).text
                    production['shop'] = p.find_element(By.CSS_SELECTOR, 'div.p-shop>span>a').get_attribute('title')
                    print(production)

                # Re-locate the "next page" button after the page has changed.
                cur_next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
                # A "disabled" class on the button marks the last page.
                if 'disabled' in cur_next_page.get_attribute('class'):
                    has_next = False
                else:
                    cur_next_page.click()
        finally:
            browser.quit()

    通过sys.argv传入搜索关键字,使用方式如下:

    python jd.py mac

    补充

    sys.argv[0] 是脚本的名称

    sys.argv[1] 是传入的第一个命令行参数(此处为搜索关键字)

  • 相关阅读:
    MySQL-InnoDB锁(一)
    Java方法调用机制
    并发编程-锁相关的内存语义
    Java开发工具汇总
    并发编程-底层实现原理
    并发编程-Java内存模型
    JsonTest
    PTA(Basic Level)1034.有理数四则运算
    PTA(Advanced Level)1081.Rational Sum
    PTA(Advanced Level)1008.Elevator
  • 原文地址:https://www.cnblogs.com/wt7018/p/11874823.html
Copyright © 2011-2022 走看看