爬取iphone
注意:对当前网页做任意操作(点击、翻页、滚动等)之后,browser 对象对应的页面内容都会发生变化,因此操作之后需要重新查找元素,不能继续使用之前获取的元素。
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Scrape JD.com search results for "iphone", sorted by sales, and print one
# dict (price / name / comment / shop) per product, page after page, until
# the "next page" link is disabled.
browser = webdriver.Chrome()
try:
    browser.get('https://www.jd.com')

    # Type the keyword into the search box and submit it.
    _input = browser.find_element(By.ID, 'key')
    _input.send_keys('iphone')
    _input.send_keys(Keys.ENTER)
    time.sleep(5)

    # Sort by sales: click the second link in the sort bar.
    sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
    sales.click()

    has_next = True
    while has_next:
        # Wait for the page to settle, then read the current page number
        # from the "skip to page" input at the bottom of the list.
        time.sleep(5)
        cur_page = browser.find_element(
            By.XPATH,
            '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
        ).get_attribute('value')
        print('------------------------- 当前页码 {} -------------------------'.format(cur_page))

        # Products are loaded as the page scrolls down, so scroll to the
        # vertical position of the "next page" link before collecting them.
        next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
        y = next_page.location['y']
        browser.execute_script('window.scrollTo(0, {})'.format(y))
        time.sleep(3)

        # Collect every product entry on the current page.
        p_list = browser.find_elements(By.CLASS_NAME, 'gl-item')
        for p in p_list:
            production = {}
            # The SKU is embedded in the class/id of the price and comment nodes.
            sku = p.get_attribute('data-sku')
            production['price'] = p.find_element(
                By.CSS_SELECTOR, 'strong.J_{}'.format(sku)).text
            production['name'] = p.find_element(
                By.CSS_SELECTOR, 'div.p-name>a>em').text
            production['comment'] = p.find_element(
                By.ID, 'J_comment_{}'.format(sku)).text
            production['shop'] = p.find_element(
                By.CSS_SELECTOR, 'div.p-shop>span>a').get_attribute('title')
            print(production)

        # Re-locate the "next page" link after scrolling instead of reusing
        # the element found earlier in this iteration.
        cur_next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
        # A "disabled" class on the link marks the last page.
        if 'disabled' in cur_next_page.get_attribute('class'):
            has_next = False
        else:
            cur_next_page.click()
finally:
    # Always close the browser, even if a lookup above raised.
    browser.quit()
优化
import sys
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Scrape JD.com search results for a keyword, sorted by sales, and print one
# dict (price / name / comment / shop) per product, page after page, until
# the "next page" link is disabled.
#
# The keyword defaults to 'iphone' and can be overridden by the first
# command-line argument.
keyword = 'iphone'
if len(sys.argv) > 1:
    keyword = sys.argv[1]

browser = webdriver.Chrome()
try:
    browser.get('https://www.jd.com')

    # Type the keyword into the search box and submit it.
    _input = browser.find_element(By.ID, 'key')
    _input.send_keys(keyword)
    _input.send_keys(Keys.ENTER)
    time.sleep(5)

    # Sort by sales: click the second link in the sort bar.
    sales = browser.find_element(By.XPATH, '//div[@class="f-sort"]/a[2]')
    sales.click()

    has_next = True
    while has_next:
        # Wait for the page to settle, then read the current page number
        # from the "skip to page" input at the bottom of the list.
        time.sleep(5)
        cur_page = browser.find_element(
            By.XPATH,
            '//div[@id="J_bottomPage"]/span[@class="p-skip"]/input',
        ).get_attribute('value')
        print('------------------------- 当前页码 {} -------------------------'.format(cur_page))

        # Products are loaded as the page scrolls down, so scroll to the
        # vertical position of the "next page" link before collecting them.
        next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
        y = next_page.location['y']
        browser.execute_script('window.scrollTo(0, {})'.format(y))
        time.sleep(3)

        # Collect every product entry on the current page.
        p_list = browser.find_elements(By.CLASS_NAME, 'gl-item')
        for p in p_list:
            production = {}
            # The SKU is embedded in the class/id of the price and comment nodes.
            sku = p.get_attribute('data-sku')
            production['price'] = p.find_element(
                By.CSS_SELECTOR, 'strong.J_{}'.format(sku)).text
            production['name'] = p.find_element(
                By.CSS_SELECTOR, 'div.p-name>a>em').text
            production['comment'] = p.find_element(
                By.ID, 'J_comment_{}'.format(sku)).text
            production['shop'] = p.find_element(
                By.CSS_SELECTOR, 'div.p-shop>span>a').get_attribute('title')
            print(production)

        # Re-locate the "next page" link after scrolling instead of reusing
        # the element found earlier in this iteration.
        cur_next_page = browser.find_element(By.CLASS_NAME, 'pn-next')
        # A "disabled" class on the link marks the last page.
        if 'disabled' in cur_next_page.get_attribute('class'):
            has_next = False
        else:
            cur_next_page.click()
finally:
    # Always close the browser, even if a lookup above raised.
    browser.quit()
通过 sys.argv 使用命令行参数指定搜索关键词,例如:
python jd.py mac
补充
sys.argv[0] 是脚本的名称
sys.argv[1] 是传给脚本的第一个命令行参数(上例中为 mac)