# -*- coding:utf-8 -*-
# author : yesehngbao
# time:2018/3/29
"""Scrape Taobao search results with Selenium and store them in MongoDB.

The script searches taobao.com for a fixed keyword, walks every result
page through the pager's "jump to page" form, extracts the image URL and
price of each listed item with lxml, and inserts them into MongoDB.
"""
import logging
import re

import pymongo
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'test'
MONGO_COLL = 'selenum_tao'

# Seconds each explicit wait may block before Selenium raises a timeout.
WAIT_TIMEOUT = 10

logger = logging.getLogger(__name__)

# Shared browser session used by every function below; closed in main().
webdir = webdriver.Chrome()


def _wait():
    """Return a fresh explicit wait bound to the shared driver."""
    return WebDriverWait(webdir, WAIT_TIMEOUT)


def get_page_num():
    """Run the search and return the total number of result pages.

    Returns:
        The first run of digits found in the pager's "total" element,
        as a string.

    Raises:
        ValueError: if the pager text contains no digits.
    """
    webdir.get('http://www.taobao.com')
    search_box = _wait().until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
    button = _wait().until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
    search_box.clear()
    search_box.send_keys('衬衫')
    button.click()
    total_text = _wait().until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
    ).text
    # Raw string with an escaped ``\d``: a plain 'd+' only matches the
    # literal letter "d" and never finds the page count.
    match = re.search(r'\d+', total_text)
    if match is None:
        raise ValueError(
            'no page count found in pager text: %r' % (total_text,))
    return match.group(0)


def gain_page(page, retries=3):
    """Jump to result page *page* and wait until the pager shows it active.

    Args:
        page: page number to load.
        retries: maximum number of attempts (at least one is always made).

    Raises:
        WebDriverException: the error from the final attempt, if every
            attempt failed.
    """
    attempts = max(1, retries)
    last_error = None
    # A bounded loop instead of unbounded recursion: a persistent failure
    # now surfaces as the real Selenium error rather than a RecursionError.
    for attempt in range(1, attempts + 1):
        try:
            _wait().until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.items .item .pic a img')))
            page_box = _wait().until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '.J_Input')))
            button = _wait().until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.J_Submit')))
            page_box.clear()
            page_box.send_keys(page)
            button.click()
            _wait().until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page)))
            return
        except WebDriverException as exc:
            last_error = exc
            logger.warning('loading page %s failed (attempt %d/%d): %s',
                           page, attempt, attempts, exc)
    raise last_error


def get_page_html(page):
    """Return the driver's current page source, or None if *page* is falsy."""
    if page:
        return webdir.page_source
    return None


def analysis_page(html):
    """Yield ``{'img': ..., 'money': ...}`` for each item found in *html*.

    Matched elements that lack either an image ``data-src`` or a price
    are skipped instead of raising ``IndexError``; the item XPath is broad
    enough to match such elements too. Empty or unparseable HTML yields
    nothing.
    """
    if not html:
        return
    doc = etree.HTML(html)
    if doc is None:
        return
    div_list = doc.xpath(
        './/div[@class="items"]//div[contains(@class,"item")]')
    for div in div_list:
        images = div.xpath('.//div[@class="pic"]/a/img/@data-src')
        prices = div.xpath('.//div[contains(@class, "price")]/strong/text()')
        if not images or not prices:
            continue
        yield {
            'img': images[0],
            'money': prices[0],
        }


def save_mongo(content):
    """Insert *content* into the configured MongoDB collection.

    Args:
        content: a single document (dict) or any iterable of documents,
            including a generator.
    """
    documents = [content] if isinstance(content, dict) else list(content)
    # insert_many rejects an empty batch, so there is nothing to do.
    if not documents:
        return
    mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    try:
        coll = mongo_client[MONGO_DB][MONGO_COLL]
        # Collection.insert is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_many is the supported bulk insert.
        coll.insert_many(documents)
    finally:
        mongo_client.close()


def main():
    """Scrape every result page and store the extracted items."""
    try:
        page_num = get_page_num()
        for page in range(1, int(page_num) + 1):
            gain_page(page)
            html = get_page_html(page)
            content = analysis_page(html)
            save_mongo(content)
    finally:
        # Always release the browser process, even after a failure.
        webdir.quit()


if __name__ == '__main__':
    main()