"""Scrape Taobao search results for a fixed keyword with Selenium.

The script opens the Taobao home page in an Internet Explorer driver,
searches for the keyword, walks through every result page and appends each
product it finds to a local file as a JSON object.
"""
import json
import re

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Ie()
# Wait up to 20 seconds for each condition, polling every 0.2 seconds.
wait = WebDriverWait(driver, 20, 0.2)
url = "http://taobao.com"


def search(max_retries=3):
    """Run the keyword search and scrape the first result page.

    Args:
        max_retries: How many additional attempts to make after a wait
            times out. Values below zero are treated as zero.

    Returns:
        The text of the pager element that holds the total page count.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            driver.get(url)
            # The search box being present is used as the "page loaded" signal.
            search_box = wait.until(
                EC.presence_of_element_located((By.ID, "q"))
            )
            search_box.send_keys("美食")
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#J_TSearchForm > div.search-button > button")
                )
            )
            submit.click()
            # Pager element containing the total number of pages.
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # WebDriverWait raises selenium's TimeoutException, not the
            # builtin TimeoutError; retry a bounded number of times.
            if attempt == attempts - 1:
                raise


def next_page(page_number, max_retries=3):
    """Jump to ``page_number`` through the pager form and scrape that page.

    Args:
        page_number: The result page to navigate to.
        max_retries: How many additional attempts to make after a wait
            times out. Values below zero are treated as zero.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            # Page-number input box of the pager form.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            # "Confirm" button of the pager form.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form"
                     " > span.btn.J_Submit")
                )
            )
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # The jump is done once the highlighted pager item shows the
            # requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul"
                     " > li.item.active > span"),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            if attempt == attempts - 1:
                raise


def get_products():
    """Parse the current result page and save every product found on it."""
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    # Parse a snapshot of the rendered page with PyQuery.
    doc = pq(driver.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),  # image URL
            'price': item.find('.price').text(),  # product price
            # Sales count; the last three characters (a text suffix, per
            # the original author's note) are dropped.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find(".title").text(),  # product title
            'shop': item.find(".shop").text(),  # shop name
            'location': item.find(".location").text(),  # shop location
        }
        print(product)
        save_data(product)


def save_data(result):
    """Append ``result`` as JSON to the output file.

    Records are written back to back, each followed by a single space.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('淘宝商品信息', 'a+', encoding="utf-8") as f:
        f.write(json.dumps(result, ensure_ascii=False) + ' ')


def main():
    """Search, then visit every remaining result page, then quit the driver.

    Raises:
        ValueError: If no page count can be found in the pager text.
    """
    try:
        total_text = search()
        # Keep only the numeric page count from the pager text.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                "no page count found in pager text: %r" % (total_text,)
            )
        total = int(match.group(1))
        print(total)
        # Page 1 was already scraped by search().
        for i in range(2, total + 1):
            next_page(i)
    finally:
        # Always release the browser, even when scraping fails.
        driver.quit()


if __name__ == '__main__':
    main()