引入相关的包
import os
from urllib.parse import unquote
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
使用 selenium 启动 Chrome 浏览器
# Entry URL of the Taobao search landing page; it is passed to get_driver,
# which opens it in the browser.
url = 'https://uland.taobao.com/sem/tbsearch?'
def get_driver(url, keyword='华为手机', driver_path='./chromedriver.exe'):
    """Start headless Chrome, open *url* and submit a search for *keyword*.

    Args:
        url: Address of the search landing page to open.
        keyword: Text typed into the search box (element id ``J_search_key``).
        driver_path: Path to the chromedriver executable.

    Returns:
        The Chrome WebDriver, positioned on the page reached after clicking
        the ``.submit`` button.

    Raises:
        Any exception raised by Selenium while navigating or searching; the
        browser is shut down before the exception propagates.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run without a visible window

    # Hide the "controlled by automated test software" banner. The old
    # '--disable-infobars' switch no longer works in recent Chrome versions,
    # so the two experimental options below are used instead.
    options.add_experimental_option("useAutomationExtension", False)
    options.add_experimental_option("excludeSwitches", ['enable-automation'])

    # Disable image loading.
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,
        },
    }
    options.add_experimental_option('prefs', prefs)

    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a `Service` object; it is kept here for compatibility with the Selenium
    # version this script was written for -- confirm against the installed one.
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    try:
        driver.get(url)
        # Use the generic find_element(By, ...) API: the find_element_by_*
        # helpers no longer exist in current Selenium releases.
        search_box = driver.find_element(By.ID, 'J_search_key')
        search_box.send_keys(keyword)
        driver.find_element(By.CSS_SELECTOR, '.submit').click()
    except Exception:
        # Do not leave an orphaned browser process behind on failure.
        driver.quit()
        raise
    return driver
# Launch the browser and run the initial search as soon as this module is
# executed; the resulting WebDriver is kept in the module-level name `driver`.
driver = get_driver(url)
获取商品信息
def get_products(driver):
    """Scrape every product on the current result page and persist the rows.

    Each ``<li>`` under ``ul.pc-search-items-list`` yields one row with the
    columns ``title``, ``price``, ``store``, ``sell`` and ``store_link``. The
    rows are handed to ``save_df`` as a DataFrame.

    Args:
        driver: WebDriver currently showing a search result page.
    """
    def text_of(node, xpath):
        # find_element(By.XPATH, ...) replaces the find_element_by_xpath
        # helper, which no longer exists in current Selenium releases.
        return node.find_element(By.XPATH, xpath).text

    items = driver.find_elements(By.XPATH, "//div//ul[@class='pc-search-items-list']/li")
    data = []
    for item in items:
        title = text_of(item, ".//div/span[@class='title-text']")
        price = text_of(item, ".//div/span[@class='coupon-price-afterCoupon']")
        store = text_of(item, ".//div/div[@class='seller-name']")
        # Keep only the last space-separated token of the sell-info text.
        sell = text_of(item, ".//div/div[@class='sell-info']").split(' ')[-1]
        # href of the anchor that is a direct child of the list item.
        store_link = item.find_element(By.XPATH, "./a").get_attribute('href')
        data.append([title, price, store, sell, store_link])
    df = pd.DataFrame(data, columns=['title', 'price', 'store', 'sell', 'store_link'])
    save_df(df)
结果保存为csv
def save_df(df=None, path='./phone.csv'):
    """Append *df* to the CSV file at *path*, creating the file if needed.

    When the file already exists its rows are read back, the new rows are
    appended after them, and the whole table is rewritten with a fresh
    0..n-1 index so the index column never contains duplicate labels.

    Args:
        df: DataFrame with the rows to store. ``None`` means "nothing to
            save" and leaves the file untouched.
        path: Destination CSV path (UTF-8 encoded, index in first column).
    """
    if df is None:
        # The default argument used to crash on `None.to_csv`; treat it as
        # an explicit no-op instead.
        return
    if os.path.exists(path):
        existing = pd.read_csv(path, encoding='utf8', index_col=0)
        # ignore_index renumbers the rows; otherwise every appended page
        # would restart its index at 0.
        df = pd.concat([existing, df], ignore_index=True)
    df.to_csv(path, encoding='utf8')
获取下一页
def get_next_page(driver):
    """Click the "next page" control of the result pager.

    Args:
        driver: WebDriver currently showing a search result page.

    Raises:
        Whatever Selenium raises when the next-page element cannot be found
        within the implicit wait or cannot be clicked.
    """
    # Allow up to 10 seconds for elements to appear before a lookup fails.
    driver.implicitly_wait(10)
    # find_element(By.XPATH, ...) replaces the find_element_by_xpath helper,
    # which no longer exists in current Selenium releases.
    next_btn = driver.find_element(
        By.XPATH,
        ".//span[@class='pc-search-page-item pc-search-page-item-after J_page-nav-item']")
    next_btn.click()
循环获取所有信息
def _page_number(driver):
    """Return the integer after the last '=' of the current URL, or None."""
    tail = unquote(driver.current_url).split('=')[-1]
    try:
        return int(tail)
    except ValueError:
        return None


def get_all_pages(driver, maxPage=4):
    """Scrape result pages one after another until *maxPage* is passed.

    The page number is taken from the text following the last ``=`` in the
    browser's current URL. If the starting URL carries no such number, the
    page is treated as page 0.

    Args:
        driver: WebDriver currently showing the first result page to scrape.
        maxPage: Highest page number (as read from the URL) that is still
            scraped.
    """
    current = _page_number(driver)
    if current is None:
        current = 0
    while current <= maxPage:
        get_products(driver)
        get_next_page(driver)
        driver.implicitly_wait(10)
        following = _page_number(driver)
        # Stop instead of crashing on an unparseable URL, and instead of
        # looping forever when the pager did not move forward.
        if following is None or following <= current:
            break
        current = following
结果: