zoukankan      html  css  js  c++  java
  • selenium模拟浏览器爬取淘宝产品信息

     1 from selenium import webdriver
     2 from selenium.webdriver.common.by import By
     3 from selenium.webdriver.support.ui import WebDriverWait
     4 from selenium.webdriver.support import expected_conditions as EC
     5 from selenium.common.exceptions import TimeoutException
     6 import re
     7 from pyquery import PyQuery
     8 from day01.config import *
     9 import pymongo
    10 client = pymongo.MongoClient(MONGO_URL) #连接mongodb
    11 db = client[MONGO_DB]
    12 
    13 browser = webdriver.Chrome()
    14 wait = WebDriverWait(browser,10)
    15 
    16 def search():
    17     try:
    18         browser.get("https://www.taobao.com")
    19         # 输入框
    20         input_box = wait.until(
    21             EC.presence_of_element_located((By.CSS_SELECTOR,"#q"))
    22         )
    23         submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#J_TSearchForm > div.search-button > button")))
    24         input_box.send_keys("美食")
    25         submit.click()
    26         login = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#fm-login-id")))
    27         if login is not None:
    28             login.send_keys("********")
    29             password = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#fm-login-password")))
    30             password.send_keys("*********")
    31             login_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#login-form > div.fm-btn > button")))
    32             login_button.click()
    33         else:
    34             pass
    35         total_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#mainsrp-pager > div > div > div > div.total")))
    36         get_products()
    37         return total_page.text
    38     except TimeoutException:
    39         return search()
    40     # finally:
    41     #     browser.quit()
    42 
    43 def next_page(page_number):
    44     "操作翻页"
    45     try:
    46         input_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#mainsrp-pager > div > div > div > div.form > input")))
    47         confirm_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
    48         input_page.clear()
    49         input_page.send_keys(page_number)
    50         confirm_button.click()
    51         # 判断页码数是否在当前页,用来判断元素中存在指定文本的
    52         wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,"#mainsrp-pager > div > div > div > ul > li.item.active > span"),str(page_number)))
    53         get_products()
    54     except TimeoutException:
    55         next_page(page_number)
    56 
    57 def get_products():
    58     "获取产品信息"
    59     wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#mainsrp-itemlist .items .item")))
    60     html = browser.page_source #可以返回网页源码
    61     doc = PyQuery(html) #使用pyquery解析网页
    62     items = doc('#mainsrp-itemlist .items .item').items()
    63     for item in items:
    64         product = {
    65             'image':item.find('.pic .img').attr('src'),#获取标签属性
    66             'price':item.find('.price').text(), #价格
    67             'deal':item.find('.deal-cnt').text()[:-3], #成交量
    68             'title':item.find('.title').text(),
    69             'shop':item.find('.shop').text(),
    70             'location':item.find('.location').text()
    71         }
    72         # print(product)
    73         save_to_mongo(product)
    74         # from day01.connectMongo import ConnectMongo
    75         # con = ConnectMongo()
    76         # con.insert_one_data(product,"table")
    77 
    78 def save_to_mongo(result):
    79     try:
    80         if db[MONGO_TABLE].insert(result):
    81             print("存储到mongodb成功")
    82     except Exception as e:
    83         print("存储到mongodb异常,%s"%e)
    84 
    85 
    86 def main():
    87     result = search()
    88     total = int(re.compile("(d+)").search(result).group(1))
    89     for i in range(2,total+1):
    90         next_page(i)
    91 
    92 if __name__ == '__main__':
    93     main()
  • 相关阅读:
    mysql 关联关系
    Powershell
    判断Server Manager里面的Role是否已经安排
    Powershell 获取文件版本信息
    PowerShell---Operators 介绍
    C#代码覆盖率 -vsinstr和OpenCover
    敏捷测试介绍
    c#中abstract、override、new、virtual、sealed使用
    装箱和拆箱
    Code Review
  • 原文地址:https://www.cnblogs.com/yzmPython/p/14184494.html
Copyright © 2011-2022 走看看