zoukankan      html  css  js  c++  java
  • 淘宝爬虫

     1 from selenium import webdriver
     2 from selenium.webdriver.common.by import By
     3 from selenium.webdriver.support import expected_conditions as EC
     4 from selenium.webdriver.support.wait import WebDriverWait
     5 from selenium.common.exceptions import TimeoutException
     6 from pyquery import PyQuery as pq
     7 import re
     8 from config import *
     9 import pymongo
    10 
    # MongoDB handles shared by save_to_mongo(); the connection target and
    # database name come from config.py (MONGO_URL, MONGO_DB).
    client = pymongo.MongoClient(host=MONGO_URL)
    db = client[MONGO_DB]

    # A single Chrome session is reused by every scraping function below.
    browser = webdriver.Chrome()

    # Explicit-wait helper bound to that session, with a 10-second timeout.
    wait = WebDriverWait(driver=browser, timeout=10)
    16 
    17 
    def search(keyword='xiaomi', max_retries=5):
        """Search Taobao for *keyword* and scrape the first result page.

        Loads the Taobao home page, types *keyword* into the search box,
        submits the form, waits for the pager's "total" element to appear,
        and calls get_products() for the first page of results.

        Args:
            keyword: Text typed into the search box. Defaults to 'xiaomi',
                the value that used to be hard-coded.
            max_retries: How many times the whole sequence is retried after
                a TimeoutException before giving up.

        Returns:
            The text of the pager's "total" element (main() extracts the
            page count from it).

        Raises:
            TimeoutException: if the sequence still times out after
                *max_retries* retries.
        """
        retries_left = max_retries
        # Retry in a loop rather than by recursion, so a page that keeps
        # timing out cannot grow the call stack without bound.
        while True:
            try:
                browser.get('https://www.taobao.com')
                input_ = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
                )
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button'))
                )

                input_.send_keys(keyword)
                submit.click()

                total = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total'))
                )
                get_products()
                return total.text
            except TimeoutException:
                if retries_left <= 0:
                    raise
                retries_left -= 1
    38 
    def next_page(page_num, max_retries=5):
        """Jump to result page *page_num* via the pager form and scrape it.

        Types *page_num* into the pager's input box, clicks its submit
        button, waits until the pager's active item shows str(page_num),
        and then calls get_products().

        Args:
            page_num: Page number to jump to.
            max_retries: How many times the jump is retried after a
                TimeoutException before giving up.

        Raises:
            TimeoutException: if the jump still times out after
                *max_retries* retries.
        """
        retries_left = max_retries
        # Retry in a loop rather than by recursion, so a page that keeps
        # timing out cannot grow the call stack without bound.
        while True:
            try:
                input_ = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
                )
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
                )
                # Clear whatever page number the box currently holds.
                input_.clear()
                input_.send_keys(page_num)
                submit.click()
                # The jump is finished once the highlighted pager item
                # shows the requested number.
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_num)))
                get_products()
                return
            except TimeoutException:
                if retries_left <= 0:
                    raise
                retries_left -= 1
    55 
    def get_products():
        """Parse the current result page and store every listed product.

        Waits until at least one result item is present, parses the
        browser's page source with PyQuery, and for each item builds a dict
        of image/price/deal/title/shop/location, prints it, and passes it to
        save_to_mongo().
        """
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        page = pq(browser.page_source)
        for node in page('#mainsrp-itemlist .items .item').items():
            image_src = node.find('.pic .img').attr('src')
            price_text = node.find('.price').text()
            # The last three characters of the deal text are dropped
            # (presumably a fixed trailing label - confirm against the page).
            deal_text = node.find('.deal-cnt').text()[:-3]
            title_text = node.find('.title').text()
            shop_text = node.find('.shop').text()
            location_text = node.find('.location').text()
            product = {
                'image': image_src,
                'price': price_text,
                'deal': deal_text,
                'title': title_text,
                'shop': shop_text,
                'location': location_text,
            }
            print(product)
            save_to_mongo(product)
    73 
    def save_to_mongo(result):
        """Insert one product document into the MONGO_TABLE collection.

        Best-effort: a failure is reported on stdout and swallowed so that
        one bad document does not stop the crawl.

        Args:
            result: The product dict to store.
        """
        try:
            # insert_one() replaces Collection.insert(), which was deprecated
            # in PyMongo 3 and removed in PyMongo 4.
            db[MONGO_TABLE].insert_one(result)
            print('success save to mongodb', result)
        except Exception as exc:
            # Include the exception so the reason for the failure is visible.
            print('error to mongo', exc)
    80 
    def main():
        """Run the crawl: search, read the page count, then visit every page.

        search() scrapes page 1 and returns the pager's "total" text; the
        first run of digits in that text is taken as the number of result
        pages, and next_page() is called for pages 2 through that number
        inclusive. The browser window is closed when the crawl ends, whether
        it succeeded or raised.

        Raises:
            ValueError: if the pager text contains no digits.
        """
        try:
            total_text = search()
            # Raw string with an escaped \d: the previous pattern '(d+)'
            # matched the literal letter "d" and never found the page count.
            match = re.search(r'(\d+)', total_text)
            if match is None:
                raise ValueError(
                    'no page count found in pager text: %r' % (total_text,))
            total = int(match.group(1))
            # Page 1 was already scraped by search(); the upper bound is
            # total + 1 so the last page is visited too.
            for i in range(2, total + 1):
                next_page(i)
        finally:
            browser.close()


    if __name__ == '__main__':
        main()

     config.py

    # Passed to pymongo.MongoClient() by the scraper.
    MONGO_URL = "localhost"
    # Name of the database the scraper selects on the client.
    MONGO_DB = "taobao"
    # Name of the collection product documents are inserted into.
    MONGO_TABLE = "product"

     

    运行结果:

     数据库:

     

  • 相关阅读:
    泛型程序设计详解(一)
    面向对象三大特性-----封装、继承、多态
    委托与事件-委托事件案例(三)
    委托与事件-事件详解(二)
    委托与事件-委托详解(一)
    抽象类及接口详解
    基础类型详解下
    C#类型详解
    【JVM】-- JVM内存结构
    【redis】-- redis的持久化(作为数据库)
  • 原文地址:https://www.cnblogs.com/MC-Curry/p/9338906.html
Copyright © 2011-2022 走看看