zoukankan      html  css  js  c++  java
  • 使用selenium模拟浏览器抓取淘宝信息

    通过Selenium模拟浏览器抓取淘宝商品美食信息,并存储到MongoDB数据库中。

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    import re
    import json
    from config import *
    import pymongo
    # Module-level setup: these statements run as soon as the file is executed.
    # MONGO_URL and MONGO_DB are not defined in this file; presumably they are
    # provided by the `from config import *` above -- TODO confirm.
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    # Single Firefox session shared by the scraping functions below.
    browser = webdriver.Firefox()
    # Shared explicit wait bound to that browser, created with a timeout of 10.
    wait = WebDriverWait(browser,10)
    
    def search():
        """Open Taobao, search for '美食' and scrape the first result page.

        Waits for the search box and the search button, submits the keyword,
        waits for the pager's ``div.total`` element and then calls
        get_products() on the loaded page.

        Returns:
            The text of the pager's ``div.total`` element.

        If any wait raises TimeoutException the whole sequence (including
        reloading the home page) is started again. The retry is a loop rather
        than a recursive call so that repeated timeouts do not deepen the call
        stack and eventually end in RecursionError.
        """
        while True:
            try:
                browser.get('https://www.taobao.com')
                # Named search_box (not `input`) to avoid shadowing the builtin.
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
                )
                submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#J_TSearchForm > div.search-button > button')))
                search_box.send_keys('美食')
                submit.click()
                total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.total')))
                get_products()
                return total.text
            except TimeoutException:
                # Retry from the top; only timeouts are retried, any other
                # exception propagates to the caller.
                continue
    
    def next_page(page_number):
        """Jump to result page ``page_number`` and scrape its products.

        Types the page number into the pager's input box, clicks the pager's
        submit button, waits until the highlighted pager item shows
        ``str(page_number)`` and then calls get_products().

        Args:
            page_number: Page to jump to; it is sent to the input box as-is and
                compared against the active pager item as a string.

        If any wait raises TimeoutException the whole sequence is attempted
        again. The retry is a loop rather than a recursive call so that
        repeated timeouts do not deepen the call stack.
        """
        while True:
            try:
                # Named page_input (not `input`) to avoid shadowing the builtin.
                page_input = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
                )
                submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                page_input.clear()
                page_input.send_keys(page_number)
                submit.click()
                # Confirms the jump actually happened before scraping.
                wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'),str(page_number)))
                get_products()
                return None
            except TimeoutException:
                # Retry the same page; other exceptions propagate.
                continue
    
    # def write_to_file(content):
    #     with open('E:/python/Projects/test1/result.txt','a',encoding='utf-8') as f:
    #         f.write(json.dumps(content,ensure_ascii=False) + '\n')
    #         f.close()
    
    def get_products():
        """Parse the current result page and store every product found on it.

        Waits until at least one product element is present, parses the
        browser's page source with PyQuery, builds one dict per product,
        prints it and passes it to save_to_mongo().
        """
        item_selector = '#mainsrp-itemlist .items .item'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

        page = pq(browser.page_source)
        for entry in page(item_selector).items():
            image = entry.find('.pic .img').attr('src')
            price = entry.find('.price').text()
            # Drops the last three characters of the deal-count text
            # (presumably a fixed textual suffix -- TODO confirm).
            deal = entry.find('.deal-cnt').text()[:-3]
            title = entry.find('.title').text()
            shop = entry.find('.shop').text()
            location = entry.find('.location').text()

            product = {
                'image': image,
                'price': price,
                'deal': deal,
                'title': title,
                'shop': shop,
                'location': location,
            }
            print(product)
            save_to_mongo(product)
    def save_to_mongo(result):
        """Insert one product document into the MONGO_TABLE collection.

        Args:
            result: The document (dict) to store.

        Storage is best-effort: any exception raised by the insert is reported
        on stdout together with the document, and is not re-raised, so one
        failed insert does not stop the crawl.
        """
        try:
            # insert_one replaces Collection.insert, which is deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            db[MONGO_TABLE].insert_one(result)
        except Exception as exc:
            # Include the error so a failure can actually be diagnosed.
            print('存储到MONGODB失败',result,exc)
        else:
            print('存储到MONGODB成功',result)
    
    def main():
        """Scrape every result page of the search, then close the browser.

        Calls search() for the first page, extracts the first run of digits
        from the text it returns as the total page count, and calls
        next_page() for pages 2 through that count.

        Raises:
            ValueError: If the text returned by search() contains no digits.
        """
        try:
            total = search()
            # Raw string with \d: the pattern must match digits. Without the
            # backslash it matches the letter "d" and finds nothing.
            match = re.search(r'(\d+)', total)
            if match is None:
                raise ValueError('no page count found in pager text: %r' % (total,))
            page_count = int(match.group(1))
            for i in range(2, page_count + 1):
                next_page(i)
        finally:
            # Close the browser window even when scraping fails part-way.
            browser.close()
    if __name__ == '__main__':
        main()
  • 相关阅读:
    417 Pacific Atlantic Water Flow 太平洋大西洋水流
    416 Partition Equal Subset Sum 分割相同子集和
    415 Add Strings 字符串相加
    414 Third Maximum Number 第三大的数
    413 Arithmetic Slices 等差数列划分
    412 Fizz Buzz
    410 Split Array Largest Sum 分割数组的最大值
    409 Longest Palindrome 最长回文串
    day22 collection 模块 (顺便对比queue也学习了一下队列)
    day21 计算器作业
  • 原文地址:https://www.cnblogs.com/ls-pankong/p/12287012.html
Copyright © 2011-2022 走看看