zoukankan      html  css  js  c++  java
  • 使用Selenium爬取淘宝商品

    import pymongo
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from pyquery import PyQuery as pq
    from urllib.parse import quote
    
    # --- Crawl configuration -------------------------------------------------
    MONGO_URL = 'localhost'
    MONGO_DB = 'taobao'
    MONGO_COLLECTION = 'products'

    # Search keyword sent to Taobao.
    KEYWORD = 'ipad'

    # Last result page to crawl (pages are numbered from 1).
    MAX_PAGE = 100

    # Command-line switches for the legacy PhantomJS driver; only relevant if
    # the browser below is swapped for webdriver.PhantomJS(service_args=SERVICE_ARGS).
    SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

    # --- Shared runtime objects ----------------------------------------------
    # Headless Chrome. Use a plain webdriver.Chrome() instead to watch the crawl
    # in a visible window.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # `options=` replaces the deprecated `chrome_options=` keyword, which newer
    # Selenium releases no longer accept.
    browser = webdriver.Chrome(options=chrome_options)

    # Every explicit wait on the browser gives up after 10 seconds.
    wait = WebDriverWait(browser, 10)
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    
    
    def index_page(page, max_retries=3):
        """
        Load one page of the Taobao search results for KEYWORD and scrape it.

        The search URL is opened; for any page after the first, the page number
        is typed into the pager's jump box and submitted. Once the pager
        highlights the requested page and at least one item is present,
        get_products() is called on the loaded page.

        :param page: page number to crawl (the first page is 1)
        :param max_retries: number of extra attempts made after a
            TimeoutException before the page is skipped
        """
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        # A bounded loop replaces the former unbounded self-recursion, which
        # ended in RecursionError when a page never finished loading.
        for _ in range(max_retries + 1):
            print('正在爬取第', page, '')
            try:
                browser.get(url)
                if page > 1:
                    # Jump box and confirm button of the pager.
                    page_input = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                    submit = wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                    page_input.clear()
                    page_input.send_keys(page)
                    submit.click()
                # The highlighted pager entry must show the requested page number.
                wait.until(
                    EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
                # At least one product node must be present before parsing.
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
                get_products()
                return
            except TimeoutException:
                # Fall through to the next attempt.
                continue
        # All attempts timed out: report and skip this page instead of looping forever.
        print('第', page, '页多次超时,已跳过')
    
    
    def get_products():
        """
        Parse the page currently loaded in the browser and store every product.

        Each product is printed and then handed to save_to_mongo().
        """
        # (field name, CSS selector) pairs whose node text is copied verbatim.
        text_fields = (
            ('price', '.price'),
            ('deal', '.deal-cnt'),
            ('title', '.title'),
            ('shop', '.shop'),
            ('location', '.location'),
        )
        doc = pq(browser.page_source)
        for node in doc('#mainsrp-itemlist .items .item').items():
            # The image URL is read from the data-src attribute, not from node text.
            product = {'image': node.find('.pic .img').attr('data-src')}
            for field, selector in text_fields:
                product[field] = node.find(selector).text()
            print(product)
            save_to_mongo(product)
    
    
    def save_to_mongo(result):
        """
        Store one scraped product in MongoDB.

        The document is written to ``db[MONGO_COLLECTION]``. A failed write is
        reported on stdout and otherwise ignored, so one bad insert does not
        stop the crawl.

        :param result: product dict to insert
        """
        try:
            # insert_one() replaces Collection.insert(), which is deprecated in
            # PyMongo 3 and no longer available in PyMongo 4.
            db[MONGO_COLLECTION].insert_one(result)
        except Exception:
            print('存储到MongoDB失败')
        else:
            print('存储到MongoDB成功')
    
    
    def main():
        """
        Crawl result pages 1 through MAX_PAGE, then shut the browser down.
        """
        try:
            for page in range(1, MAX_PAGE + 1):
                index_page(page)
        finally:
            # quit() ends the whole WebDriver session, and the finally clause
            # releases the browser even if a page raises.
            browser.quit()


    if __name__ == '__main__':
        main()
    View Code
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    from urllib.parse import quote
    from pyquery import PyQuery
    from pymongo import MongoClient
    
    # Search keyword sent to Taobao.
    KEYWORD = 'iPad'

    # Chrome instance shared by the functions below; every explicit wait on it
    # gives up after 10 seconds.
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 10)
    
    
    def index_page(page, max_retries=3):
        """
        Load one page of the Taobao search results for KEYWORD and scrape it.

        :param page: page number to crawl (the first page is 1)
        :param max_retries: number of extra attempts made after a
            TimeoutException before the page is skipped
        """
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        # A bounded loop replaces the former unbounded self-recursion, which
        # ended in RecursionError when a page never finished loading.
        for _ in range(max_retries + 1):
            print('正在爬取第', page, '')
            try:
                browser.get(url)
                if page > 1:
                    # Page-number input box: the <input> that is a direct child
                    # of div.form inside the element with id "mainsrp-pager".
                    # (The selector previously read "#mainsrp-page", which does
                    # not match the pager id used by the other selectors here.)
                    page_input = wait.until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, '#mainsrp-pager div.form > input')
                        )
                    )
                    # Confirm button: the span with classes "btn J_Submit" that
                    # is a direct child of div.form inside the same pager.
                    submit = wait.until(
                        EC.element_to_be_clickable(
                            (By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')
                        )
                    )
                    page_input.clear()             # empty the input box
                    page_input.send_keys(page)     # type the target page number
                    submit.click()                 # press the confirm button
                # Wait until the highlighted pager entry (li with classes
                # "item active") shows the requested page number in its span.
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)
                    )
                )
                # Wait until at least one product node (.m-itemlist .items .item)
                # is present on the page.
                wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '.m-itemlist .items .item')
                    )
                )
                get_products()
                return
            except TimeoutException:
                # Fall through to the next attempt.
                continue
        # All attempts timed out: report and skip this page instead of looping forever.
        print('第', page, '页多次超时,已跳过')
    
    
    def get_products():
        """
        Parse the page currently loaded in the browser and store every product.

        Each product is printed and then handed to save_to_mongo().
        """
        html = browser.page_source                          # current page source
        doc = PyQuery(html)                                 # parse the HTML
        items = doc('#mainsrp-itemlist .items .item').items()
        for item in items:
            product = {
                # Inside .pic, the node with classes "J_ItemPic img"; the image
                # URL is read from its data-src attribute.
                'image': item.find('.pic .J_ItemPic.img').attr('data-src'),
                # Text of the node with classes "price g_price g_price-highlight".
                'price': item.find('.price.g_price.g_price-highlight').text(),
                # Text of the .deal-cnt node (number of buyers).
                'deal': item.find('.deal-cnt').text(),
                # Text of the node with classes "row row-2 title" (product title).
                # The leading dot is required: without it the selector looks for
                # a <row> tag and the title always comes back empty.
                'title': item.find('.row.row-2.title').text(),
                # Text of the .shop node (shop name).
                'shop': item.find('.shop').text(),
                # Text of the .location node (shop location).
                'location': item.find('.location').text()
            }
            print(product)
            save_to_mongo(product)
    
    
    # MongoDB target: collection, database and host used to store the products.
    MONGO_COLLECTION = 'products'
    MONGO_DB = 'TaoBao'
    MONGO_URL = 'localhost'

    # One client and database handle for the whole script.
    client = MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    
    
    def save_to_mongo(result):
        """
        Store one scraped product in MongoDB.

        The document is written to ``db[MONGO_COLLECTION]``. A failed write is
        reported on stdout and otherwise ignored, so one bad insert does not
        stop the crawl.

        :param result: product dict to insert
        :return: None
        """
        try:
            # insert_one() replaces Collection.insert(), which is deprecated in
            # PyMongo 3 and no longer available in PyMongo 4.
            db[MONGO_COLLECTION].insert_one(result)
        except Exception:
            print('存储到 MongoDB 失败')
        else:
            print('存储到 MongoDB 成功')
    
    
    # Last result page to crawl (pages are numbered from 1).
    MAX_PAGE = 100


    def main():
        """
        Crawl result pages 1 through MAX_PAGE, then shut the browser down.

        :return: None
        """
        try:
            for page in range(1, MAX_PAGE + 1):
                index_page(page)
        finally:
            # quit() ends the whole WebDriver session, and the finally clause
            # releases the browser even if a page raises.
            browser.quit()


    if __name__ == '__main__':
        main()
    与上面的一样
  • 相关阅读:
    LeetCode Count of Range Sum
    LeetCode 158. Read N Characters Given Read4 II
    LeetCode 157. Read N Characters Given Read4
    LeetCode 317. Shortest Distance from All Buildings
    LeetCode Smallest Rectangle Enclosing Black Pixels
    LeetCode 315. Count of Smaller Numbers After Self
    LeetCode 332. Reconstruct Itinerary
    LeetCode 310. Minimum Height Trees
    LeetCode 163. Missing Ranges
    LeetCode Verify Preorder Serialization of a Binary Tree
  • 原文地址:https://www.cnblogs.com/liyihua/p/11230984.html
Copyright © 2011-2022 走看看