zoukankan      html  css  js  c++  java
  • selenium+pyquery爬取淘宝商品信息

    import re
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    
    
    # Search keyword substituted into the Taobao search URL by index_page().
    KEYWORD = '小米手机'
    # Last page number crawled by main(); pages 1..MAX_PAGE are visited.
    MAX_PAGE = 3
    
    # Browser driver (created at import time and shared by every function below)
    browser = webdriver.Chrome()
    # Shared explicit wait bound to the browser; 10 is the timeout passed to WebDriverWait.
    wait = WebDriverWait(browser,10)
    
    
    def get_products():
        """Parse the page currently loaded in the browser and print each product.

        Reads ``browser.page_source``, selects every product card under the
        search-result item list and prints one dict per card with the keys
        ``image``, ``price``, ``deal``, ``title``, ``shop`` and ``location``.

        :return: None; the products are only printed.
        """
        # Take a single snapshot of the rendered page and parse it once.
        doc = pq(browser.page_source)
        # Every matched element is one product card in the result grid.
        items = doc('#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item').items()
        for item in items:
            product = {
                # The image URL is read from the 'data-src' attribute.
                'image': item.find('.pic .img').attr('data-src'),
                'price': item.find('.price').text().strip(),
                'deal': item.find('.deal-cnt').text(),
                'title': item.find('.title').text(),
                'shop': item.find('.shop').text(),
                'location': item.find('.location').text(),
            }

            # The trailing '\n' escape leaves a blank line between products.
            print('--------{}----------\n'.format(product))
    
    
    def index_page(page):
        '''
        Load one page of search results and print its products.

        Opens the search URL for KEYWORD, jumps to ``page`` through the pager's
        input box when ``page`` is greater than 1, waits until the pager
        highlights that page number and at least one product card is present,
        then calls get_products().

        If any wait times out, the same page is retried by calling this
        function again with the same ``page``; there is no retry limit.

        :param page: 1-based page number to scrape
        :return: None
        '''
        print(10*'-','正在抓取第{}页'.format(page),10*'-')
        try:
            url = 'https://s.taobao.com/search?q={}'.format(KEYWORD)
            print(url)
            browser.get(url)
            # Anything other than the first page needs an explicit page jump.
            if page > 1:
                # Named page_input so the builtin input() is not shadowed.
                page_input = wait.until(EC.presence_of_element_located((
                    By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > input'
                )))
                submit = wait.until(EC.element_to_be_clickable((
                    By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'
                )))
                page_input.clear()
                page_input.send_keys(page)
                submit.click()
            # Wait until the highlighted pager entry shows the requested page.
            wait.until(EC.text_to_be_present_in_element((
                By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > ul > li.item.active > span'
            ),str(page)))
            # Wait until at least one product card is present in the DOM.
            wait.until(EC.presence_of_element_located((
                By.CSS_SELECTOR,'#mainsrp-itemlist .m-itemlist .grid.g-clearfix .item')
            ))
            get_products()
        except TimeoutException:
            # Retry the SAME page; the argument is required by the signature.
            index_page(page)
    
    
    def main():
        """Scrape result pages 1 through MAX_PAGE, in ascending order."""
        # Read the upper bound once, as the original range() call did.
        last_page = MAX_PAGE
        current = 1
        while current <= last_page:
            index_page(current)
            current += 1
  • 相关阅读:
    胖虎都看得懂的CSS入门
    Python-ORM之sqlalchemy的简单使用
    类似fabric主机管理demo
    Redis 数据库学习
    sublime 3插件安装记录
    斐波那契数列—java实现
    mysql基础操作记录
    [转]修改github已提交的用户名和邮箱
    python nose的html报告优化
    python report中文显示乱码
  • 原文地址:https://www.cnblogs.com/ray-mmss/p/9385927.html
Copyright © 2011-2022 走看看