zoukankan      html  css  js  c++  java
  • 使用Pyquery+selenium抓取淘宝商品信息

    配置文件,配置好数据库名称,表名称,要搜索的产品类目,要爬取的页数

    # What to scrape: the search keyword typed into the site's search box and
    # the upper bound on the number of result pages to walk through.
    KEYWORD = "手机"
    MAXPAGE = 5

    # Where to store results: MongoDB host, database name and collection name.
    MONGO_URL = "localhost"
    MONGO_DB = "taobao"
    MONGO_TABLE = "phone"

    # Command-line switches handed to the PhantomJS driver.
    SERVICE_ARGS = [
        "--disk-cache=true",  # enable PhantomJS's disk cache
        "--load-images=false",  # do not download images while browsing
    ]

    主程序

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Date    : 2018-06-14 22:02:26
    # @Author  : Chenjun (320316430@qq.com;)
    # @Link    : http://example.org
    # @Version : $Id$
    import re
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from pyquery import PyQuery as pq
    from config import *
    import pymongo  # results are stored in MongoDB; pymongo is the Python client library for it

    # Module-level shared state used by every function below. The order matters:
    # `db` is derived from `client`, and `wait` is bound to `browser`.
    client = pymongo.MongoClient(MONGO_URL)
    db = client[MONGO_DB]
    browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)  # headless PhantomJS browser, configured through SERVICE_ARGS
    browser.set_window_size(1400, 900)
    wait = WebDriverWait(browser, 10)  # explicit wait: allow up to 10 seconds for page elements to load
    # Run the keyword search and scrape the first page of results
    def search(max_retries=3):
        """Search Taobao for KEYWORD, scrape page 1 and return the pager text.

        Opens the Taobao home page, types KEYWORD into the search box, submits
        the form, reads the "total pages" element of the result pager and then
        scrapes the products on the first result page via get_products().

        Args:
            max_retries: How many additional attempts to make after a
                TimeoutException before giving up. Negative values are
                treated as 0.

        Returns:
            The text of the pager's "total" element (the caller extracts the
            page count from it).

        Raises:
            TimeoutException: If every attempt timed out. The original code
                retried by unbounded recursion, which ended in RecursionError
                when the page never loaded.
        """
        attempts = max(max_retries, 0) + 1
        for attempt in range(1, attempts + 1):
            print('正在搜索...')
            try:
                browser.get('https://www.taobao.com')
                # Wait until the search input exists in the DOM.
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
                )
                # Wait until the search button can be clicked.
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
                search_box.send_keys(KEYWORD)  # simulate the user typing
                submit.click()  # simulate the user clicking
                # Read the pager BEFORE scraping: if this wait times out the
                # attempt is retried, and scraping first would store page 1
                # again on every retry.
                total = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
                total_text = total.text
                get_products()
                return total_text
            except TimeoutException:
                if attempt == attempts:
                    raise
    def next_page(page_number, max_retries=3):
        """Jump to result page `page_number` and scrape its products.

        Types the page number into the pager's input box, clicks the jump
        button, waits until the pager highlights that page number as active
        and then calls get_products().

        Args:
            page_number: The result page to navigate to.
            max_retries: How many additional attempts to make after a
                TimeoutException before giving up. Negative values are
                treated as 0.

        Raises:
            TimeoutException: If every attempt timed out. The original code
                retried by unbounded recursion, which ended in RecursionError
                when the page never loaded.
        """
        attempts = max(max_retries, 0) + 1
        for attempt in range(1, attempts + 1):
            print('正在翻页...')
            try:
                # Wait until the page-number input exists in the DOM.
                page_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
                )
                # Wait until the jump button can be clicked.
                submit = wait.until(EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
                page_box.clear()  # remove the current page number
                page_box.send_keys(page_number)  # type the new page number
                submit.click()
                # The jump is complete once the active pager item shows the
                # requested page number.
                wait.until(EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                    str(page_number)))
                get_products()
                return
            except TimeoutException:
                if attempt == attempts:
                    raise
    # Scrape the details of every product on the current result page
    def get_products():
        """Parse the current result page and store each product in MongoDB.

        Waits until at least one product element is present, parses the
        browser's page source with PyQuery and passes one dict per product
        (image, price, deal, title, shop, location) to save_to_mongo().

        Raises:
            TimeoutException: If no product element appears within the wait
                timeout.
        """
        # Wait until the product list has been rendered.
        wait.until(EC.presence_of_element_located((
            By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        doc = pq(browser.page_source)  # parse the current DOM
        items = doc('#mainsrp-itemlist .items .item').items()
        for count, item in enumerate(items, start=1):
            product = {
                'image': item.find('.pic .img').attr('src'),
                'price': item.find('.price').text(),
                # Drop the last three characters of the deal text
                # (presumably the "人付款" suffix - confirm against the page).
                'deal': item.find('.deal-cnt').text()[:-3],
                'title': item.find('.title').text(),
                'shop': item.find('.shop').text(),
                'location': item.find('.location').text(),
            }
            save_to_mongo(product, count)


    def save_to_mongo(result, count):
        """Insert one product dict into the MONGO_TABLE collection.

        Storage is best-effort: a failure is reported on stdout and does not
        interrupt the scrape.

        Args:
            result: The product dict to store.
            count: 1-based position of the product on its page; only used in
                the progress message.
        """
        try:
            # insert_one replaces Collection.insert, which is deprecated and
            # no longer exists in PyMongo 4.
            db[MONGO_TABLE].insert_one(result)
            print(f'存储{count}到了MONGODB成功')
        except Exception as exc:
            print('存储失败', repr(exc))


    def parse_total_pages(text):
        """Return the first integer found in the pager's "total" text.

        Args:
            text: Text of the pager's total element.

        Returns:
            The first run of digits in `text`, as an int.

        Raises:
            ValueError: If `text` contains no digits.
        """
        # The original pattern was '(d+)', which matches the letter "d"
        # instead of digits, so the page count could never be extracted.
        match = re.search(r'(\d+)', text)
        if match is None:
            raise ValueError(f'no page count found in {text!r}')
        return int(match.group(1))


    def main():
        """Run the scrape: search, then walk result pages 2..min(total, MAXPAGE).

        Any error is reported on stdout; the browser is always shut down.
        """
        try:
            # Page 1 is scraped by search(); never go beyond MAXPAGE pages.
            total = min(parse_total_pages(search()), MAXPAGE)
            for page in range(2, total + 1):
                next_page(page)
        except Exception as exc:
            print('出错啦!', repr(exc))
        finally:
            # quit() ends the whole driver session; close() only closes the
            # current window and leaves the PhantomJS process running.
            browser.quit()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    面向对象之多态,property
    描述符
    day23 面向对象之继承
    day22面向对象
    os模块
    logging日志模块,四种方式
    Linux 如何测试 IO 性能(磁盘读写速度)
    Vi命令:如何删除全部内容
    cdnbest如何查看站点操作日志(同步日志)
    Linux查找含有某字符串的所有文件
  • 原文地址:https://www.cnblogs.com/tarantino/p/9188972.html
Copyright © 2011-2022 走看看