zoukankan      html  css  js  c++  java
  • Selenium+Chrome+PhantomJS爬取淘宝美食

    搜索关键字

    利用Selenium驱动浏览器搜索关键字,得到查询后的商品列表

    分析页码并翻页

    得到商品页码数,模拟翻页,得到后续页面的商品列表

    分析提取商品内容

    利用PyQuery分析源码,解析得到商品列表

    存储至MongoDB

    将商品列表信息存储到数据库MongoDB

     

    代码如下:

    import re
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from config import *
    import pymongo
    
    
    # Shared MongoDB handle. MONGO_URL and MONGO_DB come from config.py via the
    # star import above; connect=False asks pymongo to defer connecting until
    # the first operation (see pymongo MongoClient docs).
    client = pymongo.MongoClient(MONGO_URL,connect=False)
    db = client[MONGO_DB]
    
    
    # Single browser session shared by every scraping function in this module.
    browser = webdriver.Chrome()
    # Headless alternative (the backslashes of the Windows path were lost in the
    # published listing; reconstructed here -- confirm against your install):
    # browser = webdriver.PhantomJS(r'D:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    # browser.set_window_size(1400,900)
    # Explicit-wait helper: each wait.until(...) call gives up after 10 seconds
    # and raises TimeoutException.
    wait = WebDriverWait(browser, 10)
    
    def search():
        """Open Taobao, submit KEYWORD and scrape the first page of results.

        The whole sequence (load home page, type the keyword, click search,
        wait for the pager, scrape the items) is retried from the start
        whenever any explicit wait times out. Retrying is done with a loop
        rather than recursion so that a long run of timeouts cannot exhaust
        the interpreter's recursion limit.

        Returns:
            The text of the pager's ``div.total`` element; ``main`` extracts
            the total page count from it.
        """
        while True:
            print('正在搜索')
            try:
                browser.get('https://www.taobao.com')
                # Named search_box (not ``input``) to avoid shadowing the builtin.
                search_box = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
                )
                search_button = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
                    )
                )
                search_box.send_keys(KEYWORD)
                search_button.click()
                # The pager only exists once the result page has rendered.
                total = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
                    )
                )
                get_products()
                return total.text
            except TimeoutException:
                # Start over from the home page, as the original retry did.
                continue
    
    def next_page(page_number):
        """Jump to result page ``page_number`` and scrape its items.

        Types the page number into the pager's jump box, clicks the submit
        button, waits until the highlighted pager item shows that number and
        then calls ``get_products``. Any timeout restarts the whole jump; the
        retry is a loop rather than recursion so repeated timeouts cannot hit
        the recursion limit.

        Args:
            page_number: Page to navigate to.
        """
        while True:
            print('正在翻页',page_number)
            try:
                # Named page_box (not ``input``) to avoid shadowing the builtin.
                page_box = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
                    )
                )
                jump_button = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                    )
                )
                page_box.clear()
                page_box.send_keys(page_number)
                jump_button.click()
                # Confirm the active pager item now shows the requested page
                # before reading the item list.
                wait.until(
                    EC.text_to_be_present_in_element(
                        (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                        str(page_number),
                    )
                )
                get_products()
                return
            except TimeoutException:
                continue
    
    def get_products():
        """Parse the current result page and store every listed product.

        Waits until at least one item is present in ``#mainsrp-itemlist``,
        parses ``browser.page_source`` with PyQuery, builds one dict per item
        and hands it to ``save_to_mongo``.

        Raises:
            TimeoutException: if no item appears before the shared wait
                expires (callers catch this to retry).
        """
        item_selector = '#mainsrp-itemlist .items .item'
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
        doc = pq(browser.page_source)
        for item in doc(item_selector).items():
            product = {
                'image': item.find('.pic .img').attr('data-src'),
                # Price text may span several lines; join them.
                'price': item.find('.g_price').text().replace('\n', ''),
                # Drops the trailing three characters of the deal counter
                # (presumably the "人付款" suffix -- confirm against the page).
                'deal': item.find('.deal-cnt').text()[:-3],
                # The published listing had this literal split across two
                # lines (a syntax error); restored as newline -> space.
                'title': item.find('.title').text().replace('\n', ' '),
                'shop': item.find('.shop').text(),
                'location': item.find('.location').text(),
            }
            print(product)
            save_to_mongo(product)
    
    def save_to_mongo(result):
        """Insert one product document into the MONGO_TABLE collection.

        Storage is best-effort: a failure is reported on stdout and the
        scraper carries on with the next product.

        Args:
            result: Product dict to insert.
        """
        try:
            if db[MONGO_TABLE].insert_one(result):
                print('存储到MongoDB成功',result)
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit still stop the run; the cause is printed, not dropped.
            print('存储到MongoDB失败',result,exc)
    
    
    def main():
        """Run the scrape: search, read the page count, then visit every page.

        ``search`` scrapes page 1 and returns the pager's total text; the
        first run of digits in that text is taken as the number of pages, and
        pages 2..total are visited with ``next_page``. Any error is reported
        and the browser is always shut down.
        """
        try:
            total_text = search()
            # The published listing had r'(d+)' -- the backslash was lost, so
            # the pattern matched a literal "d" and never found the count.
            total = int(re.search(r'(\d+)', total_text).group(1))
            for page in range(2, total + 1):
                next_page(page)
        except Exception as exc:
            # Report the cause instead of hiding it behind a bare ``except``.
            print('出错啦', exc)
        finally:
            # quit() ends the whole driver session; close() would only close
            # the current window and leave the driver process running.
            browser.quit()
    
    
    # Run the scraper only when this file is executed as a script.
    if __name__ == '__main__':
        main()
    spider.py
    # Host passed to pymongo.MongoClient by spider.py.
    MONGO_URL = 'localhost'

    # Database name looked up on the client.
    # NOTE(review): 'toubao' looks like a typo of 'taobao'; left unchanged
    # because renaming it would point the scraper at a different database.
    MONGO_DB = 'toubao'

    # Collection that scraped products are inserted into.
    MONGO_TABLE = 'product'

    # Command-line flags intended for the PhantomJS driver (skip image loading,
    # enable the disk cache). PhantomJS documents these as
    # --load-images=[true|false]; the brackets are usage notation, so the
    # values are passed bare. Not referenced by the Chrome code path shown.
    SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
    # SERVICE_ARGS = [
    #     '--proxy=113.106.249.42:80',
    #     '--proxy-type=socks5',
    # ]

    # Search term typed into the Taobao search box.
    KEYWORD = '美食'
    config.py

    注意

    新版本的Selenium已经不再支持PhantomJS,所以我这里安装的是旧版本的Selenium:pip install selenium==3.8.0

  • 相关阅读:
    操作系统之磁盘结构笔记
    Linux 操作系统位数(32or64)识别
    手把手教你mysql(十)索引
    Linux命令 — 设置或查看网络配置命令ifconfig
    字符数组的排列
    android 加载图片防止内存溢出
    eCos系统CPU负载测量
    模糊控制——理论基础(4模糊推理)
    模糊控制——理论基础(3模糊关系及其运算)
    模糊控制——理论基础(2隶属函数)
  • 原文地址:https://www.cnblogs.com/YangARTuan/p/10610055.html
Copyright © 2011-2022 走看看