  • Taobao crawler (Selenium)

    Overall, the code is still not perfect.

    It demonstrates one approach to parsing JavaScript-rendered pages.

    The tricky part is the scroll-down step: you cannot jump straight to the bottom of the page, because the listing data is loaded in while scrolling through the middle (a more robust scrolling variant is sketched after the listing).

    The details are documented in the code comments.

    from selenium import webdriver
    import time
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from lxml import etree
    import re
    import pymongo

    client = pymongo.MongoClient('127.0.0.1', port=27017)
    db = client.taobao_mark
    collection = db.informations


    def search(url):

        # Set up a headless browser
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')
        driver = webdriver.Chrome(options=opt)
        driver.get(url)

        # Explicitly wait for the search input box, then type the keyword,
        # here '轻薄本' (thin-and-light laptop)
        input_key = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'q')))
        input_key.send_keys('轻薄本')

        # Explicitly wait for the search button, then click it
        search_btn = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'btn-search')))
        search_btn.click()

        # Only one row of the results page actually matches the keyword;
        # the rest is ads. The full results sit behind the link inside the
        # //span[@class="see-all"] tag, so extract that link with XPath
        # and navigate to it.
        element = etree.HTML(driver.page_source)
        list_page_url = element.xpath('//span[@class="see-all"]/a/@href')[0]
        list_page_url = 'https://s.taobao.com/' + list_page_url
        driver.close()
        return list_page_url


    def scroll(driver):
        # Without triggering the lazy loading we only get about a quarter of
        # the items, so run JavaScript to scroll the page down step by step
        # and let the rest load in.
        for y in range(7):
            js = 'window.scrollBy(0, 600)'
            driver.execute_script(js)
            time.sleep(0.5)


    def get_detail_url(url):

        # Holds the links of the items found on the listing pages
        detail_urls = []

        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')
        driver = webdriver.Chrome(options=opt)
        driver.get(url)

        scroll(driver)

        # Extract the total number of pages with some simple processing
        max_page = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="total"]'))
            )
        text = max_page.get_attribute('textContent').strip()
        max_page = int(re.findall(r'\d+', text, re.S)[0])
        max_page = 1  # capped at one page for testing; remove to crawl every page
        # Pagination
        for i in range(1, max_page + 1):
            print('Crawling page %s' % i)
            # Explicitly wait for the page-number input box
            next_page_btn = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//input[@class="input J_Input"]'))
                )

            # Get the confirm button; clicking it jumps to the requested page
            submit = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//span[@class="btn J_Submit"]'))
                )

            next_page_btn.clear()
            next_page_btn.send_keys(str(i))
            # Clicking immediately raises an 'element not clickable' error:
            # per answers found via Baidu there is an overlay covering the
            # button, so sleep two seconds until it disappears before clicking
            time.sleep(2)
            submit.click()
            scroll(driver)

            urls = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, '//a[@class="img-a"]'))
                )
            for url in urls:
                detail_urls.append(url.get_attribute('href'))
        driver.close()
        # Return the list of all item links
        return detail_urls


    def parse_detail(detail_urls):
        parameters = []
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')
        driver = webdriver.Chrome(options=opt)
        for url in detail_urls:
            parameter = {}

            print('Parsing URL %s' % url)
            driver.get(url)
            html = driver.page_source
            element = etree.HTML(html)

            # Product name
            name = element.xpath('//div[@class="spu-title"]/text()')
            name = name[0].strip()
            parameter['name'] = name

            # Price
            price = element.xpath('//span[@class="price g_price g_price-highlight"]/strong/text()')[0]
            price = str(price)
            parameter['price'] = price

            # Selling points
            b = element.xpath('//div[@class="item col"]/text()')
            specials = []
            for i in b:
                if re.match(r'\w', i, re.S):
                    specials.append(i)
            parameter['specials'] = specials

            param = element.xpath('//span[@class="param"]/text()')
            # Screen size
            size = param[0]
            parameter['size'] = size
            # Laptop CPU
            cpu = param[1]
            parameter['cpu'] = cpu
            # Graphics card
            graphics_card = param[2]
            parameter['graphics_card'] = graphics_card
            # Hard disk capacity
            hard_disk = param[3]
            parameter['hard_disk'] = hard_disk
            # Processor clock speed
            processor = param[4]
            parameter['processor'] = processor
            # Memory capacity
            memory = param[5]
            parameter['memory'] = memory
            parameter['url'] = url
            print(parameter)
            print('=' * 50)
            save_to_mongo(parameter)
            parameters.append(parameter)
        driver.quit()  # close the browser once every detail page has been parsed
        return parameters


    def save_to_mongo(result):
        try:
            if collection.insert_one(result):
                print('successfully saved to mongodb', result)
        except Exception:
            print('error saving to mongo')


    def main():
        url = 'https://www.taobao.com/'
        list_page_url = search(url)
        detail_urls = get_detail_url(list_page_url)
        parameters = parse_detail(detail_urls)


    if __name__ == '__main__':
        main()
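
    The fixed seven scrolls of 600px are fragile: if the listing page gets longer, part of it never loads. A more robust variant (my own sketch, not from the original post; the name scroll_until_stable and its parameters are illustrative) keeps scrolling until the page height stops growing:

    def scroll_until_stable(driver, step=600, pause=0.5, max_rounds=50):
        # Scroll step by step until document.body.scrollHeight stops growing,
        # i.e. lazy loading has finished, or max_rounds is exhausted.
        last_height = driver.execute_script('return document.body.scrollHeight')
        for _ in range(max_rounds):
            driver.execute_script('window.scrollBy(0, arguments[0])', step)
            time.sleep(pause)
            new_height = driver.execute_script('return document.body.scrollHeight')
            at_bottom = driver.execute_script(
                'return window.scrollY + window.innerHeight >= document.body.scrollHeight')
            if at_bottom and new_height == last_height:
                break
            last_height = new_height

    It can replace the scroll(driver) calls in get_detail_url without any other changes.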

    Run results

    Database
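
    To check what actually landed in MongoDB, here is a minimal query sketch (assuming the same database and collection names as in the script above, and pymongo 3.7+ for count_documents):

    import pymongo

    client = pymongo.MongoClient('127.0.0.1', port=27017)
    collection = client.taobao_mark.informations

    print('documents saved:', collection.count_documents({}))
    for doc in collection.find().limit(3):
        print(doc['name'], doc['price'])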
