zoukankan      html  css  js  c++  java
  • Python 爬虫实例(11)—— 爬虫 苏宁易购

    # coding:utf-8
    # Module-level setup for the Suning (suning.com) product crawler:
    # shared HTTP session, Redis client and a rotating-file logger.

    import json
    import redis
    import time
    import requests
    session = requests.session()
    import logging.handlers
    import pickle
    import sys
    import re
    import datetime
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import os


    # Python 2 only: reload() re-exposes sys.setdefaultencoding, which
    # site.py removes at interpreter startup.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # NOTE(review): credentials are hard-coded; move them to environment
    # variables or a config file before publishing this script.
    r = redis.Redis(host="123.56.74.190", port=6379, password="ZBHRwlb1608")

    import platform
    sysStr = platform.system()
    if sysStr == "Windows":
        # Raw string so backslashes are never interpreted as escapes.
        LOG_FILE_check = r'C:\log\wlb\crawler\cic.log'
    else:
        LOG_FILE_check = '/log/wlb/crawler/cic.log'


    # Rotating log: 128 MB per file, at most 10 backup files kept.
    handler = logging.handlers.RotatingFileHandler(
        LOG_FILE_check, maxBytes=128 * 1024 * 1024, backupCount=10)
    # Leading newline separates records visually in the log file.
    fmt = '\n' + '%(asctime)s - %(filename)s:%(lineno)s  - %(message)s'
    formatter = logging.Formatter(fmt)
    handler.setFormatter(formatter)
    logger = logging.getLogger('check')   # named logger shared by the crawler
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    
    
    def spider():
        chromedriver = "C:Program Files (x86)GoogleChromeApplicationchromedriver.exe"
        os.environ["webdriver.chrome.driver"] = chromedriver
        browser = webdriver.Chrome(chromedriver)
    
        # 设置浏览器需要打开的url
        url = "https://www.suning.com/"
        browser.get(url)
        time.sleep(5)
    
        browser.find_element_by_id("searchKeywords").send_keys(u'手机')
        time.sleep(2)
    
        for i in range(1,100):
            browser.find_element_by_name("index1_none_search_ss1").click()
            browser.find_element_by_id("nextPage").click()
            result = browser.page_source
            soup = BeautifulSoup(result,'html.parser')
            result_ul = soup.find_all('div',attrs={"id":"filter-results"})[0]
    
            result_list = result_ul.find_all('div',attrs={"class":"li-bg"})
            print len(result_list)
            print result_list[1]
            # for item in result_list:
            #     print item
            #     print "==" * 30
            #
            # time.sleep(500)
    
            for item in result_list:
    
                item = str(item).replace('
    ','').replace('
    ','').replace('	','')
                print "==" * 30
    
                print item
                try:
                    sold_price = re.findall('pricefn="priceCenterShow"><i>¥</i>(.*?)<i>.*?</i></span>',item)[0]
                except:
                    sold_price = re.findall('<i>¥</i>(.*?)<i>.*?</i></span>',item)[0]
                try:
                    item_name = re.findall('<i class=".*?" style=".*?"></i>(.*?)</b></a>',item)[0]
                except:
                    item_name = re.findall('target="_blank" title="(.*?)"><i class=',item)[0]
    
                try:
                    item_url = re.findall('class=".*?" href="(.*?)" name',item)[0]
                except:
                    item_url = re.findall('<a class=".*?" href="(.*?)" id=', item)[0]
                try:
                    item_desc = re.findall('<span><i></i><em>(.*?)</em><b></b></span>',item)[0]
                except:
                    item_desc = re.findall('<em>(.*?)</em>', item)[0]
    
    
    
    
                print item_url
                print item_name
                print sold_price
                print item_desc
    
    
    
        time.sleep(500)
    
    
    
    
    
    
    
    
    
    
    
    
    
    spider()
  • 相关阅读:
    13 内建属性 __getattribute__ 内建函数
    12 垃圾回收GC
    11 元类
    12 动态语言 __slots__
    11 作用域
    10 带参数的装饰器 通用装饰器 类装饰器
    9 装饰器
    8 闭包
    6 生成器 yield 协程
    cmd常用命令
  • 原文地址:https://www.cnblogs.com/xuchunlin/p/8326007.html
Copyright © 2011-2022 走看看