zoukankan      html  css  js  c++  java
  • Python 爬虫实例(9)—— 搜索 爬取 淘宝

    # coding:utf-8
    
    import json
    import redis
    import time
    import requests
    # Module-level requests session; nothing in the visible code uses it.
    session = requests.session()
    import logging.handlers
    import pickle
    import sys
    import re
    import datetime
    from bs4 import BeautifulSoup
    
    
    import sys
    # Python 2 only: reload ``sys`` so that ``setdefaultencoding`` can be called,
    # then make UTF-8 the interpreter-wide default string encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    
    
    
    
    import datetime
    def dateRange(start, end, step=1, format="%Y-%m-%d"):
        """Return the dates from ``start`` up to, but not including, ``end``.

        Args:
            start: First date, as a string written in ``format``.
            end: Exclusive bound, as a string written in ``format``.
            step: Number of days between consecutive entries (default 1).
            format: ``strptime``/``strftime`` pattern used both to parse the
                inputs and to render the results.

        Returns:
            A list of date strings rendered with ``format``. With a positive
            ``step`` the list is empty when ``end`` is not later than ``start``.

        Raises:
            ValueError: If ``start`` or ``end`` does not match ``format``, or
                if ``step`` is 0.
        """
        # Parse the start date once instead of once per generated element.
        first_day = datetime.datetime.strptime(start, format)
        days = (datetime.datetime.strptime(end, format) - first_day).days
        # ``range`` instead of the Python 2-only ``xrange`` so the helper runs
        # unchanged on both Python 2 and Python 3.
        return [(first_day + datetime.timedelta(offset)).strftime(format)
                for offset in range(0, days, step)]
    
    
    
    
    def _first_match(pattern, text):
        """Return the first ``re.findall`` hit of ``pattern`` in ``text``, or '' if there is none."""
        hits = re.findall(pattern, text)
        return hits[0] if hits else ''


    def spider():
        """Search Taobao for "python" in Chrome and print the listings found.

        Opens https://www.taobao.com/ through Selenium, submits the keyword
        ``python`` in the search box, then 99 times in a row clicks the
        "next page" link, waits 15 seconds and parses the rendered page source
        with regular expressions. For every listing it prints the image URL,
        item name, location, company name, price and number of buyers,
        followed by a separator line. A field that cannot be found is printed
        as an empty string instead of aborting the crawl.

        The browser is closed when the function finishes or fails.

        NOTE(review): the loop clicks "next page" before reading the page
        source, so the first results page is never parsed - confirm whether
        that is intended.
        """
        from selenium import webdriver
        import os
        # Location of chromedriver.exe (raw string keeps the backslashes intact).
        chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
        os.environ["webdriver.chrome.driver"] = chromedriver
        browser = webdriver.Chrome(chromedriver)

        try:
            # Open the landing page and give it a moment to load.
            url = "https://www.taobao.com/"
            browser.get(url)
            time.sleep(1)

            # Type the keyword into the search box and submit the search.
            browser.find_element_by_id("q").send_keys(u'python')
            browser.find_element_by_class_name("btn-search").click()
            time.sleep(5)

            for _ in range(1, 100):
                browser.find_element_by_xpath('//a[@trace="srp_bottom_pagedown"]').click()
                # Wait for the next results page to render.
                time.sleep(15)

                result = browser.page_source

                # Strip all line breaks, tabs and spaces so the patterns below
                # can match the markup as one whitespace-free string.
                result_replace = str(result).replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')

                # One tuple per listing: (picture box, info box, trailing row).
                result_replace = re.findall('<divclass="pic-boxJ_MouseEneterLeaveJ_PicBox">(.*?)</div><divclass="ctx-boxJ_MouseEneterLeaveJ_IconMoreNew">(.*?)</div><divclass="rowrow-4g-clearfix">(.*?)</div></div></div>',result_replace)

                print(len(result_replace))

                for item in result_replace:
                    item_imgurl = _first_match('data-src="(.*?)"alt=', item[0])
                    item_name = _first_match('alt="(.*?)"/></a></div><divclass=', item[0])

                    item_loation = _first_match('<divclass="location">(.*?)</div>', item[1])
                    company_name = _first_match('</span></span><span>(.*?)</span></a></div><divclass="location">', item[1])
                    company_price = _first_match('<divclass="priceg_priceg_price-highlight"><span>¥</span><strong>(.*?)</strong></div>', item[1])
                    purchase_num = _first_match('<divclass="deal-cnt">(.*?)人付款</div>', item[1])

                    print(item_imgurl)
                    print(item_name)
                    print(item_loation)
                    print(company_name)

                    print(company_price)
                    print(purchase_num)
                    print("=" * 30)
        finally:
            # Always close the browser so no Chrome/chromedriver process is left behind.
            browser.quit()
    
    
    
    
    
    # Script entry point: the crawl starts as soon as this module is executed or imported.
    spider()
  • 相关阅读:
    codeforces #322 div 2 D. Three Logos (枚举)
    hdu 5481||bestcoder #57 div 2 C Desiderium ()
    codeforces #322 div 2 C. Developing Skills(思路)
    codeforces #322 div 2 B. Luxurious Houses (思路)
    codeforces #322 div 2 A. Vasya the Hipster(随便搞)
    [转自codeforces] How to come up with the solutions: techniques
    uva 489
    hdoj 5479 || bestcoder #57 div 2 A Scaena Felix(模拟)
    hdu 5480|| bestcoder   #57 div 2 Conturbatio(前缀和||树状数组)
    支付宝和内购的区别以及集成方法
  • 原文地址:https://www.cnblogs.com/xuchunlin/p/8313319.html
Copyright © 2011-2022 走看看