zoukankan      html  css  js  c++  java
  • Python 爬虫实例(12)—— python selenium 爬虫


    # coding:utf-8
    from common.contest import *


    def _parse_lot(lot_html):
        """Extract the fields of one auction lot from its HTML fragment.

        lot_html is the string produced by ``replace()`` for a single
        ``<figure class="Lotes fade">`` element (``replace`` comes from the
        star import above -- presumably it flattens the tag into a one-line
        string; verify against common.contest).

        Returns a tuple ``(item_url, item_lotnum, item_imgurl, sold_price)``.
        ``sold_price`` is "" when the fragment carries no "Remate" entry.
        Raises IndexError when the link, image or lot-number patterns do not
        match, as the original inline code did.
        """
        # Detail-page link is relative to the /es/ section of the site.
        # (The original followed this with .replace('', ''), a no-op.)
        item_url = re.findall('<figure class="Lotes fade"><a href="(.*?)" id=', lot_html)[0]
        item_url = "http://www.salamoyua.com/es/" + item_url

        # The src attribute starts with ".."; the pattern skips those two
        # characters so the remainder can be appended to the site root.
        item_imgurl = re.findall('<img id=".*?" src="..(.*?)" style="border-0px', lot_html)[0]
        item_imgurl = "http://www.salamoyua.com" + item_imgurl

        if "Remate" not in lot_html:
            sold_price = ""
        else:
            sold_price = re.findall('<p><strong>Remate:(.*?)</strong></p></figcaption>', lot_html)[0]
            sold_price = sold_price.replace(' ', '')

        # Sold lots wrap the number in an element titled "Lote vendido";
        # otherwise fall back to the span that closes the <header>.
        try:
            item_lotnum = re.findall('title="Lote vendido"><span id=".*?">(.*?)</span>', lot_html)[0]
        except IndexError:
            item_lotnum = re.findall('<span id=".*?">(.*?)</span></header>', lot_html)[0]
        item_lotnum = item_lotnum.replace('Lote', '').replace(' ', '')

        return item_url, item_lotnum, item_imgurl, sold_price


    def spider():
        """Crawl the paginated auction listing with Chrome and print every lot.

        Opens the listing, then clicks the "next page" control for pages
        2..99, parsing each page's lot figures and printing the lot URL, lot
        number, image URL and sold price. The browser is always closed on
        exit.
        """
        url = "http://www.salamoyua.com/es/subasta.aspx?origen=subastas&subasta=79"

        chromedriver = 'C:/Users/xuchunlin/AppData/Local/Google/Chrome/Application/chromedriver.exe'
        chrome_options = webdriver.ChromeOptions()

        # Optional proxy support (disabled):
        # proxies = r.get('4')
        # chrome_options.add_argument(('--proxy-server=http://' + proxies))

        os.environ["webdriver.chrome.driver"] = chromedriver
        driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)

        try:
            for page in range(1, 100):
                print("正在爬取第" + str(page) + "页的数据")

                if page == 1:
                    # The original called driver.get(session_url), a name that
                    # is not defined in this snippet, while `url` was unused.
                    driver.get(url)
                    result = driver.page_source
                else:
                    try:
                        # Scroll to the bottom so the "next" link is clickable.
                        js = "var q=document.documentElement.scrollTop=10000"
                        driver.execute_script(js)
                        driver.find_element_by_id('ctl00_phContenidos_lbSiguiente').click()
                        # Wait for the next page before reading its source;
                        # the original read page_source first and slept after.
                        time.sleep(3)
                        result = driver.page_source
                    except Exception:
                        # Best effort: a failed click yields an empty page and
                        # the loop carries on, as in the original.
                        result = ""

                soup = BeautifulSoup(result, 'html.parser')
                result_div = soup.find_all('figure', attrs={"class": "Lotes fade"})

                for lot in result_div:
                    result_replace = replace(lot)
                    print(result_replace)

                    item_url, item_lotnum, item_imgurl, sold_price = _parse_lot(result_replace)

                    print(item_url)
                    print(item_lotnum)
                    print(item_imgurl)
                    print(sold_price)
        finally:
            # Always release the browser process, even if parsing fails.
            driver.quit()


    spider()
  • 相关阅读:
    二分法查找
    全排列 递归实现 c 语言实现
    南阳oj 题目290 动物统计加强版 字典树
    蛇形填数
    南阳理工oj 题目289 苹果 01背包
    南阳理工 oj 题目38 布线问题
    南阳理工oj 题目85 有趣的数 Cantor数表
    CSU-1110 RMQ with Shifts (单点更新+区间最小值 zkw线段树)
    POJ-2387 Til the Cows Come Home
    HDU-2680 Choose the best route
  • 原文地址:https://www.cnblogs.com/xuchunlin/p/8441398.html
Copyright © 2011-2022 走看看