zoukankan      html  css  js  c++  java
  • 使用selenium做简单爬虫的实例

    Selenium 是一个 Web 自动化测试软件包,既可以用于自动化测试 Web 应用,也可以当作简单的爬虫制作工具。

    这是一个简单的 demo,用于爬取 Google Play 商店中某一个类别下的应用:

    # -*- coding: utf-8 -*-
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from time import sleep
    import sqlite3
    import sys
    
    # connect to the sqlite3 database
    
    def Conn_DB(db_name = 'app_info.db'):
      """Open a connection to the SQLite database file ``db_name``.

      Returns the ``sqlite3.Connection`` object.

      Raises ``sqlite3.Error`` when the database cannot be opened.  The error
      is printed first and then re-raised, so the caller sees the real cause
      of the failure.
      """
      try:
        return sqlite3.connect(db_name)
      except sqlite3.Error as e:
        print("Conn Error  %s" % (e,))
        raise
    
    # get the category of the apps
    
    def Get_Category(root_address):
      """Return the category name found in the last path segment of a URL.

      The text after the final '/' is cut at the first '?' or space, so
      '.../category/TRAVEL_AND_LOCAL?hl=en' yields 'TRAVEL_AND_LOCAL'.
      """
      tail = root_address.rsplit('/', 1)[-1]
      # keep only what precedes the earliest '?' or ' '
      for separator in ('?', ' '):
        tail = tail.split(separator, 1)[0]
      return tail
    
    # we have to log in so that we can get the info of every app
    
    def Login_Google(browser, category_root_address, email = '', password = ''):
      """Load the category page and submit the Google sign-in form.

      browser               -- selenium WebDriver used to drive the page
      category_root_address -- URL that is loaded before signing in
      email                 -- text typed into the 'Email' field (default '')
      password              -- text typed into the 'Passwd' field (default '')

      Returns None.  Element lookups are not guarded, so any exception they
      raise propagates to the caller.
      """
      browser.get(category_root_address)

      # click the sign-in link
      login_link = browser.find_element_by_id('gb_70')
      webdriver.ActionChains(browser).move_to_element(login_link).click(login_link).perform()

      # fill in the account name
      email_box = browser.find_element_by_name('Email')
      email_box.send_keys(email)

      # fill in the password and submit the form with RETURN
      pwd = browser.find_element_by_name('Passwd')
      pwd.send_keys(password)
      pwd.send_keys(Keys.RETURN)

      # NOTE: printed unconditionally; nothing here verifies that the
      # sign-in was actually accepted.
      print('Login Success')
    
    
    # load the whole page and then return the number of the apps under the category
    
    def Load_All_Apps(browser, scroll_rounds = 13, pause = 2.5):
      """Scroll the page repeatedly so more apps load, then count them.

      browser       -- object with an ``execute_script(js)`` method
                       (a selenium WebDriver in this script)
      scroll_rounds -- how many scroll-down / scroll-half-way rounds to run;
                       the default of 13 was chosen by testing
      pause         -- seconds to wait after each scroll

      Returns the number of 'button.price' elements on the page.
      """
      for times in range(scroll_rounds):
        # jump to the bottom, then back to the middle of the page
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.5);")
        sleep(pause)
        print(times)

        # Read the inline style of the "show more" button.  The script
        # returns null (None here) when the button is not in the page, so a
        # missing button no longer raises inside the browser.
        show_more_button = browser.execute_script(
          "var b = document.querySelector('#show-more-button');"
          " return b ? b['style']['cssText'] : null;")
        if show_more_button is not None and show_more_button != 'display: none;':
          # the button is present and not hidden: click it to load more apps
          browser.execute_script("document.querySelector('#show-more-button').click();")
          print('click button')
        print(show_more_button)

      # back to the top of the page
      browser.execute_script("window.scrollTo(0, 0);")

      number = browser.execute_script("return document.querySelectorAll('button.price').length;")
      print(number)

      return number
    
    def Click_Install_Button(browser, category_root_address):
      """Open each app's install dialog and store apps that ask for location.

      Every second 'button.price' element (indices 1, 3, 5, ...) is clicked.
      The permission descriptions in the page are then scanned: a precise
      location permission contributes 'p', an approximate one 'a'.  When the
      resulting string is non-empty, a row (category, name, link, ways) is
      inserted into the ``app_info`` table and committed.

      browser               -- selenium WebDriver with the category page loaded
      category_root_address -- category URL; ``Get_Category`` extracts the
                               category stored in every inserted row

      Returns None.  Errors while reading or storing one app are printed and
      the loop moves on to the next button.
      """
      # Indexed loop: for...in over a NodeList would also visit non-element
      # properties such as 'length' and 'item'.
      get_permissions_code = """var permissions = document.querySelectorAll('.perm-description');
    var precise_location = 'precise location (GPS and network-based)';
    var approximate_location = 'approximate location (network-based)';
    var ways = '';

    for (var i = 0; i < permissions.length; i++) {
      if (permissions[i].innerHTML == precise_location) {
        ways += 'p';
      } else if (permissions[i].innerHTML == approximate_location) {
        ways += 'a';
      }
    }
    return ways;"""

      # get all install button objects
      get_button_list_code = """return document.querySelectorAll('button.price');"""
      button_list = browser.execute_script(get_button_list_code)
      numbers_of_button = len(button_list)

      sleep(3)

      category = Get_Category(category_root_address)

      get_app_address_code = """var app_address_list = document.querySelectorAll("h2 a");var list = [];
    for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i]['href']);} return list;"""
      address_list = browser.execute_script(get_app_address_code)

      # Guarded so that closing a dialog that is not there is a no-op rather
      # than a JavaScript error.
      close_dialog_code = """var cancel = document.querySelector('#purchase-cancel-button');
    if (cancel) { cancel.click(); }"""

      # Placeholders let sqlite3 quote the scraped values; app names may
      # contain quotes.  'categroy' is the column name this script has always
      # written to -- presumably it matches the existing table; verify against
      # the schema before renaming.
      insert_sql = u"""insert into app_info (categroy, name, link, get_geo_ways) values (?, ?, ?, ?)"""

      count = 0
      number_of_i_want = 0

      conndb = Conn_DB()
      try:
        db_cursor = conndb.cursor()

        for index in range(1, numbers_of_button, 2):
          try:
            webdriver.ActionChains(browser).move_to_element(button_list[index]).click(button_list[index]).perform()
            sleep(3.5)
            count += 1
          except IndexError:
            print("Out of index")
            break

          # bound before the try so the error message below can always print it
          perms = None
          try:
            print("Count  %s" % (count,))
            perms = browser.execute_script(get_permissions_code)
            sleep(2)
            appname = browser.execute_script("return document.querySelector('.purchase-header .title').innerHTML;")
            # the count-th link is taken to belong to the count-th clicked button
            address = address_list[count - 1]
            print(u"App id is:  %s Perm is:  %s Address is:  %s" % (appname, perms, address))

            if perms:
              db_cursor.execute(insert_sql, (category, appname, address, perms))
              conndb.commit()
              number_of_i_want += 1

          except Exception as e:
            print("Error for  %s Number is  %s Pers is %s" % (e, count, perms))
          finally:
            # click the cancel button even after an error, so an open dialog
            # does not stay in the way of the next button
            browser.execute_script(close_dialog_code)
            sleep(1)

        print("compary  %s %s I want : %s" % (count, numbers_of_button, number_of_i_want))
        db_cursor.close()
      finally:
        # release the database even when the loop above raised
        conndb.close()
    
    if __name__ == '__main__':
      # Script entry point: sign in, load the whole category page, record the
      # apps, then write a small marker file.
      root_address = 'https://play.google.com/store/apps/category/TRAVEL_AND_LOCAL?hl=en'

      driver = webdriver.Chrome()
      try:
        Login_Google(driver, root_address)
        Load_All_Apps(driver)
        Click_Install_Button(driver, root_address)
      finally:
        # always end the browser session, even when a step above raised
        driver.quit()

      # marker file written only after all steps finished without an error
      with open("./res.txt", "wb") as fd:
        fd.write(b"over")
    

      

  • 相关阅读:
    TCP源码—连接建立
    TCP系列02—连接管理—1、三次握手与四次挥手
    TCP系列01—概述及协议头格式
    ubuntu软件管理apt与dpkg
    318. Maximum Product of Word Lengths
    317. Shortest Distance from All Buildings
    316. Remove Duplicate Letters
    315. Count of Smaller Numbers After Self
    314. Binary Tree Vertical Order Traversal
    313. Super Ugly Number
  • 原文地址:https://www.cnblogs.com/jaw-crusher/p/3669387.html
Copyright © 2011-2022 走看看