zoukankan      html  css  js  c++  java
  • 使用selenium做简单爬虫的实例

    selenium 是一个Web自动化测试的软件包,可以用于自动测试Web应用,也可以当作简单的爬虫制作工具。

    这是一个简单的demo,用于爬取 Google Play 商店中的一个应用类别:

    # -*- coding: utf-8 -*-
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from time import sleep
    import sqlite3
    import sys
    
    # connect to the sqlite3 database
    
    def Conn_DB(db_name='app_info.db'):
      """Open a connection to the SQLite database file ``db_name``.

      Args:
        db_name: path of the database file handed to ``sqlite3.connect``.

      Returns:
        The open ``sqlite3.Connection``.

      Raises:
        sqlite3.Error: if the connection cannot be opened. The error is
          reported on stdout and then re-raised, so the caller sees the real
          cause instead of an unbound-variable error on the return.
      """
      try:
        return sqlite3.connect(db_name)
      except sqlite3.Error as e:
        # Single-argument print with %-formatting behaves the same on
        # Python 2 and Python 3.
        print("Conn Error  %s" % (e,))
        raise
    
    # get the category of the apps
    
    def Get_Category(root_address):
      """Return the category name found at the end of a category page URL.

      The category is the text after the last ``/`` in ``root_address``, cut
      off at the first ``?`` or space, whichever comes first (this drops a
      query string such as ``?hl=en``).
      """
      last_segment = root_address.rpartition('/')[2]
      before_query = last_segment.partition('?')[0]
      return before_query.partition(' ')[0]
    
    # we have to log in so that we can get the info for every app
    
    def Login_Google(browser, category_root_address):
      """Open the category page and submit the Google sign-in form.

      Loads ``category_root_address`` in ``browser``, clicks the element whose
      id is ``gb_70`` (the login link), types into the ``Email`` and ``Passwd``
      fields and submits with RETURN. Both credentials are empty placeholders
      that must be filled in before the script is run.

      Args:
        browser: the selenium driver to operate.
        category_root_address: URL of the category page to open first.
      """
      browser.get(category_root_address)

      # Move the pointer onto the login link, then click it.
      sign_in = browser.find_element_by_id('gb_70')
      pointer = webdriver.ActionChains(browser)
      pointer.move_to_element(sign_in).click(sign_in).perform()

      # Put your e-mail address in the empty string below.
      browser.find_element_by_name('Email').send_keys('')

      # Put the password for that e-mail in the empty string below; RETURN
      # then submits the form.
      password_field = browser.find_element_by_name('Passwd')
      password_field.send_keys('')
      password_field.send_keys(Keys.RETURN)

      # Printed unconditionally: nothing here checks that the login worked.
      print('Login Success')
    
    
    # load the whole page and then return the number of the apps under the category
    
    def Load_All_Apps(browser, scroll_rounds=13, pause=2.5):
      """Scroll the category page repeatedly so that more apps get loaded.

      Each round scrolls to the bottom of the document and then back to the
      middle, sleeping ``pause`` seconds after each scroll. It then clicks the
      ``#show-more-button`` element unless that element's inline style text is
      exactly ``display: none;``. After the last round the page is scrolled
      back to the top.

      Args:
        browser: driver object exposing ``execute_script``.
        scroll_rounds: how many scroll rounds to perform. The default of 13 is
          the value the original author arrived at by testing.
        pause: seconds to sleep after every scroll.

      Returns:
        The number of ``button.price`` elements on the page, as reported by
        the browser.
      """
      for times in range(scroll_rounds):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(pause)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.5);")
        sleep(pause)
        print(times)

        # Click the "show more" button to load more apps while it is visible.
        show_more_button = browser.execute_script("return document.querySelector('#show-more-button')['style']['cssText'];")
        if show_more_button != 'display: none;':
          browser.execute_script("document.querySelector('#show-more-button').click();")
          print('click button')
        print(show_more_button)

      # Back to the top of the page.
      browser.execute_script("window.scrollTo(0, 0);")

      number = browser.execute_script("return document.querySelectorAll('button.price').length;")
      print(number)

      return number
    
    def Click_Install_Button(browser, category_root_address):
      """Open app install dialogs and record which apps ask for location access.

      For every second ``button.price`` element on the loaded page (indices
      1, 3, 5, ...), the button is clicked and the ``.perm-description``
      entries of the dialog are read. If the app requests precise and/or
      approximate location, a row is inserted into the ``app_info`` table of
      the database opened by ``Conn_DB()``. The dialog is then dismissed
      through ``#purchase-cancel-button``.

      The stored ``get_geo_ways`` value is a string made of ``p`` (precise
      location) and ``a`` (approximate location) markers. This function only
      inserts rows; it does not create the ``app_info`` table.

      Args:
        browser: the selenium driver showing the fully loaded category page.
        category_root_address: URL of the category page; its category name is
          stored with every row.
      """
      # JavaScript run inside the page; its text is sent to the browser as-is.
      get_permissions_code = """var permissions = document.querySelectorAll('.perm-description');
    var precise_locaton = 'precise location (GPS and network-based)';
    var approximate_location = 'approximate location (network-based)';
    var ways = '';
    
    for (var perm in permissions) {
    	if (permissions[perm].innerHTML == precise_locaton) {
    		ways += 'p';
    	} else if (permissions[perm].innerHTML == approximate_location) {
    		ways += 'a';
    	}
    }
    return ways;"""

      # All install (price) buttons currently on the page.
      button_list = browser.execute_script("return document.querySelectorAll('button.price');")
      numbers_of_button = len(button_list)

      sleep(3)

      category = Get_Category(category_root_address)

      # Links of the apps, in page order.
      get_app_address_code = """var app_address_list = document.querySelectorAll("h2 a");var list = [];
    for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i]['href']);} return list;"""
      address_list = browser.execute_script(get_app_address_code)

      # Placeholders let sqlite3 do the quoting, so an apostrophe in a scraped
      # app name can neither break nor inject into the statement.
      # NOTE(review): "categroy" is kept as spelled; presumably it matches the
      # column name of the existing table -- verify against the schema.
      insert_sql = u"insert into app_info (categroy, name, link, get_geo_ways) values (?, ?, ?, ?)"

      count = 0
      number_of_i_want = 0

      conndb = Conn_DB()
      db_cursor = conndb.cursor()
      try:
        # Only odd indices are clicked.
        # NOTE(review): presumably each app renders two button.price
        # elements -- TODO confirm against the page markup.
        for index in range(1, numbers_of_button, 2):
          button = button_list[index]
          webdriver.ActionChains(browser).move_to_element(button).click(button).perform()
          sleep(3.5)
          count += 1

          # Bound before the try so the error report below can always print it.
          perms = None
          try:
            print("Count  %s" % count)
            perms = browser.execute_script(get_permissions_code)
            sleep(2)
            appname = browser.execute_script("return document.querySelector('.purchase-header .title').innerHTML;")
            # The count-th clicked button is paired with the count-th link.
            link = address_list[count - 1]
            print(u"App id is:  %s Perm is:  %s Address is:  %s" % (appname, perms, link))

            if perms:
              db_cursor.execute(insert_sql, (category, appname, link, perms))
              conndb.commit()
              number_of_i_want += 1

          except Exception as e:
            # Best effort: report the failing app and move on to the next one.
            # NOTE(review): the dialog is not dismissed on this path, same as
            # before -- confirm whether it should be.
            print("Error for  %s Number is  %s Pers is %s" % (e, count, perms))
            continue

          # Dismiss the install dialog before moving to the next app.
          browser.execute_script("document.querySelector('#purchase-cancel-button').click();")
          sleep(1)

        print("compary  %s %s I want : %s" % (count, numbers_of_button, number_of_i_want))
      finally:
        # Release the database handles even when a click or script call raised.
        db_cursor.close()
        conndb.close()
    
    if __name__ == '__main__':
      # Category page whose apps are scraped.
      root_address = 'https://play.google.com/store/apps/category/TRAVEL_AND_LOCAL?hl=en'

      driver = webdriver.Chrome()
      try:
        Login_Google(driver, root_address)
        Load_All_Apps(driver)
        Click_Install_Button(driver, root_address)
      finally:
        # Always shut the browser down, even if a step above failed.
        driver.quit()

      # Marker file written once the whole run has completed. The file is
      # opened in binary mode, so the payload is a bytes literal (valid on both
      # Python 2 and Python 3).
      with open("./res.txt", "wb") as fd:
        fd.write(b"over")
    

      

  • 相关阅读:
    学习的原动力
    “六顶思考帽”给我的启示
    关于DataSet与Strongly typed DataSet几点思考(原创)
    设计模式之Singleton和Factory
    CentOS修改网络配置
    Proxmox VE(PVE)安装教程
    CentOS开启SELinux导致samba无法访问的解决办法
    nano编辑器使用教程
    CentOS 如何挂载硬盘
    PVE硬盘直通
  • 原文地址:https://www.cnblogs.com/jaw-crusher/p/3669387.html
Copyright © 2011-2022 走看看