zoukankan      html  css  js  c++  java
  • python 模拟浏览器

    想用python模拟浏览器访问web的方法测试些东西,有哪几种方法呢?

    一类:单纯的访问web,不解析其js,css等。

    1. urllib2

    #-*- coding:utf-8 -*
    import urllib2
    
    def Furllib2(ip,port,url,timeout):
        """Fetch ``url`` through an HTTP proxy with urllib2 and print the response.

        Parameters:
            ip: proxy host name or address.
            port: proxy port; it is formatted with ``%s`` so a string or an
                integer both work.
            url: address to request.
            timeout: timeout passed to ``urllib2.urlopen``.

        Returns:
            True when the request and the read succeeded, 0 otherwise (the
            original True/0 pair is kept so existing callers see the same
            values).
        """
        proxydict = {'http': "http://%s:%s" % (ip, port)}
        print(proxydict)

        opener = urllib2.build_opener(urllib2.ProxyHandler(proxydict))
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        # NOTE: install_opener() replaces the module-wide default opener, so
        # every later urllib2.urlopen() call goes through this proxy as well.
        urllib2.install_opener(opener)

        try:
            response = urllib2.urlopen(url, timeout=timeout)
            try:
                print(response.geturl())
                print(response.getcode())
                print(response.info())
                print(response.read())
            finally:
                # Release the connection even when reading fails.
                response.close()
            return True
        except Exception as exc:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit, and the actual cause is shown.
            print('some errors occored' + '-'*50)
            print(repr(exc))
            return 0
    
    def main():
        """Run Furllib2 once against a sample proxy and print its result."""
        proxyip = '14.18.16.69'
        proxyport = '80'
        url = 'http://www.cnblogs.com/'
        timeout = 4
        # Furllib2 returns True on success and 0 on failure.
        print(Furllib2(proxyip, proxyport, url, timeout))
    
    
    if __name__ == "__main__":
        # Run the demo only when this file is executed as a script.
        main()
    

     2. mechanize(与网站的自动化交互)

    http://wwwsearch.sourceforge.net/mechanize/doc.html

    def Fmechanize(url):
        """Open ``url`` with a cookie-aware mechanize opener and print the result.

        Parameters:
            url: address to request with GET.

        Returns:
            True when the page was opened, 0 when any error occurred (the
            original True/0 pair is kept for existing callers).
        """
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        try:
            # GET request. For a POST, pass the payload as a second argument:
            #     opener.open("http://example.com/", data)
            r = opener.open(url)
            try:
                print(r.geturl())
                print(r.info())
            finally:
                # Release the connection even when printing fails.
                r.close()
            return True
        except Exception as exc:
            # Report the cause instead of discarding it silently; unlike a
            # bare ``except:`` this lets KeyboardInterrupt/SystemExit through.
            print(repr(exc))
            return 0
    

     二类:模拟浏览器,使用firefox等的浏览器引擎,支持js,css等。

    1. selenium 的firefox或者chrome等驱动,但是由于要打开一个浏览器,所以会比较慢(浏览器驱动可以到selenium官网上下载,也可以到firefox插件处搜索)

    def Fselenium_firefox(ip,port,url,timeout):
        """Load ``url`` in Firefox through an HTTP proxy and print its cookies.

        Parameters:
            ip: proxy host name or address.
            port: proxy port; converted with ``int()`` because it is stored
                as an integer preference.
            url: page to open.
            timeout: passed to ``webdriver.Firefox`` as its ``timeout``.

        Returns:
            1 when the page was loaded and the cookies were printed, 0 when
            the browser could not be started or loading failed.
        """
        try:
            profile = webdriver.FirefoxProfile()
            # network.proxy.type = 1 selects manual proxy configuration.
            profile.set_preference('network.proxy.type', 1)
            profile.set_preference('network.proxy.http', ip)
            # A string port (e.g. '80') would be written as a string
            # preference, so convert it; a bad value is reported below.
            profile.set_preference('network.proxy.http_port', int(port))
            profile.update_preferences()
            driver = webdriver.Firefox(profile, timeout=timeout)
        except Exception:
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed.
            traceback.print_exc()
            return 0

        try:
            driver.get(url)
            # Wait for the page before reading cookies.
            time.sleep(5)
            print(driver.get_cookies())
            return 1
        except Exception:
            traceback.print_exc()
            return 0
        finally:
            # Always close the browser, on success and on every failure path.
            driver.quit()
    

     2. selenium :headless test使用selenium+ phantomjs驱动,无需打开浏览器,但是支持js的模拟浏览器动作,也就是说和你手工打开是没有区别的。

    http://selenium.googlecode.com/git/docs/api/py/api.html

    def Fselenium_phantomjs(ip,port,url,timeout):
        """Load ``url`` in headless PhantomJS through an HTTP proxy.

        Parameters:
            ip: proxy host name or address.
            port: proxy port.
            url: page to open.
            timeout: page-load timeout handed to
                ``driver.set_page_load_timeout``.

        Returns:
            True when the driver's current URL could be read and printed,
            0 when the driver could not be started or the URL could not be
            read. A failed page load is reported but, as before, does not by
            itself make the function return 0.
        """
        service_args = [
            '%s%s%s%s' % ('--proxy=', ip, ':', port),
            '--proxy-type=http',
        ]
        print(service_args)

        driver = None
        try:
            driver = webdriver.PhantomJS(service_args = service_args)
            # To use a phantomjs binary at a specific location, pass its path
            # as the first argument instead:
            #     webdriver.PhantomJS("/root/phantomjs-1.9.7-linux-x86_64/bin/phantomjs",
            #                         service_args = service_args)
            driver.set_page_load_timeout(timeout)
            driver.get(url)
            time.sleep(4)
        except Exception:
            traceback.print_exc()

        if driver is None:
            # The driver never started, so there is nothing to query or close.
            return 0

        try:
            print(driver.current_url)
            return True
        except Exception:
            traceback.print_exc()
            return 0
        finally:
            # Stop the PhantomJS process; it was previously left running.
            driver.quit()

     3. qt,网上抢来的代码

    http://qt-project.org/wiki/PySide#PySide.QtWebKit.PySide.QtWebKit.QWebView.url

    from PyQt4 import QtCore, QtGui, QtWebKit, QtNetwork
    
    class cookieJar(QtNetwork.QNetworkCookieJar):
        """Cookie jar that preloads cookies stored in the parent window's settings.

        ``parent`` is kept as ``mainWindow`` and must expose a ``settings``
        object with a ``value(key)`` method; although ``parent`` defaults to
        None, the constructor dereferences ``parent.settings`` and would fail
        without one.
        """

        def __init__(self, cookiesKey, parent=None):
            super(cookieJar, self).__init__(parent)

            self.mainWindow = parent
            self.cookiesKey = cookiesKey
            cookiesValue    = self.mainWindow.settings.value(self.cookiesKey)

            # Only parse when something is stored under the key.
            if cookiesValue:
                cookiesList = QtNetwork.QNetworkCookie.parseCookies(cookiesValue)
                self.setAllCookies(cookiesList)

        # Disabled override, kept for reference: it would append each
        # cookie's raw form to the stored value and write it back to settings.
        #
        # def setCookiesFromUrl (self, cookieList, url):
        #     cookiesValue = self.mainWindow.settings.value(self.cookiesKey)
        #     cookiesArray = cookiesValue if cookiesValue else QtCore.QByteArray()
        #
        #     for cookie in cookieList:
        #         cookiesArray.append(cookie.toRawForm() + "\n")
        #
        #     self.mainWindow.settings.setValue(self.cookiesKey, cookiesArray)
        #
        #     return super(cookieJar, self).setCookiesFromUrl(cookieList, url)

        def deleteCookie(self,cookieList):
            # NOTE(review): despite its name this does not delete anything: it
            # ignores ``cookieList``, looks up a settings value using an empty
            # list as the key and discards the result. Confirm the intent
            # before relying on it.
            cookie = []
            self.mainWindow.settings.value(cookie)
    class webView(QtWebKit.QWebView):
        """Web view whose network access manager uses a settings-backed cookie jar."""

        def __init__(self, cookiesKey, url, parent=None):
            # ``url`` is accepted but not used here.
            super(webView, self).__init__(parent)

            jar = cookieJar(cookiesKey, parent)
            self.cookieJar = jar

            manager = self.page().networkAccessManager()
            manager.setCookieJar(jar)
    
    class myWindow(QtGui.QMainWindow):
        """Main window: a tab widget of web views plus a toolbar with a URL field."""

        def __init__(self, parent=None):
            super(myWindow, self).__init__(parent)

            # Settings key handed to the cookie jar of every new tab.
            self.cookiesKey = "cookies"

            container = QtGui.QWidget(self)
            self.centralwidget = container

            tabs = QtGui.QTabWidget(container)
            tabs.setTabsClosable(True)
            self.tabWidget = tabs

            layout = QtGui.QVBoxLayout(container)
            layout.addWidget(tabs)
            self.verticalLayout = layout

            # Toolbar action that opens the typed URL in a new tab.
            addAction = QtGui.QAction(self)
            addAction.setText("Add Tab")
            addAction.triggered.connect(self.on_actionTabAdd_triggered)
            self.actionTabAdd = addAction

            urlEdit = QtGui.QLineEdit(self)
            urlEdit.setText("http://www.example.com")
            self.lineEdit = urlEdit

            bar = QtGui.QToolBar(self)
            bar.addAction(addAction)
            bar.addWidget(urlEdit)
            self.toolBar = bar

            topArea = QtCore.Qt.ToolBarArea(QtCore.Qt.TopToolBarArea)
            self.addToolBar(topArea, bar)
            self.setCentralWidget(tabs)

            self.settings = QtCore.QSettings()

        @QtCore.pyqtSlot()
        def on_actionShowCookies_triggered(self):
            """Print the raw form of every cookie held by the current tab."""
            current = self.tabWidget.currentWidget()
            jar = current.page().networkAccessManager().cookieJar()

            for cookie in jar.allCookies():
                print(cookie.toRawForm())

        @QtCore.pyqtSlot()
        def on_actionTabAdd_triggered(self):
            """Open the URL from the line edit, or 'about:blank' when it is empty."""
            typed = self.lineEdit.text()
            self.addNewTab(typed or 'about:blank')

        def addNewTab(self, url):
            """Create a web view for ``url``, add it as a tab and select it."""
            count = self.tabWidget.count()
            title = u"Tab {0}".format(str(count))

            view = webView(self.cookiesKey, url, self)
            view.loadFinished.connect(self.on_tabWidget_loadFinished)
            view.load(QtCore.QUrl(url))

            index = self.tabWidget.addTab(view, title)
            self.tabWidget.setCurrentIndex(index)

        @QtCore.pyqtSlot()
        def on_tabWidget_loadFinished(self):
            """Read the stored cookie value after a page load; the result is unused."""
            stored = self.settings.value(self.cookiesKey)
    	
    	
    if __name__ == "__main__":
        import sys

        # Create and name the application object.
        app = QtGui.QApplication(sys.argv)
        app.setApplicationName('myWindow')

        # Build, size and show the main window.
        main = myWindow()
        main.resize(666, 333)
        main.show()

        # Run the event loop and exit with its return value.
        exit_code = app.exec_()
        sys.exit(exit_code)
    

    4. qt-headless

    http://qt-project.org/wiki/PySide#PySide.QtWebKit.PySide.QtWebKit.QWebView.url

    import sys  
    from PyQt4.QtGui import *  
    from PyQt4.QtCore import *  
    from PyQt4.QtWebKit import *  
      
    class Render(QWebPage):
        """Load a URL in a QWebPage and block until loading has finished.

        The constructor runs the application's event loop; once the
        ``loadFinished`` signal fires, the page's main frame is stored on
        ``frame`` and the loop is stopped.
        """

        def __init__(self, url):
            # The application object is created before the base class is set up.
            self.app = QApplication(sys.argv)
            QWebPage.__init__(self)
            self.loadFinished.connect(self._loadFinished)

            target = QUrl(url)
            self.mainFrame().load(target)

            # Blocks here until _loadFinished() quits the application.
            self.app.exec_()

        def _loadFinished(self, result):
            """Slot for ``loadFinished``: keep the main frame and stop the loop."""
            self.frame = self.mainFrame()
            self.app.quit()
      
    # Render the page, then print the HTML of its main frame.
    url = 'http://webscraping.com'
    r = Render(url)
    html = r.frame.toHtml()
    print(html)
    

     5. splinter :打开浏览器,模拟操作,python的

    http://splinter.cobrateam.info/docs/tutorial.html

    >>> from splinter import Browser
    >>> browser = Browser()
    >>> url = "http://www.cnblogs.com"
    >>> browser.visit(url)
    

    具体用哪个要看你有什么具体的需求了

  • 相关阅读:
    国产html编辑器
    豆瓣90mhz电台还行
    购物搜索
    linode vps 优化集锦
    这个系统,我看行,
    代理网站
    合理设置域名TTL值给网站加速
    dns切换到dnspod
    elixir usage
    JSSH介绍
  • 原文地址:https://www.cnblogs.com/maseng/p/3578553.html
Copyright © 2011-2022 走看看