zoukankan      html  css  js  c++  java
  • selenium&PhantomJS实战--获取代理数据

    获取快代理网站的数据

    注意: 运行前需要先安装 selenium 和 PhantomJS,并保证 phantomjs 可执行文件在 PATH 中。

    #!/usr/bin/env python
    # _*_ coding: utf-8 _*_
    # __author__ ='kong'
    # 导入模块
    from selenium import webdriver
    
    # 定义一个类用来存放代理数据
    class Item(object):
        """Plain record holding the values of one proxy-table row.

        Every field is declared at class level with a default of ``None``;
        instances overwrite the fields they need.
        """

        # Fields are listed in the same order as the table columns they mirror.
        ip = port = None
        anonymous = type = None
        support = local = speed = None
    
    # 主类
    class GetProx(object):
        """Scrape the kuaidaili proxy-list pages and save the rows to a file.

        Instantiating the class runs the whole pipeline: build the page URLs,
        scrape every page with a PhantomJS browser, then write the collected
        rows to ``proxy.txt`` in the current working directory.
        """

        # Item attribute names in table-column order (td[1] .. td[7]).
        _FIELDS = ("ip", "port", "anonymous", "type", "support", "local", "speed")

        def __init__(self):
            self.startUrl = "http://www.kuaidaili.com/proxylist/"
            self.urls = self.getUrls()
            self.proxList = self.getProxyList(self.urls)
            self.fileName = 'proxy.txt'
            self.saveFile(self.fileName, self.proxList)

        def getUrls(self):
            """Return the URLs of list pages 1 through 10 under ``startUrl``."""
            # range() (not xrange) so the method runs on Python 2 and Python 3.
            return [self.startUrl + str(page) for page in range(1, 11)]

        def getProxyList(self, urls):
            """Load every URL in a PhantomJS browser and collect its table rows.

            :param urls: iterable of page URLs to visit.
            :returns: list of ``Item`` objects, one per table row, whose fields
                are the UTF-8 encoded cell texts.
            """
            browser = webdriver.PhantomJS()
            proxyList = []
            try:
                # The implicit wait is a session-wide setting; set it once.
                browser.implicitly_wait(5)
                for url in urls:
                    browser.get(url)
                    for row in browser.find_elements_by_xpath("//tbody/tr"):
                        cells = row.find_elements_by_xpath("./td")
                        # Skip rows that do not carry all seven data cells.
                        if len(cells) < len(self._FIELDS):
                            continue
                        # A fresh Item per row: reusing one instance would make
                        # every list entry alias the values of the last row.
                        item = Item()
                        for name, cell in zip(self._FIELDS, cells):
                            setattr(item, name, cell.text.encode("utf8"))
                        proxyList.append(item)
            finally:
                # Always shut the browser process down, even if a page fails.
                browser.quit()
            return proxyList

        @staticmethod
        def _toBytes(value):
            """Return ``value`` as UTF-8 bytes, encoding it only if it is text."""
            return value if isinstance(value, bytes) else value.encode("utf8")

        def saveFile(self, fileName, proxyList):
            """Write one line per record to ``fileName``.

            Each field is followed by a tab and each record ends with a
            newline. The file is opened in binary mode so the encoded fields
            are written unchanged on both Python 2 and Python 3 (no platform
            newline translation is applied).

            :param fileName: path of the file to create or overwrite.
            :param proxyList: iterable of objects exposing the ``_FIELDS``
                attributes as UTF-8 bytes or text.
            """
            with open(fileName, 'wb') as fp:
                for each in proxyList:
                    fields = [self._toBytes(getattr(each, name))
                              for name in self._FIELDS]
                    fp.write(b"\t".join(fields) + b"\t\n")
    
    # Script entry point: constructing GetProx runs the whole job from __init__.
    if __name__ == "__main__":
        gp = GetProx()
    

      

  • 相关阅读:
    Java实现找出数组中重复次数最多的元素以及个数
    java经典小算法
    java将数组中的零放到末尾
    BP神经网络
    Centos配置Caffe详解
    JAVA面试题之实现字符串的倒序输出
    Android 发送短信与接收短信
    java 选择排序法
    java数组获取最值
    spring kafka consumer原理解析二
  • 原文地址:https://www.cnblogs.com/kongzhagen/p/6283143.html
Copyright © 2011-2022 走看看