  • Python: crawling the major free proxy-IP sites (metaclass encapsulation)

    import requests
    from pyquery import PyQuery as pq
    base_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    def get_page(url):
        headers = dict(base_headers)
        print('Getting', url)
        try:
            r = requests.get(url, headers=headers)
            print('Getting result', url, r.status_code)
            if r.status_code == 200:
                return r.text
        except requests.ConnectionError:
            print('Crawling Failed', url)
            return None
            
    
    
    # The Tao begets One: create the proxy-extracting metaclass
    class ProxyMetaclass(type):
        """
            元类,在FreeProxyGetter类中加入
            __CrawlName__,__CrawlFunc__和__CrawlFuncCount__
            三个参数,分别表示爬虫函数名,函数实例和爬虫函数的数量。
        """
        # __new__控制__init__的执行,所以在其执行之前
        # cls:代表要__init__的类,此参数在实例化时由Python解释器自动提供
        # name:类名
        # bases:代表继承父类的集合
        # attrs:类的方法及属性的集合
        def __new__(cls, name, bases, attrs):
            count = 0
            # add two keys to the attrs dict; each value starts as an empty list
            attrs['__CrawlFunc__'] = []
            attrs['__CrawlName__'] = []
            for k, v in attrs.items():
                if 'crawl_' in k:
                    attrs['__CrawlName__'].append(k)  # append each crawl function's name to attrs['__CrawlName__']
                    attrs['__CrawlFunc__'].append(v)  # append each crawl function object to attrs['__CrawlFunc__']
                    print(k, v)
                    # print(attrs['__CrawlName__'])
                    count += 1
            for k in attrs['__CrawlName__']:
                # remove the original crawl_* key/value pairs from the class dict
                attrs.pop(k)
            attrs['__CrawlFuncCount__'] = count
            print(attrs)
            return type.__new__(cls, name, bases, attrs)
    
    
    # One begets Two: create the proxy-getter class
    
    class ProxyGetter(object, metaclass=ProxyMetaclass):
        def get_raw_proxies(self, site):
            proxies = []
            print('Site', site)
            for func in self.__CrawlFunc__:
                if func.__name__ == site:
                    # the stored values are plain functions, not bound methods,
                    # so self must be passed explicitly
                    this_page_proxies = func(self)
                    for proxy in this_page_proxies:
                        print('Getting', proxy, 'from', site)
                        proxies.append(proxy)
            print(proxies)
            return proxies

        def crawl_daili66(self, page_count=4):
            start_url = 'http://www.66ip.cn/{}.html'
            # str.format fills in the page number, much like %s formatting
            urls = [start_url.format(page) for page in range(1, page_count + 1)]
            for url in urls:
                print('Crawling', url)
                html = get_page(url)
                if html:
                    doc = pq(html)
                    trs = doc('.containerbox table tr:gt(0)').items()
                    for tr in trs:
                        ip = tr.find('td:nth-child(1)').text()
                        port = tr.find('td:nth-child(2)').text()
                        yield ':'.join([ip, port])

        '''
        def crawl_proxy360(self):
            start_url = 'http://www.proxy360.cn/Region/China'
            print('Crawling', start_url)
            html = get_page(start_url)
            if html:
                doc = pq(html)
                lines = doc('div[name="list_proxy_ip"]').items()
                for line in lines:
                    ip = line.find('.tbBottomLine:nth-child(1)').text()
                    port = line.find('.tbBottomLine:nth-child(2)').text()
                    yield ':'.join([ip, port])
        '''

        def crawl_goubanjia(self):
            start_url = 'http://www.goubanjia.com/free/gngn/index.shtml'
            html = get_page(start_url)
            if html:
                doc = pq(html)
                tds = doc('td.ip').items()
                for td in tds:
                    td.find('p').remove()
                    yield td.text().replace(' ', '')


    if __name__ == '__main__':
        # Two begets Three: instantiate ProxyGetter
        crawler = ProxyGetter()
        print(crawler.__CrawlName__)
        # Three begets all things: dispatch every crawl function by name
        for site_label in range(crawler.__CrawlFuncCount__):
            site = crawler.__CrawlName__[site_label]  # site_label indexes the name list
            myProxies = crawler.get_raw_proxies(site)
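
    To see what the metaclass does in isolation, here is a minimal, self-contained sketch of the same collection trick. The DemoMeta and Demo names are made up purely for illustration and are not part of the script above:

    # A toy metaclass that collects 'crawl_'-prefixed methods, mirroring ProxyMetaclass
    class DemoMeta(type):
        def __new__(cls, name, bases, attrs):
            attrs['__CrawlName__'] = [k for k in attrs if k.startswith('crawl_')]
            attrs['__CrawlFunc__'] = [attrs[k] for k in attrs['__CrawlName__']]
            for k in attrs['__CrawlName__']:
                attrs.pop(k)  # the crawl_ methods vanish from the finished class
            attrs['__CrawlFuncCount__'] = len(attrs['__CrawlName__'])
            return type.__new__(cls, name, bases, attrs)

    class Demo(metaclass=DemoMeta):
        def crawl_example(self):
            yield '127.0.0.1:8080'

    d = Demo()
    print(d.__CrawlName__)              # ['crawl_example']
    print(d.__CrawlFuncCount__)         # 1
    print(hasattr(d, 'crawl_example'))  # False: the method was popped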

    Run output of the full script:

    D:\pythontest>python proxy_ip.py
    crawl_goubanjia <function ProxyGetter.crawl_goubanjia at 0x00000000035D2510>
    crawl_daili66 <function ProxyGetter.crawl_daili66 at 0x00000000035D2488>
    {'__qualname__': 'ProxyGetter', '__module__': '__main__', '__CrawlName__': ['cra
    wl_goubanjia', 'crawl_daili66'], '__CrawlFunc__': [<function ProxyGetter.crawl_g
    oubanjia at 0x00000000035D2510>, <function ProxyGetter.crawl_daili66 at 0x000000
    00035D2488>], 'get_raw_proxies': <function ProxyGetter.get_raw_proxies at 0x0000
    0000035D2400>, '__CrawlFuncCount__': 2}
    ['crawl_goubanjia', 'crawl_daili66']
    Site crawl_goubanjia
    Getting http://www.goubanjia.com/free/gngn/index.shtml
    Getting result http://www.goubanjia.com/free/gngn/index.shtml 403
    []
    Site crawl_daili66
    Crawling=== http://www.66ip.cn/1.html
    Getting http://www.66ip.cn/1.html
    Getting result http://www.66ip.cn/1.html 200
    Getting 123.163.97.198:9999 from crawl_daili66
    Getting 36.249.109.21:9999 from crawl_daili66
    Getting 163.204.245.52:9999 from crawl_daili66
    Getting 222.189.247.207:9999 from crawl_daili66
    Getting 87.250.218.12:44168 from crawl_daili66
    Getting 118.172.176.61:8080 from crawl_daili66
    Getting 134.119.214.206:1080 from crawl_daili66
    Getting 110.74.208.154:21776 from crawl_daili66
    Crawling=== http://www.66ip.cn/2.html
    Getting http://www.66ip.cn/2.html
    Getting result http://www.66ip.cn/2.html 200
    Getting 120.234.138.102:53779 from crawl_daili66
    Getting 110.86.136.127:9999 from crawl_daili66
    Getting 59.57.38.197:9999 from crawl_daili66
    Getting 202.62.86.94:83 from crawl_daili66
    Getting 210.22.176.146:37299 from crawl_daili66
    Getting 180.183.136.212:8080 from crawl_daili66
    Getting 183.87.153.98:49602 from crawl_daili66
    Getting 222.124.2.186:8080 from crawl_daili66
    Getting 123.169.126.9:3 from crawl_daili66
    Getting 123.169.126.93:9999 from crawl_daili66
    Getting 158.255.249.58:50100 from crawl_daili66
    Getting 1.198.72.242:9999 from crawl_daili66
    Crawling=== http://www.66ip.cn/3.html
    Getting http://www.66ip.cn/3.html
    Getting result http://www.66ip.cn/3.html 200
    Getting 163.204.246.10:2 from crawl_daili66
    Getting 186.159.112.6:53281 from crawl_daili66
    Getting 163.204.246.102:9999 from crawl_daili66
    Getting 88.87.72.72:8080 from crawl_daili66
    Getting 193.169.118.6:53281 from crawl_daili66
    Getting 196.216.220.204:36739 from crawl_daili66
    Getting 185.109.62.124:808 from crawl_daili66
    Getting 1.193.246.78:9999 from crawl_daili66
    Getting 188.131.239.119:8118 from crawl_daili66
    Getting 1.10.188.93:34871 from crawl_daili66
    Getting 182.116.237.203:9999 from crawl_daili66
    Getting 139.99.223.230:8080 from crawl_daili66
    Crawling=== http://www.66ip.cn/4.html
    Getting http://www.66ip.cn/4.html
    Getting result http://www.66ip.cn/4.html 200
    Getting 163.204.246.232:9999 from crawl_daili66
    Getting 117.28.96.105:9999 from crawl_daili66
    Getting 202.29.220.34:38961 from crawl_daili66
    Getting 123.169.114.80:9999 from crawl_daili66
    Getting 115.42.34.3:8080 from crawl_daili66
    Getting 41.84.131.78:53281 from crawl_daili66
    Getting 123.163.96.207:9999 from crawl_daili66
    Getting 182.35.83.12:9999 from crawl_daili66
    Getting 191.241.226.230:53281 from crawl_daili66
    Getting 202.138.236.35:56413 from crawl_daili66
    Getting 194.1.193.226:35646 from crawl_daili66
    Getting 202.158.77.122:47284 from crawl_daili66

    ['123.163.97.198:9999', '36.249.109.21:9999', '163.204.245.52:9999', '222.189.247.207:9999', '87.250.218.12:44168',
    '118.172.176.61:8080', '134.119.214.206:1080', '110.74.208.154:21776', '120.234.138.102:53779', '110.86.136.127:9999',
    '59.57.38.197:9999', '202.62.86.94:83', '210.22.176.146:37299', '180.183.136.212:8080', '183.87.153.98:49602',
    '222.124.2.186:8080', '123.169.126.9:3', '123.169.126.93:9999', '158.255.249.58:50100', '1.198.72.242:9999',
    '163.204.246.10:2', '186.159.112.6:53281', '163.204.246.102:9999', '88.87.72.72:8080', '193.169.118.6:53281',
    '185.109.62.124:808', '1.193.246.78:9999', '188.131.239.119:8118', '1.10.188.93:34871', '182.116.237.203:9999',
    '139.99.223.230:8080', '163.204.246.232:9999', '117.28.96.105:9999', '202.29.220.34:38961', '123.169.114.80:9999',
    '115.42.34.3:8080', '41.84.131.78:53281', '123.163.96.207:9999', '182.35.83.12:9999', '191.241.226.230:53281',
    '202.138.236.35:56413', '194.1.193.226:35646','196.216.220.204:36739', '202.158.77.122:47284']

    
    
    It looks like only one of the proxy sites can still be crawled for data.
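
    Free proxies like these go stale quickly, and as the 403 above shows, the source sites themselves come and go, so anything collected into myProxies is worth a liveness check before use. Below is a minimal sketch, assuming http://httpbin.org/ip as a test URL and a 5-second timeout; the check_proxy helper is an illustration, not part of the original script:

    import requests

    def check_proxy(proxy, timeout=5):
        """Return True if the proxy can fetch a test URL within the timeout."""
        proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
        try:
            r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=timeout)
            return r.status_code == 200
        except requests.RequestException:
            return False

    # usage sketch: keep only the proxies that still respond
    # live = [p for p in myProxies if check_proxy(p)]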
  • Original post: https://www.cnblogs.com/linyouyi/p/11460034.html