  • Proxy pool crawler source-code test - Python3WebSpider

    Using attributes generated by a metaclass

    Source:
    https://github.com/Python3WebSpider/ProxyPool/blob/master/proxypool/crawler.py

    This note is mainly about the metaclass usage:
    The metaclass inspects the attributes of the crawler class it creates and collects the crawl methods — every method whose name starts with the same prefix — into a list attribute, so they can be called one after another. The point is that scraping a new site only requires adding another crawl method; no other part of the class has to change. A stripped-down sketch of the pattern follows, then the actual excerpt.
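
    To see the pattern in isolation, here is a toy sketch (my own minimal example, not from the repo): a metaclass that registers every method whose name starts with a given prefix as a list attribute on the class.

    class RegistryMeta(type):
        def __new__(cls, name, bases, attrs):
            # Collect the names of all methods starting with 'handle_' into
            # a list attribute on the class being created.
            attrs['__Handlers__'] = [k for k in attrs if k.startswith('handle_')]
            return type.__new__(cls, name, bases, attrs)


    class Service(metaclass=RegistryMeta):
        def handle_a(self):
            return 'a'

        def handle_b(self):
            return 'b'


    print(Service.__Handlers__)  # ['handle_a', 'handle_b']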

    Code excerpt:

    
    class ProxyMetaclass(type):
        def __new__(cls, name, bases, attrs):
            count = 0
            attrs['__CrawlFunc__'] = []
            for k, v in attrs.items():
                # register every attribute whose name contains 'crawl_',
                # i.e. the crawl methods defined in the class body
                if 'crawl_' in k:
                    attrs['__CrawlFunc__'].append(k)
                    count += 1
            attrs['__CrawlFuncCount__'] = count
            return type.__new__(cls, name, bases, attrs)
    
    
    class Crawler(object, metaclass=ProxyMetaclass):
        def get_proxies(self, callback):
            proxies = []
            # eval turns the collected method name into the call self.crawl_xxx()
            for proxy in eval("self.{}()".format(callback)):
                print('Got proxy', proxy)
                proxies.append(proxy)
            return proxies
           
        def crawl_daili66(self, page_count=4):
            """
            Fetch proxies from daili66.
            :param page_count: number of pages
            :return: proxy
            """
            start_url = 'http://www.66ip.cn/{}.html'
            urls = [start_url.format(page) for page in range(1, page_count + 1)]
            for url in urls:
                print('Crawling', url)
                # get_page is the HTTP helper imported from proxypool.utils in the repo
                html = get_page(url)
                if html:
                    doc = pq(html)
                    trs = doc('.containerbox table tr:gt(0)').items()
                    for tr in trs:
                        ip = tr.find('td:nth-child(1)').text()
                        port = tr.find('td:nth-child(2)').text()
                        yield ':'.join([ip, port])
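
    With just this excerpt, the attributes generated by the metaclass can already be inspected (a quick sanity check, assuming only the two classes above are defined):

    print(Crawler.__CrawlFuncCount__)  # 1 -- only crawl_daili66 in this excerpt
    print(Crawler.__CrawlFunc__)       # ['crawl_daili66']
    # calling a registered crawler by name (needs get_page to be importable):
    # proxies = Crawler().get_proxies('crawl_daili66')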
    
    

    Test script

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time    : 12/19/19 4:10 PM
    # @Author  : yon
    # @Email   : @qq.com
    # @File    : test
    
    
    import json
    import re
    from pyquery import PyQuery as pq
    # Note: get_page (the HTTP helper, from proxypool.utils in the original repo)
    # is not imported here; the script still runs because the code that would
    # call it inside Getter.run() is commented out below.
    
    
    class ProxyMetaclass(type):
        def __new__(cls, name, bases, attrs):
            count = 0
            attrs['__CrawlFunc__'] = []
            for k, v in attrs.items():
                print("打印k")
                print(k)
                print("打印v")
                print(v)
                if 'crawl_' in k:
                    attrs['__CrawlFunc__'].append(k)
                    count += 1
            attrs['__CrawlFuncCount__'] = count
            return type.__new__(cls, name, bases, attrs)
    
    
    class Crawler(object, metaclass=ProxyMetaclass):
        def get_proxies(self, callback):
            proxies = []
            for proxy in eval("self.{}()".format(callback)):
                print('Got proxy', proxy)
                proxies.append(proxy)
            return proxies
    
        def crawl_daili66(self, page_count=4):
            """
            获取代理66
            :param page_count: 页码
            :return: 代理
            """
            start_url = 'http://www.66ip.cn/{}.html'
            urls = [start_url.format(page) for page in range(1, page_count + 1)]
            for url in urls:
                print('Crawling', url)
                html = get_page(url)
                if html:
                    doc = pq(html)
                    trs = doc('.containerbox table tr:gt(0)').items()
                    for tr in trs:
                        ip = tr.find('td:nth-child(1)').text()
                        port = tr.find('td:nth-child(2)').text()
                        yield ':'.join([ip, port])
    
        def crawl_ip3366(self):
            # NB: crawl_ip3366 is defined twice in this class body; the class
            # namespace is a dict, so this definition is overwritten by the
            # second one below and shows up only once in __CrawlFunc__.
            for page in range(1, 4):
                start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
                html = get_page(start_url)
                # \s* matches whitespace, so the pattern spans line breaks
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')
    
        def crawl_kuaidaili(self):
            for i in range(1, 4):
                start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
                html = get_page(start_url)
                if html:
                    ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                    re_ip_address = ip_address.findall(html)
                    port = re.compile('<td data-title="PORT">(.*?)</td>')
                    re_port = port.findall(html)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')
    
        def crawl_xicidaili(self):
            for i in range(1, 3):
                start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                    'Host': 'www.xicidaili.com',
                    'Referer': 'http://www.xicidaili.com/nn/3',
                    'Upgrade-Insecure-Requests': '1',
                }
                html = get_page(start_url, options=headers)
                if html:
                    find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                    trs = find_trs.findall(html)
                    for tr in trs:
                        find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                        re_ip_address = find_ip.findall(tr)
                        find_port = re.compile(r'<td>(\d+)</td>')
                        re_port = find_port.findall(tr)
                        for address, port in zip(re_ip_address, re_port):
                            address_port = address + ':' + port
                            yield address_port.replace(' ', '')
    
        def crawl_ip3366(self):
            for i in range(1, 4):
                start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
                html = get_page(start_url)
                if html:
                    find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                    trs = find_tr.findall(html)
                    for s in range(1, len(trs)):
                        find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                        re_ip_address = find_ip.findall(trs[s])
                        find_port = re.compile(r'<td>(\d+)</td>')
                        re_port = find_port.findall(trs[s])
                        for address, port in zip(re_ip_address, re_port):
                            address_port = address + ':' + port
                            yield address_port.replace(' ', '')
    
        def crawl_iphai(self):
            start_url = 'http://www.iphai.com/'
            html = get_page(start_url)
            if html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(html)
                for s in range(1, len(trs)):
                    find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')
    
        def crawl_data5u(self):
            start_url = 'http://www.data5u.com/free/gngn/index.shtml'
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
                'Host': 'www.data5u.com',
                'Referer': 'http://www.data5u.com/free/index.shtml',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            }
            html = get_page(start_url, options=headers)
            if html:
                ip_address = re.compile(r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')
    
    
    class Getter():
        def __init__(self):
            self.crawler = Crawler()
    
        def run(self):
            print('Getter starting')
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                print(callback_label)
                callback = self.crawler.__CrawlFunc__[callback_label]
                print(callback)
                # # fetch the proxies
                # proxies = self.crawler.get_proxies(callback)
                # sys.stdout.flush()
                # for proxy in proxies:
                #     self.redis.add(proxy)
    
    
    if __name__ == '__main__':
        get = Getter()
        get.run()
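
    One note on the dispatch in get_proxies: the repo builds the call string for eval. A getattr-based variant (my own sketch, not the original code) does the same lookup without eval and could drop in as a replacement:

        def get_proxies(self, callback):
            proxies = []
            # Look up the bound method by name instead of eval-ing a string.
            for proxy in getattr(self, callback)():
                print('Got proxy', proxy)
                proxies.append(proxy)
            return proxies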
    
    

    Test output

    /home/baixiaoxu/PycharmProjects/pytthon-tt/venv/bin/python /home/baixiaoxu/PycharmProjects/pytthon-tt/proxypool/test.py
    printing k
    __module__
    printing v
    __main__
    printing k
    __qualname__
    printing v
    Crawler
    printing k
    get_proxies
    printing v
    <function Crawler.get_proxies at 0x7f905ca5a598>
    printing k
    crawl_daili66
    printing v
    <function Crawler.crawl_daili66 at 0x7f905ca5a620>
    printing k
    crawl_ip3366
    printing v
    <function Crawler.crawl_ip3366 at 0x7f905ca5a840>
    printing k
    crawl_kuaidaili
    printing v
    <function Crawler.crawl_kuaidaili at 0x7f905ca5a730>
    printing k
    crawl_xicidaili
    printing v
    <function Crawler.crawl_xicidaili at 0x7f905ca5a7b8>
    printing k
    crawl_iphai
    printing v
    <function Crawler.crawl_iphai at 0x7f905ca5a6a8>
    printing k
    crawl_data5u
    printing v
    <function Crawler.crawl_data5u at 0x7f905ca5a8c8>
    printing k
    __CrawlFunc__
    printing v
    ['crawl_daili66', 'crawl_ip3366', 'crawl_kuaidaili', 'crawl_xicidaili', 'crawl_iphai', 'crawl_data5u']
    Getter starting
    0
    crawl_daili66
    1
    crawl_ip3366
    2
    crawl_kuaidaili
    3
    crawl_xicidaili
    4
    crawl_iphai
    5
    crawl_data5u
    
    Process finished with exit code 0
    
    
    
  • Original post: https://www.cnblogs.com/g2thend/p/12069968.html