zoukankan      html  css  js  c++  java
  • 通过代理刷网页点击量

     
    #!/usr/bin/python
    #-*- coding:utf-8 -*-
    '''
    此脚本主要实现网页的点击量,除了实现此功能点外,还有三个知识点:
    1、随机获取代理ip,通过代理ip访问指定站点,其目的是防止ip被封
    2、访问一个页面后,随机休息几秒,再访问,其目的是防止网站前面有4-7层过滤设备拦截
    3、修改http的user agent字段,有些网站和4-7层设备会检查
    '''
     
    import urllib2,re,time,urllib,random,user_agents
    PROXYIPURL = 'http://www.goodips.com/?ip=&port=&dengji=&adr=%E7%94%B5%E4%BF%A1&checktime=&sleep=1%E7%A7%92%E5%86%85&cunhuo=48%E5%B0%8F%E6%97%B6%E4%BB%A5%E4%B8%8A&px='
    
    class getProxyIP:
        """Scrape proxy addresses from the PROXYIPURL listing page and format them."""

        # Captures a dotted IPv4 address on one line and, on the following line,
        # a 1-5 digit port that sits between '>' and '<'.
        # NOTE(review): the listing-page markup is not visible here; confirm the
        # IP and port really appear on consecutive lines.
        _IP_PORT_RE = re.compile(
            r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).+\n.+>(\d{1,5})<')

        def getProxyHtml(self):
            """Download the proxy listing page and return its raw HTML."""
            page = urllib.urlopen(PROXYIPURL)
            try:
                return page.read()
            finally:
                # Always release the connection, even if read() fails.
                page.close()

        def ipPortRe(self, html=None):
            """Extract (ip, port) string pairs from the listing page.

            Args:
                html: Markup to parse. When None (the default) the page is
                    downloaded with getProxyHtml().

            Returns:
                A list of (ip, port) tuples of strings; empty when nothing matches.
            """
            if html is None:
                html = self.getProxyHtml()
            return self._IP_PORT_RE.findall(html)

        def proxyIP(self, html=None):
            """Return the scraped proxies as urllib2.ProxyHandler-style mappings.

            Args:
                html: Optional markup to parse instead of downloading the page.

            Returns:
                A list such as
                [{'http': 'http://221.238.28.158:8081'},
                 {'http': 'http://183.62.62.188:9999'}].
            """
            return [{'http': 'http://%s' % ':'.join(pair)}
                    for pair in self.ipPortRe(html)]
     
    def getHtml(url, timeout=10):
        """Request ``url`` through a randomly chosen proxy with a random User-Agent.

        The proxy list is scraped again through ``getProxyIP`` on every call.
        The response body is read and then discarded; only the proxy that was
        used is returned.

        Args:
            url: Address of the page to request.
            timeout: Seconds to wait on the connection before giving up, so
                that an unresponsive proxy cannot block the caller forever.

        Returns:
            The proxy mapping that was used, e.g. {'http': 'http://host:port'}.

        Raises:
            IndexError: when no proxy was scraped (random.choice on an empty list).
            urllib2.URLError: when the request fails (urllib2.HTTPError included).
            socket.timeout: possibly, when the connection stalls past ``timeout``.
        """
        proxy_list = getProxyIP().proxyIP()
        proxy_ip = random.choice(proxy_list)  # pick one proxy at random
        print(proxy_ip)

        proxy_support = urllib2.ProxyHandler(proxy_ip)
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)

        request = urllib2.Request(url)
        # Pick one User-Agent at random and override the default header.
        user_agent = random.choice(user_agents.user_agents)
        request.add_header('User-Agent', user_agent)
        print(user_agent)

        response = urllib2.urlopen(request, timeout=timeout)
        try:
            response.read()
        finally:
            # Release the connection whether or not the read succeeded.
            response.close()

        # Printed a second time only after the request has succeeded.
        print(proxy_ip)
        return proxy_ip
     
     
     
    # Pages to request on every pass of the main loop.
    # NOTE(review): the host names look like masked placeholders ("xxx") from
    # the published listing; the stray quotes inside them were a syntax error.
    URLS = ['http://www.xxxxw.net/study.asp?vip=',
            'http://www.xxxxxx.com/?fromuid=16',
            ]
     
    # count: requests attempted; count_False: requests that raised.
    # count_True is never updated; successes are reported as count - count_False.
    count_True, count_False, count = 0, 0, 0
    while True:
        for url in URLS:
            count += 1
            try:
                getHtml(url)
            except Exception:
                # One handler covers every failure: urllib2.HTTPError is a
                # subclass of urllib2.URLError, and all branches only counted
                # the error. Catching Exception rather than using a bare
                # except lets KeyboardInterrupt stop this endless loop.
                count_False += 1
            # Pause for a random 1-3 seconds before the next request.
            randomTime = random.uniform(1, 3)
            time.sleep(randomTime)
            print('%d Eroors,%d ok,总数 %d' % (count_False, count - count_False, count))
     1 #!/usr/bin/python
     2 #-*- coding:utf-8 -*-
     3 '''
     4 Created on 2013-7-14
     5  
     6 @author: Administrator
     7 '''
     8  
     # User-Agent header values; one is picked at random for each request.
     user_agents = [
         'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
         'Opera/9.25 (Windows NT 5.1; U; en)',
         'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
         'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
         'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
         'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
     ]
    View Code
  • 相关阅读:
    [洛谷P1484] 种树
    Codeforces Round #505 Div. 1 + Div. 2
    [NOIp2015] 斗地主
    ☆ [NOIp2016] 天天爱跑步 「树上差分」
    [NOI2010] 超级钢琴
    [POI2000] 病毒
    [SCOI2010] 股票交易
    [NOI2002] 贪吃的九头龙
    [ZJOI2008] 骑士
    LeetCode 笔记系列 18 Maximal Rectangle [学以致用]
  • 原文地址:https://www.cnblogs.com/chenjingyi/p/5794712.html
Copyright © 2011-2022 走看看