  • Scraping proxy servers with multi-threaded Python

    Reposted from: https://blog.linuxeye.com/410.html

    Proxy source: http://www.proxy.com.ru
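
    The script below starts one thread per listing page on www.proxy.com.ru, extracts (ip, port, location) entries with a regular expression, checks each proxy in parallel by fetching Baidu's homepage through it, and finally stores the responsive proxies in a MySQL table. (Python 2: urllib2 and MySQLdb; the latter comes from the MySQL-python package.)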

    #coding: utf-8

    import urllib2
    import re
    import time
    import threading
    import MySQLdb

    rawProxyList = []
    checkedProxyList = []

    # Build the list of proxy listing pages to scrape
    targets = []
    for i in xrange(1, 23):
        target = r"http://www.proxy.com.ru/list_%d.html" % i
        targets.append(target)
        #print target + "\n"

    # Regex for extracting proxy entries from the listing pages
    p = re.compile(r'''<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>''')
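    # The five capture groups are assumed to match the listing's columns:
    # row number, IP, port, proxy type, and location, i.e. a row shaped like
    # <tr><b><td>1</td><td>1.2.3.4</td><td>8080</td><td>HTTP</td><td>...</td></b></tr>
    # (the location column is cp936/GBK-encoded Chinese text).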

    # Worker class: fetch and parse one listing page
    class ProxyGet(threading.Thread):
        def __init__(self, target):
            threading.Thread.__init__(self)
            self.target = target

        def getProxy(self):
            req = urllib2.Request(self.target)
            response = urllib2.urlopen(req)
            result = response.read()
            matches = p.findall(result)
            for row in matches:
                ip = row[1]
                port = row[2]
                addr = row[4].decode("cp936").encode("utf-8")
                proxy = [ip, port, addr]
                rawProxyList.append(proxy)

        def run(self):
            self.getProxy()
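
    # Note: list.append is atomic under CPython's GIL, so the worker
    # threads can share rawProxyList without an explicit lock.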

    # Worker class: check whether the proxies in one chunk actually work
    class ProxyCheck(threading.Thread):
        def __init__(self, proxyList):
            threading.Thread.__init__(self)
            self.proxyList = proxyList
            self.timeout = 5
            self.testUrl = "http://www.baidu.com/"
            # "030173" is the numeric part of Baidu's ICP licence shown in the
            # page footer; finding it confirms the proxy returned the real page
            self.testStr = "030173"

        def checkProxy(self):
            cookies = urllib2.HTTPCookieProcessor()
            for proxy in self.proxyList:
                proxyHandler = urllib2.ProxyHandler({"http": r'http://%s:%s' % (proxy[0], proxy[1])})
                opener = urllib2.build_opener(cookies, proxyHandler)
                opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
                t1 = time.time()

                try:
                    req = opener.open(self.testUrl, timeout=self.timeout)
                    result = req.read()
                    timeused = time.time() - t1
                    pos = result.find(self.testStr)

                    if pos >= 0:   # str.find returns -1 when the marker is absent
                        checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))
                        print "ok ip: %s %s %s %s" % (proxy[0], proxy[1], proxy[2], timeused)
                except Exception, e:
                    #print e.message
                    continue

        def run(self):
            self.checkProxy()

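    # Driver: scrape all pages in parallel, validate the results in
    # parallel, then persist the fastest proxies to MySQL.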
    if __name__ == "__main__":
        getThreads = []
        checkThreads = []

        # Start one scraping thread per target page
        for i in range(len(targets)):
            t = ProxyGet(targets[i])
            getThreads.append(t)

        for i in range(len(getThreads)):
            getThreads[i].start()

        for i in range(len(getThreads)):
            getThreads[i].join()

        print '.'*10 + "Scraped %s proxies in total" % len(rawProxyList) + '.'*10

        # Start 20 checker threads: split the scraped proxies into 20
        # roughly equal chunks, one chunk per thread
        chunk = (len(rawProxyList) + 19) / 20   # ceiling division (Python 2 integer division)
        for i in range(20):
            t = ProxyCheck(rawProxyList[chunk * i : chunk * (i + 1)])
            checkThreads.append(t)

        for i in range(len(checkThreads)):
            checkThreads[i].start()

        for i in range(len(checkThreads)):
            checkThreads[i].join()

        print '.'*10 + "%s proxies passed validation" % len(checkedProxyList) + '.'*10

        # Write the results to the database; four fields: ip, port, speed, address
        def db_insert(insert_list):
            try:
                conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118", db="ctdata", charset='utf8')
                cursor = conn.cursor()
                cursor.execute('delete from proxy')
                cursor.execute('alter table proxy AUTO_INCREMENT=1')
                cursor.executemany("INSERT INTO proxy(ip,port,speed,address) VALUES(%s, %s, %s, %s)", insert_list)
                conn.commit()
                cursor.close()
                conn.close()
            except MySQLdb.Error, e:
                print "MySQL Error %d: %s" % (e.args[0], e.args[1])

        # Sort the validated proxies by response time and persist the fast ones
        proxy_ok = []
        for proxy in sorted(checkedProxyList, key=lambda x: x[3]):
            if proxy[3] < 8:
                proxy_ok.append((proxy[0], proxy[1], proxy[3], proxy[2]))

        db_insert(proxy_ok)
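
    The script assumes the proxy table already exists in the ctdata database; nothing above creates it. Below is a minimal one-off setup sketch: the table and column names come from the INSERT statement, but the column types and sizes are assumptions, so adjust them (and the credentials) to your environment.

    # Hypothetical setup script: create the proxy table the scraper writes to.
    # Column types are assumptions inferred from the INSERT statement above.
    import MySQLdb

    conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="meimei1118",
                           db="ctdata", charset="utf8")
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS proxy (
            id      INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
            ip      VARCHAR(15)  NOT NULL,
            port    VARCHAR(5)   NOT NULL,
            speed   FLOAT        NOT NULL,
            address VARCHAR(128) NOT NULL
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    cursor.close()
    conn.close()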
  • Original post: https://www.cnblogs.com/nju2014/p/4614698.html