zoukankan      html  css  js  c++  java
  • Python爬取代理ip

     1 # -*- coding:utf-8 -*-
     2 #author : willowj
     3 import urllib
     4 import urllib2
     5 from bs4 import BeautifulSoup
     6 import re
     7 import bs4
     8 
     9 import sys
    10 
    11 
    12 reload(sys)  
    13 sys.setdefaultencoding('utf8') 
    14 
    15 
    16 def ip_test(ip,url= "https://www.baidu.com"):
    17     #test ip if can be used
    18     #url = "http://ip.chinaz.com/getip.aspx"# 默认测试网址
    19     ip1="http://"+ip
    20     try :
    21         res = urllib.urlopen(url,proxies={'http:':ip1}).read() #尝试代理访问
    22         print 'ok',ip1 #,res
    23         return True
    24     except Exception,e:
    25         print "failed"
    26         return False
    27     
    28 
    29 def get_iphtml_inyoudaili():
    30     url='http://www.youdaili.net'
    31     html=urllib2.urlopen(url) 
    32     code=html.read()
    33     #href="http://www.youdaili.net/Daili/http/26672.html" title="12月27号 最新代理http服务器ip地址"
    34     regexp='href="(.*?)" .*?最新代理http服务器ip地址'
    35     pat=re.compile(regexp)
    36     met=re.findall(pat,code)
    37     print met[0]
    38     #最新代理http服务器ip地址 html
    39     return met[0]
    40     
    41 
    42 def getIps(url):
    43     #getip from website, test,and  return,save aviable ips in 'ips.txt'
    44     htmlip=urllib2.urlopen(url) 
    45     codeip=htmlip.read()
    46 
    47     regexpip='([1-9][0-9]{0,2}.S*?)@HTTP#'  #IP样式
    48     pat_ip=re.compile(regexpip) 
    49 
    50     met_ip=re.findall(pat_ip,codeip)
    51 
    52     ips=[]
    53     file_open=open('ips.txt','w')
    54     for x in met_ip:
    55         print x
    56         if ip_test(x):
    57             ips.append(x)
    58             file_open.write(x+'
    ')
    59     file_open.close()
    60     #print ips,'youdaili'
    61     return ips
    62 
    63 
    64 def saveIps(list):
    65     file_open=open('ips.txt','w')
    66     for ip in list:
    67         file_open.write(ip+'
    ')
    68     file_open.close()
    69 
    70 
    71 def read_ips(file='ips.txt'):
    72     '''读取IP 以list返回'''
    73     file_open=open(file)
    74     lines=file_open.readlines()
    75     ips=[]
    76     for line in  lines:
    77         ip=line.strip("
    ")
    78         ips.append(ip)
    79     print ips
    80     return ips
    81  
    82  
    83 if __name__=="__main__":
    84     
    85     ips = getIps(get_iphtml_inyoudaili())
    86 
    87     saveIps(ips)
  • 相关阅读:
    栈:删除最外层的括号 (Leetcode 1021 / 155 / 1172 / 剑指31 / 面试 03.03)
    JDBC: Druid连接池
    JDBC: C3P0
    JDBC: C3P0
    JDBC: C3P0连接池
    JDBC: DBCP连接池
    JDBC: 数据库连接池
    JDBC: JDBC 控制事务
    JDBC: 预处理对象
    wpf 键盘快捷键响应
  • 原文地址:https://www.cnblogs.com/willowj/p/6246640.html
Copyright © 2011-2022 走看看