zoukankan      html  css  js  c++  java
  • IP+IDC-chinaz抓取

    #-*-coding:gbk-*-
    #code by anyun.org
    import urllib
    import re
    import time
    
    
    def getHtml(url):
        """Fetch *url* and return its body with line breaks and space runs removed.

        Stripping line breaks matters for the extraction helpers in this file:
        their patterns use ``.``, which does not match a newline, so a tag whose
        content spans several lines would otherwise be missed.

        Raises whatever ``urllib.urlopen`` / ``read`` raise on network failure.
        """
        page = urllib.urlopen(url)
        try:
            html = page.read()
        finally:
            # Release the connection even when the read fails.
            page.close()
        # NOTE(review): in the copy this was taken from, the first replace()
        # literal was split across two physical lines (a syntax error), so the
        # original escape sequence is unknown. Reconstructed as removal of CR and
        # LF characters -- confirm against the original script.
        html = html.replace('\r', '').replace('\n', '')
        # NOTE(review): the three literals below are kept exactly as found; they
        # may originally have been tab characters or different space counts.
        html = html.replace('       ', ' ')
        html = html.replace('   ', '')
        html = html.replace('   ', '')
        return html
    
    
    def getcontext(html):
        """Return the inner text of every ``<span class="Whwtdhalf w15-0">`` in *html*.

        Matches are non-greedy and returned in document order; an empty list
        means no such span was found.
        """
        span_pattern = re.compile(r'<span class="Whwtdhalf w15-0">(.*?)</span>')
        return span_pattern.findall(html)
    
    def getadd(html):
        """Return the inner text of every ``<span class="Whwtdhalf w50-0">`` in *html*.

        Matches are non-greedy and returned in document order; an empty list
        means no such span was found.
        """
        span_pattern = re.compile(r'<span class="Whwtdhalf w50-0">(.*?)</span>')
        return span_pattern.findall(html)
    
    def geterr(html):
        """Return the inner text of every ``<div class="col-red lh30 fz14 jspu">`` in *html*.

        The caller in this file treats a non-empty result as a failed lookup.
        """
        div_pattern = re.compile(r'<div class="col-red lh30 fz14 jspu">(.*?)</div>')
        return div_pattern.findall(html)
    
    if __name__ == '__main__':
        # Look up every host listed in list.txt (one per line) on ip.chinaz.com.
        # Successful records are echoed and appended to ok.txt; hosts whose page
        # carries an error banner, or lacks the expected fields, go to err.txt.
        with open('list.txt', 'r') as targets:
            for raw_line in targets:
                host = raw_line.strip()
                if not host:
                    # A blank line would only produce a pointless request.
                    continue

                url = 'http://ip.chinaz.com/?ip=http://' + host
                try:
                    # The fetch is the step that can actually fail, so it is the
                    # one guarded here (urllib reports network errors as IOError).
                    page = getHtml(url)
                except IOError:
                    print('error')
                    time.sleep(0.5)
                    continue

                # Parse once per page instead of re-running the regex per field.
                spans = getcontext(page)
                places = getadd(page)

                # Require every field the record indexes, so a changed page layout
                # is logged as a failure instead of raising IndexError.
                if not geterr(page) and len(spans) >= 6 and len(places) >= 2:
                    # Interleaved order (0,3,1,4,2,5) is kept from the original.
                    record = ' '.join([
                        spans[0], spans[3],
                        spans[1], spans[4],
                        spans[2], spans[5],
                        places[0], places[1],
                    ])
                    print(record)
                    with open('ok.txt', 'a') as ok_log:
                        ok_log.write(record + '\n')
                else:
                    # The literal means "parse failed"; kept as-is because it is
                    # part of the emitted output.
                    record = host + ' ' + '解析失败'
                    print(record)
                    with open('err.txt', 'a') as err_log:
                        err_log.write(record + '\n')

                # Throttle requests to the remote service.
                time.sleep(0.5)
        print('over')
    

      

  • 相关阅读:
    Python可视化库Matplotlib绘图基础学习
    字典特征和文本特征数据抽取
    ipc_11_快乐 happy
    关于Scanf的返回值问题
    [转]网站性能测试总结
    C语言运算符优先级
    成员运算符(·)和指向结构体成员运算符(->)的区别
    c++抛出异常与栈展开(stack unwinding)
    What is a Delegate?
    1.2 Variables and Arithmetic Expressions
  • 原文地址:https://www.cnblogs.com/crac/p/5778741.html
Copyright © 2011-2022 走看看