zoukankan      html  css  js  c++  java
  • 爬取人人网新鲜事(Python 版本)

    View Code
    from sgmllib import SGMLParser
    import sys,urllib2,urllib,cookielib
    class spider(SGMLParser):
        def __init__(self,email,password):
            SGMLParser.__init__(self)
            self.h3=False
            self.h3_is_ready=False
            self.div=False
            self.h3_and_div=False
            self.a=False
            self.depth=0
            self.names=""
            self.dic={}   
             
            self.email=email
            self.password=password
            self.domain='renren.com'
            try:
                cookie=cookielib.CookieJar()
                cookieProc=urllib2.HTTPCookieProcessor(cookie)
            except:
                raise
            else:
                opener=urllib2.build_opener(cookieProc)
                urllib2.install_opener(opener)       
    
        def login(self):
            url='http://www.renren.com/PLogin.do'
            postdata={
                      'email':self.email,
                      'password':self.password,
                      'domain':self.domain  
                      }
            req=urllib2.Request(
                                url,
                                urllib.urlencode(postdata)            
                                )
            
            self.file=urllib2.urlopen(req).read()
            #print self.file
        def start_h3(self,attrs):
            self.h3 = True
        def end_h3(self):
            self.h3=False
            self.h3_is_ready=True
            
        def start_a(self,attrs):
            if self.h3 or self.div:
                self.a=True
        def end_a(self):
            self.a=False
            
        def start_div(self,attrs):
            if self.h3_is_ready == False:
                return
            if self.div==True:
                self.depth += 1
                
            for k,v in attrs:
                if k == 'class' and v == 'content':
                    self.div=True;
                    self.h3_and_div=True   #h3 and div is connected
        def end_div(self):
            if self.depth == 0:
                self.div=False
                self.h3_and_div=False
                self.h3_is_ready=False
                self.names=""
            if self.div == True:
                self.depth-=1
        def handle_data(self,text):
            #record the name
            if self.h3 and self.a:
                self.names+=text
            #record says
            if self.h3 and (self.a==False):
                if not text:pass
                else: self.dic.setdefault(self.names,[]).append(text)
                return 
            if self.h3_and_div:
                self.dic.setdefault(self.names,[]).append(text)
                
        def show(self):
            type = sys.getfilesystemencoding()
            for key in self.dic:
                print ( (''.join(key)).replace(' ','')).decode('utf-8').encode(type), \
                      ( (''.join(self.dic[key])).replace(' ','')).decode('utf-8').encode(type)
    
    
    
    
    import os

    # Credentials are read from the environment so that no account secrets
    # are stored in the source file.
    _email = os.environ.get('RENREN_EMAIL')
    _password = os.environ.get('RENREN_PASSWORD')
    if not _email or not _password:
        raise SystemExit('Set RENREN_EMAIL and RENREN_PASSWORD before running this script.')

    renrenspider = spider(_email, _password)
    renrenspider.login()
    renrenspider.feed(renrenspider.file)
    # close() makes the parser process any data it is still buffering.
    renrenspider.close()
    renrenspider.show()
    
     
  • 相关阅读:
    JS高级-虚拟DOM
    JS高级-异步
    tomcat server.xml中文版
    java中的等于
    eclipse version
    angularjs中父,子,兄之间controller值的传递
    《那一天,那一月,那一年,那一世》-------仓央嘉措
    用jsonp格式的数据进行ajax post请求变成get
    git常用指令
    让div支持placeholder属性/模拟输入框的placeholder属性
  • 原文地址:https://www.cnblogs.com/buptmemory/p/2849456.html
Copyright © 2011-2022 走看看