  • Python web crawler (13): crawling cnblogs (博客园) user information

    Note

    Only the code is posted here; the overall design and techniques are unchanged from the earlier posts in this series.

    Code notes

    Cookies are used to bypass the login, with Selenium driving Firefox; this requires installing geckodriver. The data is stored in SQLite, which also needs to be set up.
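
    The HtmlDownloader below reads the saved cookies from cookies.json, a JSON list of objects with at least 'domain', 'name' and 'value'. A minimal sketch of how that file could be produced, assuming you log in by hand once in the Selenium-driven Firefox window (the 60-second pause is an arbitrary assumption, not part of the original post):

    import json
    import time
    from selenium import webdriver

    driver = webdriver.Firefox()          # requires geckodriver on the PATH
    driver.get('https://www.cnblogs.com/')
    time.sleep(60)                        # log in manually in the opened browser window
    with open('cookies.json', 'w', encoding='utf-8') as f:
        # get_cookies() returns a list of dicts containing 'domain', 'name', 'value', ...
        json.dump(driver.get_cookies(), f, ensure_ascii=False)
    driver.quit()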

    Spider.py

    import HtmlDownloader
    import HtmlParser
    import DataOutput
    import UrlManager
    import re
    from selenium import webdriver
    class Spider(object):
        def __init__(self):
            self.downloader=HtmlDownloader.HtmlDownloader()
            self.parser=HtmlParser.HtmlParser()
            self.output=DataOutput.DataOutput()
            self.urlManager=UrlManager.UrlManager()
            self.driver=webdriver.Firefox()
         
        def crawl(self,root_url):
            content=self.downloader.download_root(root_url,self.driver)
            urls=self.parser.parser_url(content)
            self.urlManager.add_urls(urls)
            i=0
            while self.urlManager.new_urls_size()>0 and self.urlManager.old_urls_size()<2000:
                url=self.urlManager.get_new_url()
                i=i+1
                print(str(i)+':'+str(url))
                pattern=re.compile('/.*?/')
                user_name=re.findall(pattern,url)
                url='https://home.cnblogs.com'+user_name[1]
                
                content=self.downloader.download(self.driver,url)
                new_urls=self.parser.parser_url(content)
                self.urlManager.add_urls(new_urls)
                
                try:
                    content=self.parser.parser_data(self.driver)
                    self.output.store_data(content)
                except:
                    i=i-1
                    print('error: url may not exist: '+self.driver.current_url)
            self.output.output_end()
            self.urlManager.save_status()
            #self.driver.close()
            print('ed')
    if __name__=='__main__':
        spider=Spider()
        spider.crawl('https://www.cnblogs.com/')
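
    A quick trace of the URL rewriting done inside crawl(): the second match of '/.*?/' in a blog URL is the '/username/' segment, which is appended to home.cnblogs.com to reach the user's profile page. The sample URL below is only for illustration.

    import re

    url = 'https://www.cnblogs.com/bai2018/'
    segments = re.findall('/.*?/', url)     # ['//', '/bai2018/']
    print('https://home.cnblogs.com' + segments[1])
    # -> https://home.cnblogs.com/bai2018/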
        
        
        
    

    UrlManager.py

    import pickle
    import hashlib
    import re
    class UrlManager():
        def __init__(self):
            # load the pending URL set and the visited-md5 set saved by save_status()
            self.new_urls=self.load_process('new_urls')
            self.old_urls=self.load_process('old_urls')
            
        def load_process(self,file_name):
            print('loading .')
            try:
                with open(file_name,'rb') as f:
                    tmp=pickle.load(f)
                    return tmp
            except:
                print('file may not exist.will create it')
            new_set=set()
            self.save_process(file_name,new_set)
            return new_set
        
        def save_process(self,file_name,data):
            with open(file_name,'wb') as f:
                pickle.dump(data,f)
        
        def save_status(self):
            self.save_process('new_urls',self.new_urls)
            self.save_process('old_urls',self.old_urls)
        
        def add_urls(self,urls):
            for url in urls:
                m=hashlib.md5()
                m.update(url.encode('utf8'))
                url_md5=m.hexdigest()[8:-8]
                if url not in self.new_urls and url_md5 not in self.old_urls:
                    self.new_urls.add(url)
                
        def get_new_url(self):
            new_url=self.new_urls.pop()
            m=hashlib.md5()
            m.update(new_url.encode('utf8'))
            url_md5=m.hexdigest()[8:-8]
            self.old_urls.add(url_md5)
            return new_url
        
        def new_urls_size(self):
            return len(self.new_urls)
        
        def old_urls_size(self):
            return len(self.old_urls)
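
    A quick check of UrlManager's dedup behavior, assuming a fresh start with no saved state from a previous run (the sample URL is only for illustration). Note that old_urls stores only the middle 16 characters of each URL's MD5 digest, so the pickled files stay small.

    import UrlManager

    manager = UrlManager.UrlManager()
    manager.add_urls(['https://www.cnblogs.com/bai2018/'])
    url = manager.get_new_url()        # moves the URL's md5 slice into old_urls
    manager.add_urls([url])            # ignored: its md5 slice is already in old_urls
    print(manager.new_urls_size(), manager.old_urls_size())   # 0 1
    manager.save_status()              # persists both sets with pickle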
        
        
        
        
    

    HtmlParser.py

    import re
    import json
    class HtmlParser(object):
        def parser_url(self,content):
            # match user blog home pages such as https://www.cnblogs.com/bai2018/
            pattern=re.compile(r'https://www\.cnblogs\.com/[\w-]+/')
            all_urls=re.findall(pattern,content)
            all_urls=list(set(all_urls))
            return all_urls
        
        def parser_data(self,driver):
            dict={}
            user_id=driver.find_element_by_class_name('display_name').text
            all_message=driver.find_element_by_class_name('user_profile').text
            all_message=all_message.split('\n')
            all_message.insert(0,'用户ID:'+user_id+'\n')
            switch={'用户ID':'user_id',
                    '姓名':'name',
                    '性别':'sex',
                    '出生日期':'birth_day',
                    '家乡':'hometown',
                    '现居住地':'live_place',
                    '单位':'work_for',
                    '工作状况':'job_status',
                    '感兴趣的技术':'interest_technology',
                    '最近目标':'recent_goal',
                    '座右铭':'mark_words',
                    '自我介绍':'introduce',
                    '园龄':'blog_age',
                    '博客':'blog_address',
                    '婚姻':'marriage',
                    '职位':'position',
                    'QQ':'qq',
                    'Email':'email'
                }
            key=''
            value=''
            for each in all_message:
                try:
                    each=each.replace('\n','')
                    key=switch[each.split(':')[0]]
                    value=each.split(':')[1]
                    dict[key]=value
                except:
                    print('split error: '+each+' (appended to previous field)')
                    value=value+each
                    dict[key]=value
                    print(dict)
            return dict
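
    A quick check of parser_url against a small fragment of page source; the sample HTML is made up for illustration.

    import HtmlParser

    parser = HtmlParser.HtmlParser()
    sample = ('<a href="https://www.cnblogs.com/bai2018/">bai2018</a>'
              '<a href="https://www.cnblogs.com/bai2018/">duplicate link</a>')
    print(parser.parser_url(sample))
    # -> ['https://www.cnblogs.com/bai2018/']   (duplicates removed by the set())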
        
        
    

    HtmlDownloader.py

    import json
    class HtmlDownloader(object):
        def download_root(self,url,driver):
            driver.get(url)
            with open('cookies.json', 'r', encoding='utf-8') as f:
                listCookies = json.loads(f.read())
            for cookie in listCookies:
                driver.add_cookie({
                    'domain': cookie['domain'],  # the domain needs a leading dot, e.g. '.cnblogs.com'
                    'name': cookie['name'],
                    'value': cookie['value']
                })
            driver.refresh()
            return driver.page_source
        
        def download(self,driver,url):
            driver.get(url)
            return driver.page_source
    

    DataOutput.py

    import sqlite3
    class DataOutput(object):
        def __init__(self):
            self.cx=sqlite3.connect("cnblog.db")
            self.table_name='cnblog'
            self.create_table()
         
        def create_table(self):
            values='''
            id integer primary key autoincrement,
            user_id varchar(50) not null,
            name varchar(50),
            sex varchar(6),
            birth_day varchar(30),
            hometown varchar(50),
            live_place varchar(50),
            marriage varchar(20),
            position varchar(30),
            work_for varchar(50),
            job_status varchar(20),
            interest_technology varchar(200),
            recent_goal varchar(500),
            mark_words varchar(500),
            introduce varchar(500),
            blog_age varchar(30),
            blog_address varchar(100),
            qq varchar(15),
            email varchar(30)
            '''
            self.cx.execute('create table if not exists %s(%s)' %(self.table_name,values))
             
        def store_data(self,data):
            flag=0
            user_id=''
            for key,value in data.items():
                if flag==0:
                    cmd="insert into %s (%s) values ('%s')" %(self.table_name,key,value)
                    user_id=value
                    flag=1
                else:
                    cmd='update %s set %s="%s" where user_id="%s"' %(self.table_name,key,value,user_id)
                self.cx.execute(cmd)
            self.cx.commit()
             
        def output_end(self):
            self.cx.close()
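
    store_data expects the first key in data to be user_id (parser_data inserts it first): that field is INSERTed to create the row, then the remaining fields are written with UPDATEs. Because the SQL is built by string formatting, a profile value containing a quote character will break it. Below is a minimal alternative sketch (not the original code) that keeps the same insert/update flow but passes values through sqlite3 parameter placeholders; column names cannot be parameterized, so they must still come from the fixed switch mapping in HtmlParser.

    import sqlite3

    def store_data_safe(cx, table_name, data):
        # data is the dict returned by HtmlParser.parser_data(); its first key is user_id
        items = list(data.items())
        first_key, user_id = items[0]
        cx.execute('insert into %s (%s) values (?)' % (table_name, first_key), (user_id,))
        for key, value in items[1:]:
            cx.execute('update %s set %s=? where user_id=?' % (table_name, key), (value, user_id))
        cx.commit()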
            
            
    