zoukankan      html  css  js  c++  java
  • python提取百度经验<标题,发布时间,平均流量,总流量,具体的链接>

    之前想研究下怎么抓网页数据.然后就有了下面的练习了.

    如有BUG.也纯属正常. 只是练习.请勿投入产品使用.

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    #Filename: tqjinyan.py
    
    import os,sys,time,urllib2,re
    import cookielib
    import multiprocessing
    import datetime,time
    
    #Module-level dict holding the complete scraped record for every article.
    #key:   article short-link id (extracted from the article URL)
    #value: [short link path, publish time, title, average daily views,
    #        update time, total views, full URL]
    userJinYanAll={}
    
    #Scan the user's experience-list page for "&pn=N" pagination offsets
    #and return the largest one (the offset of the last page).
    #FIX: the original pattern r'&pn=(d{1,10})"' matched the literal letter
    #'d' rather than digits, so no offsets were ever captured; corrected
    #to \d. Also replaced map(...).sort() with an equivalent max() so the
    #function works under both Python 2 and 3.
    def tiquNumber(url):
        regex = re.compile(r'&pn=(\d{1,10})"')
        web = urllib2.urlopen(url).read()
        num = regex.findall(web)
        #no pagination links at all: treat the list as a single page
        if not num:
            return 1
        return max(int(x) for x in num)
    
    #Build the list of paginated URLs for a user's experience list.
    #Baidu Jingyan lists 7 entries per page, so the "&pn=" offset steps
    #by 7 from 0 up to and including num.
    def retJinYanYe(url,num):
        offsets = range(0, num + 1, 7)
        return [url + "&pn=" + str(off) for off in offsets]
    
    #For each paging URL in jylist: download the page, extract every
    #article's link, publish time and title, and store them in the
    #module-level userJinYanAll dict keyed by the article's short id.
    #FIX: the progress statement that was garbled across two lines in the
    #original (leaving a bare, non-comment continuation line — a syntax
    #error) is restored as a single commented statement, and the
    #loop-invariant regexes are now compiled once, outside the page loop.
    def retNumTitle(jylist):
        #regexes for link, publish time, title, and the short article id
        #NOTE(review): the '.' before 'html' in regex_keyword is an
        #unescaped wildcard; kept byte-identical to preserve existing keys.
        regex_href = re.compile(r'<p class="tit"><a href="(.{1,200})" title="')
        regex_time = re.compile('<span class="exp-time">(.{1,12})</span>')
        regex_title = re.compile('" title="(.{1,80})" target="_blank">')
        regex_keyword = re.compile('e/(.{1,50}).html')
        numjisu = 0
        for url in jylist:
            numjisu += 1
            web = urllib2.urlopen(url).read()
            href = regex_href.findall(web)
            exp_time = regex_time.findall(web)
            title = regex_title.findall(web)
            #one [link, publish time, title] record per article found
            for i in range(0, len(title)):
                keyword = regex_keyword.findall(href[i])
                userJinYanAll[keyword[0]] = [href[i], exp_time[i], title[i]]
            # printstdout('正在获取第  %i  页的经验信息...' % numjisu)
    
    #Fetch one article page and append its stats to userJinYanAll[keyword]:
    #[avg daily views, update time, total views, full URL] after the
    #existing [link, publish time, title]; then print one summary row.
    #FIX: the views regex matched a literal 'd' ('(d{1,10})'); corrected
    #to \d so view counts are actually captured.
    def retLiuLanNum(keyword,url,i):
        loginUrl = 'http://jingyan.baidu.com' + url
        #empty POST body; relies on the cookies installed by getcookie()
        req = urllib2.Request(loginUrl, data="")
        f = urllib2.urlopen(req).read()
        regex_liulannum = re.compile(r'<span class="views">(\d{1,10})</span>')
        regex_updateTime = re.compile(r'<time>(.{1,20})</time>')
        viewsNum = regex_liulannum.findall(f)
        updateTime = regex_updateTime.findall(f)
        #average daily views; '-' when zero views or below one view/day
        if int(viewsNum[0]) != 0:
            jianGeDay = pingJunNum(keyword, updateTime[0], viewsNum[0])
            pjNum = int(viewsNum[0]) / int(jianGeDay) / 1.00
            if pjNum < 1:
                userJinYanAll[keyword].append('-')
            else:
                userJinYanAll[keyword].append(str(pjNum))
        else:
            userJinYanAll[keyword].append('-')
        userJinYanAll[keyword].append(updateTime[0])
        userJinYanAll[keyword].append(viewsNum[0])
        userJinYanAll[keyword].append(loginUrl)
        #index, publish time, total views, avg views, title, full link
        print(str(i) + " " + userJinYanAll[keyword][1] + " " + userJinYanAll[keyword][5]
              + " " + userJinYanAll[keyword][3] + " " + userJinYanAll[keyword][2]
              + " " + userJinYanAll[keyword][6])

    #Install a global cookie-aware opener so later urlopen calls reuse the
    #session cookies handed out by jingyan.baidu.com, and prime the jar
    #with one request. (The original issued the same request twice; the
    #redundant duplicate fetch was dropped.)
    def getcookie():
        loginUrl = 'http://jingyan.baidu.com/article/ed2a5d1f1938f909f7be174f.html'
        cj = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        urllib2.urlopen(loginUrl)

    #Drive the whole scrape for one user-list URL and return the filled
    #userJinYanAll dict (or the KeyboardInterrupt if the user aborts).
    def menu(url):
        try:
            numYe = tiquNumber(url)           #pagination upper bound
            jylist = retJinYanYe(url, numYe)  #one URL per page
            retNumTitle(jylist)               #links / times / titles
            getcookie()                       #session cookies
            i = 0
            for k, v in userJinYanAll.items():
                i += 1
                retLiuLanNum(k, v[0], i)      #per-article view stats
            return userJinYanAll
        except KeyboardInterrupt as e:
            return e

    #Write a progress message to stdout without a trailing newline.
    def printstdout(printname):
        sys.stdout.write(" %s" % printname)
        sys.stdout.flush()

    #Days elapsed since the article's last update, as a decimal string.
    #FIX: the original returned str(now - update).split(' ')[0], which for
    #spans under one day yields 'H:MM:SS' (crashing the int() conversion
    #in retLiuLanNum) and could yield '0' (dividing by zero there); it now
    #returns at least '1' whole day. (num is unused; kept for callers.)
    def pingJunNum(keyword, update, num):
        updatetime = datetime.datetime.strptime(update, '%Y-%m-%d %H:%M')
        chadate = datetime.datetime.now() - updatetime
        return str(max(chadate.days, 1))

    #Interactive entry point: prompt for a Baidu Jingyan user id and scrape it.
    def sys_input():
        url_baidu = 'http://jingyan.baidu.com/user/npublic/expList?un='
        raw_str = urllib2.quote(raw_input('请输入用户百度经验ID: '))
        menu(url_baidu + raw_str)

    #Non-interactive variant: fetch the raw list page for a given id.
    #FIX: the original called returntest() and discarded the result; the
    #page body is now returned to the caller.
    def sys_input_wap(baidujyid):
        url_baidu = 'http://jingyan.baidu.com/user/npublic/expList?un='
        return returntest(url_baidu + baidujyid)

    #Download a URL and return the raw page body.
    def returntest(url):
        return urllib2.urlopen(url).read()

    if __name__ == "__main__":
        sys_input()

    =======================================================

    ||                  好吧. 代码如上. 或许有BUG.并且没有进行重构.优化.                                 ||
    ||            已知问题.百度会有相关IP访问频率屏蔽.所以不适合商用.仅测试.                          ||

    =======================================================

    版权声明:本文为博主原创文章,未经博主允许不得转载。

  • 相关阅读:
    [07] Redis 持久化
    [06] Redis 事务
    [05] Jedis
    [04] Redis 配置文件
    [03] Redis 数据类型
    [02] Redis 简介&安装
    [01] NoSQL 简介
    06-NULL&typedef
    05-动态内存分配
    朴素贝叶斯分类器Naive Bayes
  • 原文地址:https://www.cnblogs.com/drgcaosheng/p/4650693.html
Copyright © 2011-2022 走看看