zoukankan      html  css  js  c++  java
  • 使用正则表达式,取得点击次数,函数抽离

    import requests
    from bs4 import BeautifulSoup
    import string
    import time
    import datetime
    import re
    
    
    
    
    #获取文章详情
    def _extract_news_id(news_url):
        """Return the news id embedded in *news_url*, or '' when none is found.

        The id is the path segment between ``_<digits>/`` and the next dot.
        """
        match = re.search(r'_\d+/(.*?)\.', news_url)
        return match.group(1) if match else ''


    def _get_click_count(news_url):
        """Fetch the click counter for the article at *news_url* as a string.

        Returns '' without issuing a request when no news id can be extracted
        from the URL. Relies on the module-level ``head`` request headers.
        """
        news_id = _extract_news_id(news_url)
        if not news_id:
            return ''

        # Build the click-count API URL for this article.
        count_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + news_id + '&modelid=80'
        response = requests.get(count_url, headers=head)

        # Keep what follows the last ".html" and trim the surrounding
        # punctuation -- presumably the payload ends with .html('<count>');
        # (TODO confirm against the live API).
        return response.text.split('.html')[-1].lstrip("(')").rstrip("');")


    def _extract_field(info_text, label):
        """Return the token that directly follows *label* in *info_text*.

        Returns '' when the label is absent, so a missing field never reuses
        the value of a previously extracted field.
        """
        start = info_text.find(label)
        if start < 0:
            return ''
        token = info_text[start:].split()[0]
        # Slice the label off as a prefix; str.lstrip(label) would strip a
        # *set of characters* and could eat the beginning of the value.
        return token[len(label):]


    def _parse_publish_time(info_text):
        """Return the first 'YYYY-MM-DD HH:MM:SS' stamp in *info_text* as a datetime.

        Returns None when the text contains no such timestamp.
        """
        match = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info_text)
        if match is None:
            return None
        return datetime.datetime.strptime(match.group(0), '%Y-%m-%d %H:%M:%S')


    def getNewDetail(newsrrl):
        """Print the details of every news item found in a listing page.

        For each ``<li>`` that contains a ``.news-list-title`` element this
        prints the title, time, source, link and click count taken from the
        listing, then downloads the article page and prints its ``.show-info``
        metadata (publish time, author, reviewer, source, photographer, clicks)
        followed by the text of the ``#content`` element.

        Args:
            newsrrl: response object of the listing page; its ``.text``
                attribute is parsed as HTML.

        Relies on the module-level ``head`` request headers being defined
        before the function is called.
        """
        listing = BeautifulSoup(newsrrl.text, 'html.parser')
        for item in listing.select('li'):
            titles = item.select(".news-list-title")
            if not titles:
                continue

            info = item.select(".news-list-info")[0]
            a = titles[0].text
            b = info.contents[0].text
            c = info.contents[1].text
            d = item.select("a")[0].attrs['href']

            hist = _get_click_count(d)

            print("标题:"+a+'\n'+"时间:"+b+'\n'+"来源:"+c+'\n'+"链接:"+d+'\n'+"点击:"+hist+'\n\n')
            print()

            # Download and parse the article page itself.
            rlink = requests.get(d, headers=head)
            rlink.encoding = 'utf-8'
            article = BeautifulSoup(rlink.text, 'html.parser')
            e = article.select(".show-info")[0].text

            # Echo every info token except the last one, which is replaced by
            # the click count fetched from the counter API.
            for token in e.split()[:-1]:
                print(token, end=' ')
            print("点击:"+hist+"")
            print()
            print()

            # Publish time converted to a datetime object.
            dt = _parse_publish_time(e)
            print("datetime类型时间:", end=' ')
            print(dt)
            print()

            # Author, reviewer, source and photographer, in that order.
            for label in ('作者:', '审核:', '来源:', '摄影:'):
                print(label, end=' ')
                print(_extract_field(e, label))
                print()

            # Click count: the label is printed only when the page shows one.
            if '点击:' in e:
                print("点击:", end=' ')
            print(hist)

            for _ in range(5):
                print()

            print()
            # Article body.
            print(article.select("#content")[0].text)
            print()
            print()
            print()
    
    # Crawler disguise: send a desktop-browser User-Agent with every request.
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }

    # Fetch the campus-news listing page, decode it as UTF-8, keep a parsed
    # copy in `soup` and print the details of every listed article.
    r = requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/", headers=head)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    getNewDetail(r)
    
    
    
    # Telephone numbers: a 3-4 digit area code, a dash, then 6-8 digits.
    # The digit classes need their backslashes (\d); a bare "d" would only
    # match the letter "d". Each hit is an (area_code, number) tuple.
    telephone=re.findall(r'(\d{3,4})-(\d{6,8})',soup.text,re.S)
    print(telephone)
    print()
    
    # E-mail: check a sample address against an anchored pattern of
    # local-part @ domain followed by up to two ".xx"/".xxx" suffixes.
    # The dot and the word class must be escaped (\. and \w); otherwise the
    # suffix group can never match ".com" and the result is always empty.
    email='308800902@qq.com'
    eroll=r'^([0-9a-zA-Z_]{0,19}@[0-9a-zA-Z_]{0,19}(?:\.\w{2,3}){0,2})$'
    efinadll=re.findall(eroll,email)
    print(efinadll)
    print()
    
    # English word splitting: break the text on runs of whitespace and
    # punctuation. \s is required here -- a bare "s" would split on the letter.
    estr='''Personal information such as names, birthdays, nicknames, pet's names, social security numbers, and the like 
    should never, ever, ever be used because these are way too obvious and too easy to crack. The more you avoid using 
    things like this as your passwords, the more secure your login areas will be.'''
    words=re.split(r"[\s,.?!]+",estr)
    print(words)

  • 相关阅读:
    C#实战Microsoft Messaging Queue(MSMQ)消息队列(干货)
    实现动态的XML文件读写操作(依然带干货)
    多线程下访问控件的方式(您一定会用到,附源码啦!)
    Microsoft.VisualBasic.dll的妙用(开发中肯定会用到哦)
    vue使用element-ui的el-input监听不了键盘事件解决
    vue强制刷新组件
    asp.net微信公众平台本地调试设置
    武大女硕士面试被拒,改简历冒充本科生找工作的感想(原创)
    完整的站内搜索Demo(Lucene.Net+盘古分词)
    ASP.NET多线程下使用HttpContext.Current为null解决方案
  • 原文地址:https://www.cnblogs.com/wban48/p/8782518.html
Copyright © 2011-2022 走看看