zoukankan      html  css  js  c++  java
  • 使用正则表达式,取得点击次数,函数抽离

    import requests
    from bs4 import BeautifulSoup
    import string
    import time
    import datetime
    import re
    
    
    
    
    # Fetch article details
    def getNewDetail(newsrrl):
        """Print summary and detail information for every article on a list page.

        For each ``<li>`` that carries a ``.news-list-title`` element, the title,
        time, source, link and click count are printed. The linked article page
        is then downloaded and its info line, parsed publish time, author,
        reviewer, source, photographer, click count and body text are printed.

        Args:
            newsrrl: Response-like object whose ``.text`` attribute holds the
                HTML of the news list page.

        Raises:
            IndexError: If an expected element (``.news-list-info``, ``a``,
                ``.show-info`` or ``#content``) is missing from a page.
            ValueError: If the publish time cannot be parsed.

        Note:
            The module-level ``head`` dict is sent as HTTP headers, so it must
            be defined before this function is called.
        """

        def getClickCount(newUrl):
            """Return the click count of the article at *newUrl* as a string."""
            # The news id is the file name that follows the "_<digits>/" path
            # segment; joining yields '' when the URL has no such segment.
            news_id = "".join(re.findall(r'_\d+/(.*?)\.', newUrl, re.S))

            # Build the URL of the click-count endpoint
            count_url = ('http://oa.gzcc.cn/api.php?op=count&id='
                         + news_id + '&modelid=80')
            response = requests.get(count_url, headers=head)

            # NOTE(review): presumably the endpoint answers with a script that
            # ends in ".html('<count>');" -- confirm against the live service.
            return response.text.split('.html')[-1].lstrip("(')").rstrip("');")

        def getLabelledValue(text, label):
            """Return the text glued to *label* in *text*, or '' if absent."""
            pos = text.find(label)
            if pos < 0:
                return ''
            # The first whitespace-delimited token starts with the label itself.
            # Slice the label off rather than lstrip() it: lstrip() removes a
            # character *set* and could eat leading characters of the value.
            return text[pos:].split()[0][len(label):]

        list_soup = BeautifulSoup(newsrrl.text, 'html.parser')
        for item in list_soup.select('li'):
            if not item.select(".news-list-title"):
                continue

            title = item.select(".news-list-title")[0].text
            list_info = item.select(".news-list-info")[0]
            published = list_info.contents[0].text
            source = list_info.contents[1].text
            link = item.select("a")[0].attrs['href']

            hist = getClickCount(link)

            print("标题:" + title + '\n'
                  + "时间:" + published + '\n'
                  + "来源:" + source + '\n'
                  + "链接:" + link + '\n'
                  + "点击:" + hist + '\n\n')
            print()

            detail = requests.get(link, headers=head)
            detail.encoding = 'utf-8'
            detail_soup = BeautifulSoup(detail.text, 'html.parser')
            show_info = detail_soup.select(".show-info")[0].text

            # Echo every token of the info line except the last one, which is
            # replaced by the click count fetched from the counter endpoint.
            for token in show_info.split()[:-1]:
                print(token, end=' ')
            print("点击:" + hist)
            print()
            print()

            # Convert the publish time to a datetime object: take the 19
            # characters ("YYYY-mm-dd HH:MM:SS") that follow the label.
            stamp = show_info.partition('发布时间:')[2][:19]
            dt = datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
            print("datetime类型时间:", end=' ')
            print(dt)
            print()

            # Author, reviewer, source, photographer: an absent field prints
            # an empty value instead of repeating the previous field's value.
            for label in ('作者:', '审核:', '来源:', '摄影:'):
                print(label, end=' ')
                print(getLabelledValue(show_info, label))
                print()

            # Click count
            if '点击:' in show_info:
                print("点击:", end=' ')
            print(hist)

            for _ in range(5):
                print()

            print()
            # Print the article body
            print(detail_soup.select("#content")[0].text)
            print()
            print()
            print()
    
    # Disguise the crawler as a desktop browser via the User-Agent header
    head = {}
    head['user-agent']='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    
    # Download the campus-news list page and print the details of each article
    r=requests.get("http://news.gzcc.cn/html/xiaoyuanxinwen/",headers=head)
    r.encoding='utf-8'
    soup=BeautifulSoup(r.text,'html.parser')
    getNewDetail(r)
    
    
    
    # Phone numbers: 3-4 digits, a hyphen, then 6-8 digits.
    # \d must be escaped; a bare "d" would match the literal letter instead.
    telephone=re.findall(r'(\d{3,4})-(\d{6,8})',soup.text,re.S)
    print(telephone)
    print()
    
    # Email: validate a sample address against a whole-string pattern.
    # The dot and \w must be escaped so that ".com" style suffixes match.
    email='308800902@qq.com'
    eroll=r'^([0-9a-zA-Z_]{0,19}@[0-9a-zA-Z_]{0,19}(?:\.\w{2,3}){0,2})$'
    efinadll=re.findall(eroll,email)
    print(efinadll)
    print()
    
    # English word splitting: break on runs of whitespace and , . ? !
    # (\s is the whitespace class; a bare "s" would split on the letter s).
    estr='''Personal information such as names, birthdays, nicknames, pet's names, social security numbers, and the like 
    should never, ever, ever be used because these are way too obvious and too easy to crack. The more you avoid using 
    things like this as your passwords, the more secure your login areas will be.'''
    words=re.split(r"[\s,.?!]+",estr)
    print(words)

  • 相关阅读:
    Android NDK学习(1) 简介
    wmsys.wm_concat结果长度限制的问题
    onInterceptTouchEvent和onTouchEvent调用时序
    滑动到底部或顶部响应的ScrollView实现
    Android ViewPager使用详解
    android include标签的使用,在RelativeLayout中使用include标签需注意!!!!!
    Eclipse中如何在指定工程中搜索指定的字符串
    android:windowSoftInputMode属性详解
    cocos2d-x中关于touch事件的响应
    《从零开始学Swift》学习笔记(Day 6)——哎呀常量和变量都该什么时候用啊?
  • 原文地址:https://www.cnblogs.com/wban48/p/8782518.html
Copyright © 2011-2022 走看看