zoukankan html css js c++ java

获取全部校园新闻

1.取出一个新闻列表页的全部新闻包装成函数。

2.获取总的新闻篇数，算出新闻总页数。

3.获取全部新闻列表页的全部新闻详情。

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def getclick(link):
    """Return the click count of a news article, as a string.

    The article id is taken from ``link``: the text between the first ``_``
    and the final ``.html`` is split on ``/`` and its second piece is used
    (e.g. ``..._0404/9183.html`` -> ``9183``).  The id is then sent to the
    oa.gzcc.cn counter API.

    Args:
        link: URL of the news detail page.

    Returns:
        Whatever follows the last ``.html`` in the API response, with the
        surrounding ``('`` and ``');`` characters stripped.

    Raises:
        AttributeError: if ``link`` contains no ``_....html`` part.
        IndexError: if that part contains no ``/``.
    """
    # Raw string with an escaped dot: the old '\_' was an invalid string
    # escape and the bare '.' matched any character before "html".
    newId = re.search(r'_(.*)\.html', link).group(1).split('/')[1]
    click = requests.get('http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId))
    return click.text.split('.html')[-1].lstrip("('").rstrip("');")


def _info_field(pattern, info):
    """Return group 1 of ``pattern`` searched in ``info``, or 'none' if absent."""
    match = re.search(pattern, info)
    if match is None or match.group(1) is None:
        return 'none'
    return match.group(1)


def getnewsdetail(link):
    """Fetch one news detail page and print its metadata and body text.

    Reads the ``.show-info`` line for the publish time, author, auditor and
    source, asks ``getclick`` for the click count, then prints those fields
    followed by the text of ``.show-content``.

    Args:
        link: URL of the news detail page.

    Raises:
        IndexError: if the page has no ``.show-content`` / ``.show-info`` element.
        AttributeError: if no timestamp is found in the info line.
        ValueError: if the timestamp is not in ``YYYY-MM-DD HH:MM:SS`` form.
    """
    resd = requests.get(link)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('.show-content')[0].text
    info = soupd.select('.show-info')[0].text
    clickcount = getclick(link)

    # \d / \s / \w need their backslashes; without them the patterns look
    # for the literal letters "d", "s" and "w" and never match.
    published = re.search(r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', info).group(1)
    dateTime = datetime.strptime(published, '%Y-%m-%d %H:%M:%S')

    # One to five names, each followed by whitespace or a Chinese separator.
    names = r'((.{2,4}\s|.{2,4}、|.{2,4}，|\w*\s){1,5})'
    author = _info_field('作者：' + names, info)
    auditing = _info_field('审核：' + names, info)
    # Group the alternation so group(1) is always the source text; the old
    # pattern '来源：(.*)\s*摄|点' could match a bare "点" and yield None.
    source = _info_field(r'来源：(.*?)\s*(?:摄|点)', info)

    print('发布时间:{0}\n作者：{1}\n审核：{2}\n来源：{3}\n点击次数：{4}'.format(
        dateTime, author, auditing, source, clickcount))
    print(content)

def getlistpage(listlink, limit=1):
    """Print the news entries of one list page and the details of each.

    For every ``<li>`` that contains a ``.news-list-title`` element, prints
    the title, description and link, then calls ``getnewsdetail`` on the link.

    Args:
        listlink: URL of the news list page.
        limit: maximum number of entries to process (a positive int).  The
            default of 1 keeps the historical behaviour of stopping after
            the first entry; pass ``None`` to process the whole page.
    """
    res = requests.get(listlink)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    handled = 0
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            # Not a news entry (e.g. a navigation item).
            continue
        title = titles[0].text
        description = news.select('.news-list-description')[0].text
        link = news.a.attrs['href']
        print('新闻标题：{0}\n新闻描述：{1}\n新闻链接：{2}'.format(title, description, link))
        getnewsdetail(link)
        handled += 1
        if limit is not None and handled >= limit:
            break

listlink='http://news.gzcc.cn/html/xiaoyuanxinwen/'

from datetime import datetime
# The first list page lives at the bare directory URL.
getlistpage(listlink)
res=requests.get(listlink)
res.encoding='utf-8'
soup=BeautifulSoup(res.text,'html.parser')
# The '.a1' element holds the total news count followed by "条"; the page
# count is that total divided by 10 per page, rounded up.
newsTotal = int(soup.select('.a1')[0].text.rstrip('条'))
listCount = (newsTotal + 9) // 10

# Pages 2..listCount inclusive; range() excludes its upper bound, so the
# previous range(2, listCount) skipped the last page.
for i in range(2, listCount + 1):
    listlink='http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getlistpage(listlink)

4.找一个自己感兴趣的主题，进行数据爬取，并进行分词分析。不能与其它同学雷同。

# -*- coding: UTF-8 -*-
# -*- author: yjw -*-
import requests
import re
import jieba
from bs4 import BeautifulSoup
from datetime import datetime

# Tokens excluded from the word-frequency count: pronouns, particles,
# punctuation and whitespace.
_STOP_WORDS = frozenset([
    '我', '他', '你', '了', '那', '又', '-', '的', '我们', '是', '但', '中',
    '这', '在', '也', '都', '而', ' ', '他们', '我的', '他的', '你的', '呀',
    '和', '，', '。', '：', '“', '”', '啊', '?', '说', '去', '与', '不', '、',
    '！', '着', '儿', '到', '就', '\n', '(', ')', '有', '上', '便', '只', '要',
    '小', '罢', '那里', '…', '一个', '？', '人', '把', '被', '她', '道', '好',
    '还', '’', '‘', '呢', '来', '得', '你们', '才', '们', '；', '.', '（',
    '）', '【', '】',
])


def getnewdetail(link):
    """Fetch an article page, print its text and its 20 most frequent words.

    The text of every ``.text`` element is joined (one per line), segmented
    with ``jieba.cut`` and counted after dropping ``_STOP_WORDS``.  Pages
    with no ``.text`` element print ``picture`` instead.

    Args:
        link: URL of the article page (decoded as gb2312).
    """
    from collections import Counter

    res = requests.get(link)
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')

    paragraphs = soup.select('.text')
    if not paragraphs:
        print('picture')
        return

    content = ''.join(p.text + '\n' for p in paragraphs)
    print(content + "\n词频统计：")

    # A single counting pass replaces one list.count() scan per distinct word.
    counts = Counter(tok for tok in jieba.cut(content) if tok not in _STOP_WORDS)
    # most_common(20) returns fewer entries when the text has fewer than 20
    # distinct words, where indexing range(20) would raise IndexError.
    for entry in counts.most_common(20):
        print(entry)

def getnewlist(link):
    """Print every entry of a news list page and analyse each linked article.

    Iterates over the ``<li>`` items of the first ``.listInfo`` element,
    prints each item's title, time and link, then calls ``getnewdetail`` on
    the link.

    Args:
        link: URL of the list page (decoded as gb2312).

    Raises:
        IndexError: if the page has no ``.listInfo`` element, or an item
            lacks an ``<a>`` or ``.info`` element.
    """
    res = requests.get(link)
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The loop variable must be the name used in the body; it was 'newlist'
    # while the body read 'newsList', which raised NameError.
    for newsList in soup.select('.listInfo')[0].select('li'):
        title = newsList.select('a')[0].text
        # NOTE(review): this is the list of <p> tags, not their text, so the
        # tags themselves are printed - confirm whether .text was intended.
        published = newsList.select('.info')[0].select('p')
        detail_link = newsList.select('a')[0]['href']
        print('\n新闻标题：{0}\n发表时间:{1}\n新闻链接:{2}\n'.format(title, published, detail_link))
        getnewdetail(detail_link)

# The first page has no numeric suffix; pages 2..19 use "_<n>".
link='http://sports.qq.com/a/20180411/020544.htm'
getnewlist(link)
# Start at 2: page 1 is handled above, so it is no longer crawled twice.
for i in range(2,20):
    link="http://sports.qq.com/a/20180411/020544_{}.htm".format(i)
    getnewlist(link)

查看全文

相关阅读:
通用爬虫和聚焦爬虫
 分布式缓存的介绍
 点击按钮执行后台方法
 jsp页面设置绝对路径
 vim调试
 图解Java 垃圾回收机制
 Java String 综述(上篇)
Java 内部类综述
 深入理解Java类加载器(二)：线程上下文类加载器
 深入理解Java类加载器(一)：Java类加载原理解析

原文地址：https://www.cnblogs.com/yjwamao/p/8798974.html

最新文章
最大子段和问题
 Colorful Subsequence
Reversi
某个面试题
 编辑距离
 灾后重建
 Kenken Race
Spinach和发牌姬
 Oooooooo AAAAE
2->SGA介绍