zoukankan      html  css  js  c++  java
  • 爬虫大作业

    # -*- coding: UTF-8 -*-# -*-
    import requests
    import re
    import jieba
    import locale
    # Set the C-library character classification to Chinese so text handling
    # matches the scraped pages ('chinese' is a Windows locale name; on POSIX
    # systems this raises locale.Error -- TODO confirm target OS).
    # BUG FIX: the original assigned the return value to the name `locale`,
    # shadowing the module itself; the returned locale string is not used,
    # so the call's result is simply discarded.
    locale.setlocale(locale.LC_CTYPE, 'chinese')
    
    from bs4 import BeautifulSoup
    from datetime import datetime
    
    
    # Fetch and parse the China Daily entertainment-channel front page at
    # import time.  NOTE(review): `soup` is built here but never referenced
    # below -- the list pages are re-fetched inside getListPage(), so this
    # request may be dead code; confirm before removing.
    url = "http://ent.chinadaily.com.cn/"
    res = requests.get(url)
    # Decode the response as UTF-8 regardless of the server's declared charset.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    def getKeyWords(text):
        """Tokenise *text* with jieba and build a word-frequency table.

        Punctuation characters are stripped first, tokens shorter than two
        characters are discarded (single characters carry little meaning
        for a word cloud), and the remaining word/count pairs are returned
        sorted by descending frequency.

        BUG FIX: the original collected single-character tokens by looping
        over ``newsDict.keys()`` while ``newsDict`` was still empty, so the
        filter never removed anything; the filter now inspects the token
        list itself.  The sorted result was also computed and dropped; it
        is now returned (callers that ignored the old ``None`` return are
        unaffected).

        :param text: article body text to analyse
        :return: list of (word, count) tuples, most frequent first
        """
        # Chinese/ASCII punctuation (plus tab and newline) to strip before
        # tokenising.  Renamed from ``str`` to avoid shadowing the builtin.
        punctuation = '''一!“”,。?、;’"',.、·《》()#\t:\n'''
        for ch in punctuation:
            text = text.replace(ch, '')

        words = jieba.lcut(text)

        # Count only tokens of length >= 2.
        newsDict = {}
        for w in set(words):
            if len(w) >= 2:
                newsDict[w] = words.count(w)

        dictList = list(newsDict.items())
        dictList.sort(key=lambda x: x[1], reverse=True)
        return dictList
    
    
    
    
    
    
    def getNewDetail(newsUrl):
        """Fetch one article page, print its metadata and body, run keyword
        extraction, and append the article to a local text file.

        :param newsUrl: absolute URL of the article page
        """
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')

        title = soupd.select('h1')[0].text
        # Byline block holding '作者:xxx' and '来源:yyy' fragments.
        info = soupd.select('.xinf-le')[0].text

        # Publication time is rendered with a leading space, hence the
        # space at the start of the strptime format.
        t = soupd.select('#pubtime')[0].text
        dt = datetime.strptime(t, ' %Y-%m-%d %H:%M:%S')
        biaoqian = soupd.select('.fenx-bq')[0].text.lstrip('标签:')

        # Author and source are whitespace-delimited inside the info line;
        # fall back to 'none' when the marker is absent.
        if info.find('作者:') > 0:
            au = info[info.find('作者:'):].split()[0].lstrip('作者:')
        else:
            au = 'none'
        if info.find('来源:') > 0:
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
        else:
            source = 'none'

        content = soupd.select('#Content')[0].text.strip()

        print("标题:", title)
        print("作者:", au)
        print("来源:", source)
        print("发布时间:", dt)
        print("正文:", content)
        print("标签:", biaoqian)
        getKeyWords(content)

        # BUG FIX: the scraped source had raw line breaks inside these string
        # literals (a SyntaxError); they are restored as '\n' escapes.  `with`
        # also guarantees the handle is closed even if a write fails -- the
        # original leaked it on exceptions.
        # NOTE(review): the path looks like it was meant to be
        # 'D:/python/news.txt'; kept byte-identical to preserve behavior --
        # confirm the intended location.
        with open('D:python/news.txt', 'a+', encoding='UTF-8') as fo:
            fo.write('标题:' + title + '\n' + "作者:" + au + '\n'
                     + "来源:" + source + '\n' + "正文:" + content + '\n'
                     + "标签:" + biaoqian)
            fo.write('\n')
    
    
    def getListPage(ListPageUrl):
        """Fetch one list page and process every article linked from it.

        BUG FIX: ``pagedetail`` was declared as the per-page accumulator but
        the function fell off the end returning ``None``, even though both
        call sites assign the result; the list is now returned.  (Nothing is
        appended to it yet because getNewDetail() returns None -- noted for a
        follow-up.)

        :param ListPageUrl: URL of a channel list page
        :return: list of per-article details for this page (currently empty)
        """
        res = requests.get(ListPageUrl)
        res.encoding = 'utf-8'
        soupd = BeautifulSoup(res.text, 'html.parser')
        pagedetail = []  # details of every news item on this page
        for news in soupd.select('.busBox1'):
            atail = news.a.attrs['href']
            # NOTE(review): the hrefs appear to be absolute already -- the
            # commented-out prefix join was abandoned in the original;
            # confirm against the live site.
            getNewDetail(atail)
        return pagedetail
    
    # Crawl the first list page, then paginated pages 2..39.
    pagedetail = getListPage('http://ent.chinadaily.com.cn/node_53008149.htm')
    for i in range(2, 40):
        # BUG FIX: the original never substituted the page number into the
        # URL template, so every iteration fetched the same literal
        # '..._{}.htm' address.
        listUrl = 'http://ent.chinadaily.com.cn/node_53008149_{}.htm'.format(i)
        pagedetail = getListPage(listUrl)

  • 相关阅读:
    分享:liblfds 6.1.0 发布,C 数据结构库
    strchr C++ Reference
    爱上MVC3系列~PartialView中的页面重定向
    基础才是重中之重~Conditional特性使代码根据条件在debug或者release模式中执行
    爱上MVC3系列~Html.BeginForm与Ajax.BeginForm
    爱上MVC3系列~Razor页面中的共享namespace不起作用了(解决自定义扩展方法不能识别的问题)
    爱上MVC3系列~RenderAction与RenderPartial及一个页面多个表单提交
    NHibernate Criteria中的And, Or
    poj 2528 Mayor's posters(线段树区点)
    探索iptables BPF模块的悲惨历程
  • 原文地址:https://www.cnblogs.com/plokm792413896/p/8974455.html
Copyright © 2011-2022 走看看