zoukankan      html  css  js  c++  java
  • 爬虫大作业

    # -*- coding: UTF-8 -*-# -*-
    import requests
    import re
    import jieba
    import locale
    # Set the C-library character classification to Chinese so text handling
    # matches the scraped pages ('chinese' is a Windows locale name; on POSIX
    # systems this raises locale.Error -- TODO confirm target OS).
    # BUG FIX: the original assigned the return value to the name `locale`,
    # shadowing the module itself; the returned locale string is not used,
    # so the call's result is simply discarded.
    locale.setlocale(locale.LC_CTYPE, 'chinese')
    
    from bs4 import BeautifulSoup
    from datetime import datetime
    
    
    # Fetch and parse the China Daily entertainment-channel front page at
    # import time.  NOTE(review): `soup` is built here but never referenced
    # below -- the list pages are re-fetched inside getListPage(), so this
    # request may be dead code; confirm before removing.
    url = "http://ent.chinadaily.com.cn/"
    res = requests.get(url)
    # Decode the response as UTF-8 regardless of the server's declared charset.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    def getKeyWords(text):
        """Tokenise *text* with jieba and build a word-frequency table.

        Punctuation characters are stripped first, tokens shorter than two
        characters are discarded (single characters carry little meaning
        for a word cloud), and the remaining word/count pairs are returned
        sorted by descending frequency.

        BUG FIX: the original collected single-character tokens by looping
        over ``newsDict.keys()`` while ``newsDict`` was still empty, so the
        filter never removed anything; the filter now inspects the token
        list itself.  The sorted result was also computed and dropped; it
        is now returned (callers that ignored the old ``None`` return are
        unaffected).

        :param text: article body text to analyse
        :return: list of (word, count) tuples, most frequent first
        """
        # Chinese/ASCII punctuation (plus tab and newline) to strip before
        # tokenising.  Renamed from ``str`` to avoid shadowing the builtin.
        punctuation = '''一!“”,。?、;’"',.、·《》()#\t:\n'''
        for ch in punctuation:
            text = text.replace(ch, '')

        words = jieba.lcut(text)

        # Count only tokens of length >= 2.
        newsDict = {}
        for w in set(words):
            if len(w) >= 2:
                newsDict[w] = words.count(w)

        dictList = list(newsDict.items())
        dictList.sort(key=lambda x: x[1], reverse=True)
        return dictList
    
    
    
    
    
    
    def getNewDetail(newsUrl):
        """Fetch one article page, print its metadata and body, run keyword
        extraction, and append the article to a local text file.

        :param newsUrl: absolute URL of the article page
        """
        resd = requests.get(newsUrl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')

        title = soupd.select('h1')[0].text
        # Byline block holding '作者:xxx' and '来源:yyy' fragments.
        info = soupd.select('.xinf-le')[0].text

        # Publication time is rendered with a leading space, hence the
        # space at the start of the strptime format.
        t = soupd.select('#pubtime')[0].text
        dt = datetime.strptime(t, ' %Y-%m-%d %H:%M:%S')
        biaoqian = soupd.select('.fenx-bq')[0].text.lstrip('标签:')

        # Author and source are whitespace-delimited inside the info line;
        # fall back to 'none' when the marker is absent.
        if info.find('作者:') > 0:
            au = info[info.find('作者:'):].split()[0].lstrip('作者:')
        else:
            au = 'none'
        if info.find('来源:') > 0:
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
        else:
            source = 'none'

        content = soupd.select('#Content')[0].text.strip()

        print("标题:", title)
        print("作者:", au)
        print("来源:", source)
        print("发布时间:", dt)
        print("正文:", content)
        print("标签:", biaoqian)
        getKeyWords(content)

        # BUG FIX: the scraped source had raw line breaks inside these string
        # literals (a SyntaxError); they are restored as '\n' escapes.  `with`
        # also guarantees the handle is closed even if a write fails -- the
        # original leaked it on exceptions.
        # NOTE(review): the path looks like it was meant to be
        # 'D:/python/news.txt'; kept byte-identical to preserve behavior --
        # confirm the intended location.
        with open('D:python/news.txt', 'a+', encoding='UTF-8') as fo:
            fo.write('标题:' + title + '\n' + "作者:" + au + '\n'
                     + "来源:" + source + '\n' + "正文:" + content + '\n'
                     + "标签:" + biaoqian)
            fo.write('\n')
    
    
    def getListPage(ListPageUrl):
        """Fetch one list page and process every article linked from it.

        BUG FIX: ``pagedetail`` was declared as the per-page accumulator but
        the function fell off the end returning ``None``, even though both
        call sites assign the result; the list is now returned.  (Nothing is
        appended to it yet because getNewDetail() returns None -- noted for a
        follow-up.)

        :param ListPageUrl: URL of a channel list page
        :return: list of per-article details for this page (currently empty)
        """
        res = requests.get(ListPageUrl)
        res.encoding = 'utf-8'
        soupd = BeautifulSoup(res.text, 'html.parser')
        pagedetail = []  # details of every news item on this page
        for news in soupd.select('.busBox1'):
            atail = news.a.attrs['href']
            # NOTE(review): the hrefs appear to be absolute already -- the
            # commented-out prefix join was abandoned in the original;
            # confirm against the live site.
            getNewDetail(atail)
        return pagedetail
    
    # Crawl the first list page, then paginated pages 2..39.
    pagedetail = getListPage('http://ent.chinadaily.com.cn/node_53008149.htm')
    for i in range(2, 40):
        # BUG FIX: the original never substituted the page number into the
        # URL template, so every iteration fetched the same literal
        # '..._{}.htm' address.
        listUrl = 'http://ent.chinadaily.com.cn/node_53008149_{}.htm'.format(i)
        pagedetail = getListPage(listUrl)

  • 相关阅读:
    分享:liblfds 6.1.0 发布,C 数据结构库
    strchr C++ Reference
    爱上MVC3系列~PartialView中的页面重定向
    基础才是重中之重~Conditional特性使代码根据条件在debug或者release模式中执行
    爱上MVC3系列~Html.BeginForm与Ajax.BeginForm
    爱上MVC3系列~Razor页面中的共享namespace不起作用了(解决自定义扩展方法不能识别的问题)
    爱上MVC3系列~RenderAction与RenderPartial及一个页面多个表单提交
    NHibernate Criteria中的And, Or
    poj 2528 Mayor's posters(线段树区点)
    探索iptables BPF模块的悲惨历程
  • 原文地址:https://www.cnblogs.com/plokm792413896/p/8974455.html
Copyright © 2011-2022 走看看