  • Web Crawler Assignment

    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    import re
    import jieba
    
    def getNewsDetail(newsurl):  # fetch one news article and extract its details
        resd = requests.get(newsurl)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')

        click = soupd.select('.like')[0].text.split(" ")[0]    # click/like count
        title = soupd.select('h1')[0].text
        info = soupd.select('.pdate')[0].text
        # drop the "发布时间:" prefix, then parse the 19-character timestamp
        dt = datetime.strptime(info.split('发布时间:')[-1][0:19], '%Y-%m-%d %H:%M:%S')
        author = soupd.select('p')[0].text.split(" ")[1]
        delcontent = soupd.select('p')[0].text                  # first paragraph (meta line)
        # remove the meta paragraph from the start of the article body
        newscontent = soupd.select('.maintext')[0].text.replace(delcontent, '', 1)
    
        keyWords=getKeyWords(newscontent)
    
        print(dt)
        print(title)
        print(click)
        print(author)
        print(newscontent)
        print(keyWords)
        # save the article text locally
        with open('C:/python/pachong.txt', 'w', encoding='utf-8') as f:
            f.write(newscontent)
    
    def getKeyWords(newscontent):  # extract keywords from the article text
        # keep only Chinese characters
        newscontent = ''.join(re.findall('[\u4e00-\u9fa5]', newscontent))
        wordSet = set(jieba.lcut(newscontent))
        wordDict = {}
        for i in wordSet:
            wordDict[i] = newscontent.count(i)
        # drop single-character words
        delList = []
        for i in wordDict.keys():
            if len(i) < 2:
                delList.append(i)
        for i in delList:
            del wordDict[i]
        dictList = list(wordDict.items())
        dictList.sort(key=lambda item: item[1], reverse=True)
        # top 20 words by frequency (fewer if the article is short)
        keyWords = [word for word, count in dictList[:20]]
        return keyWords
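
    # A shorter variant of the keyword step (a sketch, not part of the original
    # assignment): collections.Counter handles the counting and top-N selection,
    # so the manual wordDict/delList bookkeeping above is not needed.
    from collections import Counter

    def getKeyWordsCounter(newscontent, topK=20):
        words = jieba.lcut(''.join(re.findall('[\u4e00-\u9fa5]', newscontent)))
        counts = Counter(w for w in words if len(w) >= 2)   # skip single characters
        return [word for word, count in counts.most_common(topK)]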
    
    
    newsurl = "http://news.gdufe.edu.cn/11499"   # a single article on the school news site
    getNewsDetail(newsurl)
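
    # The script above only crawls a single article. A possible extension (a
    # sketch -- the list-page URL and the '.news-list a' selector are assumptions
    # about the site's markup, not taken from the original post) is to fetch a
    # list page, collect the article links, and crawl each one.
    # Note that getNewsDetail overwrites pachong.txt on every call.
    import time

    def getListPage(listurl):
        res = requests.get(listurl)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        for a in soup.select('.news-list a'):          # assumed link selector
            href = a.get('href')
            if href and href.startswith('http'):
                getNewsDetail(href)
                time.sleep(1)                          # be polite to the server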
    

      
