zoukankan      html  css  js  c++  java
  • 数据结构化与保存

    import requests
    import re
    import pandas
    from bs4 import BeautifulSoup
    
    # Index page of the campus news list; fetched once at module load time
    # (note: this performs network I/O at import).
    url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    res = requests.get(url)
    # Force UTF-8 so the Chinese page text decodes correctly instead of
    # relying on requests' header-based encoding guess.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    
    
    # Fetch the click count for one news article from the counter API.
    def getclick(newurl):
        """Return the click count (int) for the article at *newurl*.

        The numeric article id sits between the last '/' and '.html' in the
        article URL; the counter API answers with a JS snippet like
        "$('...').html('1234');" from which the number is extracted.
        """
        # `news_id` instead of `id` to avoid shadowing the builtin.
        news_id = re.search('_(.*).html', newurl).group(1).split('/')[1]
        clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
        body = requests.get(clickurl).text
        # Extract the count with a regex: the original used lstrip/rstrip,
        # which strip character *sets*, not literal prefixes/suffixes, and
        # would silently corrupt the value if the payload shape changed.
        return int(re.search(r"\.html\('(\d+)'\)", body).group(1))
    
    
    # Given a single news-article link, return a dict of the article's details.
    def getdetail(listurl):
        """Return {'url', 'title', 'content', 'click'} for the article page.

        Fetches *listurl*, parses the detail page, and looks up the click
        count via getclick().
        """
        res = requests.get(listurl)
        res.encoding = 'utf-8'  # force UTF-8 for correct Chinese text
        soup = BeautifulSoup(res.text, 'html.parser')
        news = {}
        # BUG FIX: record this article's own URL. The original assigned the
        # module-level `url` (the index page), so every record shared one URL.
        news['url'] = listurl
        news['title'] = soup.select('.show-title')[0].text
        news['content'] = soup.select('.show-content')[0].text.strip()
        news['click'] = getclick(listurl)
        return news
    #print(getonpages('http://news.gzcc.cn/html/2017/xiaoyuanxinwen_1017/8338.html'))
    
    # Given a news-list-page link, return the detail dict of every article on it.
    def onepage(pageurl):
        """Collect getdetail() results for each news item on one list page."""
        page = requests.get(pageurl)
        page.encoding = 'utf-8'
        doc = BeautifulSoup(page.text, 'html.parser')
        # Only <li> elements that contain a .news-list-title are real
        # news entries; follow each entry's first anchor to the detail page.
        return [
            getdetail(item.select('a')[0]['href'])
            for item in doc.select('li')
            if item.select('.news-list-title')
        ]
    # Driver: crawl the list pages, collect all article dicts, save to Excel.
    newstotal = []
    for i in range(2, 3):
        # BUG FIX: the original ignored `i` and fetched the same index page
        # on every iteration; subsequent list pages follow the '<n>.html'
        # pattern, so interpolate the page number.
        listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        newstotal.extend(onepage(listurl))
    df = pandas.DataFrame(newstotal)
    df.to_excel('gzccnews.xlsx')
    # Optional SQLite export (disabled):
    # with sqlite3.connect('gzccnewsdb2.sqlite') as db:
    #     df.to_sql('gzccnewdb2', con=db)
    
    
       

  • 相关阅读:
    POJ 2240 Arbitrage spfa 判正环
    POJ 3259 Wormholes spfa 判负环
    POJ1680 Currency Exchange SPFA判正环
    HDU5649 DZY Loves Sorting 线段树
    HDU 5648 DZY Loves Math 暴力打表
    HDU5647 DZY Loves Connecting 树形DP
    CDOJ 1071 秋实大哥下棋 线段树
    HDU5046 Airport dancing links 重复覆盖+二分
    HDU 3335 Divisibility dancing links 重复覆盖
    FZU1686 神龙的难题 dancing links 重复覆盖
  • 原文地址:https://www.cnblogs.com/garxiu/p/7685221.html
Copyright © 2011-2022 走看看