zoukankan      html  css  js  c++  java
  • 数据结构化与保存

    import re
    import sqlite3
    from datetime import datetime
    
    import pandas
    import requests
    from bs4 import BeautifulSoup
    
    def getclick(newsurl):
        """Return the click (view) count for the article at *newsurl*.

        The numeric article id is read from the ``/<id>.html`` part of
        *newsurl* and sent to the counter endpoint on ``oa.gzcc.cn``.  The
        count is parsed from the last ``.html('<digits>')`` call in the
        response text.

        Args:
            newsurl: URL of an article page, containing ``/<id>.html``.

        Returns:
            The click count as an ``int``.

        Raises:
            ValueError: if *newsurl* contains no article id, or the counter
                response does not end with a ``.html('<digits>')`` call.
        """
        id_match = re.search(r'/(\d+)\.html', newsurl)
        if id_match is None:
            raise ValueError('no article id found in url: {!r}'.format(newsurl))
        clickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id_match.group(1))
        body = requests.get(clickurl).text
        # str.lstrip/rstrip remove *sets of characters*, not a prefix/suffix,
        # so match the trailing call explicitly instead of stripping around it.
        count_match = re.search(r"\.html\('(\d+)'\)\s*;?\s*$", body)
        if count_match is None:
            raise ValueError('unexpected counter response: {!r}'.format(body))
        return int(count_match.group(1))
    
    def getdetail(url):
        """Download one article page and return its fields as a dict.

        Args:
            url: URL of the article page.

        Returns:
            A dict with keys ``url``, ``title`` (text of ``.show-title``),
            ``dt`` (publication time as a ``datetime``), ``source`` (text
            between ``来源:`` and ``点击`` in ``.show-info``, or ``''`` when that
            span is absent) and ``click`` (result of ``getclick(url)``).

        Raises:
            IndexError: if the page has no ``.show-title`` or ``.show-info``.
            ValueError: if ``.show-info`` contains no
                ``YYYY-MM-DD HH:MM:SS`` timestamp.
        """
        resd = requests.get(url)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        news = {}
        news['url'] = url
        news['title'] = soupd.select('.show-title')[0].text
        info = soupd.select('.show-info')[0].text
        # Locate the timestamp by its shape rather than lstrip()-ing a label:
        # lstrip removes a character set, so any label variation would leave
        # junk in front of the date and break strptime.
        stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
        if stamp is None:
            raise ValueError('no publication time in: {!r}'.format(info))
        news['dt'] = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')
        # An article without a source line should not abort the whole crawl.
        source = re.search(r'来源:(.*)点击', info)
        news['source'] = source.group(1).strip() if source else ''
        news['click'] = getclick(url)
        return news
    
    def onepage(pageurl):
        """Return the detail dicts for every article linked from one list page.

        Each ``<li>`` that contains a ``.news-list-title`` element is treated
        as an article entry; the ``href`` of its first ``<a>`` is passed to
        ``getdetail``.

        Args:
            pageurl: URL of a news listing page.

        Returns:
            A list of the dicts returned by ``getdetail``, in page order.
        """
        response = requests.get(pageurl)
        response.encoding = 'utf-8'
        listing = BeautifulSoup(response.text, 'html.parser')
        return [
            getdetail(item.select('a')[0]['href'])
            for item in listing.select('li')
            if item.select('.news-list-title')
        ]
    
    # Crawl the campus-news listing, then save the collected articles to an
    # Excel workbook and a SQLite database.
    newstotal = []
    gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    # The first listing page is the section root; later pages are '<n>.html'.
    newstotal.extend(onepage(gzccurl))
    
    res = requests.get(gzccurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text,'html.parser')
    # The '.a1' element is expected to hold the total article count.  Extract
    # the digits so that any unit label around the number cannot break int().
    total_text = soup.select('.a1')[0].text
    count_match = re.search(r'\d+', total_text)
    if count_match is None:
        raise ValueError('no article count in: {!r}'.format(total_text))
    n = int(count_match.group())
    pages = (n + 9) // 10  # 10 articles per listing page, rounded up
    
    # Crawl stays limited to listing page 2 (as before), but never requests a
    # page beyond the real page count.
    lastpage = min(pages, 2)
    for i in range(2, lastpage + 1):
        listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
        newstotal.extend(onepage(listurl))
    
    
    df = pandas.DataFrame(newstotal)
    
    df.to_excel('gzccnews.xlsx')
    
    
    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it does not close the connection, so close it explicitly.
    # NOTE: to_sql raises if the 'gzccnewsdb' table already exists (pandas
    # default if_exists='fail'), e.g. when the script is run a second time.
    db = sqlite3.connect('gzccnewsdb.sqlite')
    try:
        with db:
            df.to_sql('gzccnewsdb',con=db)
    finally:
        db.close()
  • 相关阅读:
    HBase with MapReduce (MultiTable Read)
    HBase with MapReduce (SummaryToFile)
    HBase with MapReduce (Summary)
    HBase with MapReduce (Read and Write)
    HBase with MapReduce (Only Read)
    Hbase中的BloomFilter(布隆过滤器)
    HBase的快照技术
    How To Use Hbase Bulk Loading
    Cloudera-Manager修改集群的IP
    Java中的HashSet和TreeSet
  • 原文地址:https://www.cnblogs.com/knight-hui/p/7685815.html
Copyright © 2011-2022 走看看