zoukankan html css js c++ java

数据结构化与保存

import re

import pandas
import requests
from bs4 import BeautifulSoup


def uslHtml(url, timeout=10):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    res = requests.get(url, timeout=timeout)
    # Force UTF-8 decoding instead of relying on the encoding requests guesses.
    res.encoding = "utf-8"
    return BeautifulSoup(res.text, "html.parser")
def page(url):
    """Write the body text of every article linked from a news-list page.

    The list page at *url* is fetched, each ``<li>`` inside the first
    ``.news-list`` element is followed through its first link, and the text
    of the article's ``#content`` element is written to the module-level
    file object ``f`` and echoed to stdout.

    Args:
        url: Address of a news-list page.

    Raises:
        IndexError: If the list page has no ``.news-list`` element, a list
            item has no link, or an article has no ``#content`` element.
    """
    soup = uslHtml(url)
    for item in soup.select(".news-list")[0].select("li"):
        link = item.select("a")[0].attrs["href"]
        # Go through the shared fetch helper so the article is downloaded,
        # decoded and parsed exactly like the list page.
        article = uslHtml(link)
        content = article.select("#content")[0].text
        f.write(content)
        print(content)
# Read the total article count from the pager ("#pages .a1"); the text is
# expected to be an integer with a trailing "条", which is stripped off.
so = uslHtml("http://news.gzcc.cn/html/xiaoyuanxinwen/")
n = int(so.select("#pages")[0].select(".a1")[0].text.strip("条"))
# Ceiling division by the 10 articles counted per list page: an exact
# multiple of 10 must not add an extra page, and there is always at least
# the first page.
n = max(1, -(-n // 10))
# The context manager closes the file even if a fetch raises; ``f`` stays a
# module-level name because page() writes through it.
with open("SchoolNews.txt", "a+", encoding='utf-8') as f:
    page("http://news.gzcc.cn/html/xiaoyuanxinwen/")
    # NOTE(review): only page 2 is crawled here even though ``n`` holds the
    # page count; presumably limited on purpose for a short run - confirm
    # before widening the range.
    for i in range(2, 3):
        url1 = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
        page(url1)

def Url(newsUrl, timeout=10):
    """Download *newsUrl* and return the page parsed with BeautifulSoup.

    Args:
        newsUrl: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # The response gets its own name so the ``re`` module stays usable here.
    response = requests.get(newsUrl, timeout=timeout)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")

# Module-level accumulator: News() adds one {"title", "description"} dict per
# news entry, and the list is later turned into a DataFrame.
newsArr= []

def pageNumber():
    """Return the number of list pages in the news section.

    The total article count is read from the ``.a1`` element inside
    ``#pages`` on the section's front page and converted to a page count at
    10 articles per page.

    Returns:
        The page count as an ``int``, never less than 1.

    Raises:
        IndexError: If the pager elements are missing from the page.
        ValueError: If the pager text, with any trailing "条" removed, is
            not an integer.
    """
    soup = Url("http://news.gzcc.cn/html/xiaoyuanxinwen/")
    total = int(soup.select("#pages")[0].select(".a1")[0].text.rstrip("条"))
    # Ceiling division: ``total / 10 + 1`` reports one page too many whenever
    # the count is an exact multiple of 10. The front page always exists, so
    # the result is clamped to at least 1.
    return max(1, -(-total // 10))

def site():
    """Collect the entries of the last news-list page into ``newsArr``.

    The page count from ``pageNumber()`` selects the list page to download;
    the parsed page is handed to ``News()``, which appends its entries to
    the module-level ``newsArr`` list.
    """
    newsPage = pageNumber()
    # range() is required: iterating the bare tuple (newsPage, newsPage + 1)
    # would also request a page one past the computed last page.
    for i in range(newsPage, newsPage + 1):
        othersUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
        News(Url(othersUrl))

def News(soup):
    """Append the title and description of each news entry to ``newsArr``.

    Every ``.news-list-text`` element in *soup* is treated as one entry. Its
    first ``.news-list-title`` and ``.news-list-description`` texts are
    stored under the keys ``"title"`` and ``"description"``. The whole
    accumulated list is printed once after the page has been processed.

    Args:
        soup: Parsed news-list page.

    Raises:
        IndexError: If an entry lacks a title or description element.
    """
    for entry in soup.select(".news-list-text"):
        record = {
            "title": entry.select(".news-list-title")[0].text,
            "description": entry.select(".news-list-description")[0].text,
        }
        newsArr.append(record)
    print(newsArr)
# Run the crawl; site() fills the module-level newsArr list.
site()

# One row per collected entry; the dict keys become the column names.
df = pandas.DataFrame(newsArr)
df.to_excel("title.xlsx")

# NOTE(review): the rows appended by News() carry only 'title' and
# 'description'. The 'click' and 'sources' columns queried below have to be
# collected upstream first, otherwise these lookups raise KeyError.

# First six rows, restricted to three columns.
print(df[['click', 'title', 'sources']].head(6))

# Rows with more than 3000 clicks whose source is '学校综合办'.
print(df[(df['click'] > 3000) & (df['sources'] == '学校综合办')])

# Rows whose source is one of the listed departments.
sou = ['国际学院', '学生工作处']
print(df[df['sources'].isin(sou)])

查看全文

相关阅读:
递延收益的主要账务处理
 少数股东权益
 一揽子交易中处置价款与净资产账面价值差额为什么计入其他综合收益
 为什么权益法下其他综合收益合并时要计入投资收益
 R语言代写实现MCMC中的Metropolis–Hastings算法与吉布斯采样
 加速R语言代码的策略
 R语言代写：EM算法和高斯混合模型的实现
 R语言代写进行网站评论文本数据挖掘聚类
 R语言代写对推特数据进行文本情感分析
 WEKA代写文本挖掘分析垃圾邮件分类模型

原文地址：https://www.cnblogs.com/linweicong/p/8857122.html